In [2]:
import pandas as pd
import numpy as np

# Load the raw inputs from the current working directory.
# Only df_indcom is used by the cells visible below; df_mai and df_census are
# loaded here but not referenced in them.
df_mai = pd.read_csv('VF_mai_counties_Q222.csv')
df_indcom = pd.read_csv('VF_indcom_counties_Q222.csv')
df_census = pd.read_csv('census.csv')

In [3]:
def preprocess(df, value_name, split_str):
    """Reshape a wide one-column-per-month table into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame holding a ``cfips`` column plus one column per month.
        Every non-``cfips`` column name must contain ``split_str``; the text
        after its first occurrence must be a month written as abbreviated
        month name followed by a two-digit year (``%b%y``).
    value_name : str
        Name given to the column that receives the melted values.
    split_str : str
        Separator used to cut the month text out of each column name.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``cfips``, ``month`` (timestamp of the
        first day of the month), ``value_name`` and ``row_id``
        (``'<cfips>_<YYYY-MM-DD>'``), with exact duplicate rows removed.
        The input frame is not modified.

    Raises
    ------
    IndexError
        If a column name does not contain ``split_str``.
    ValueError
        If the extracted month text does not match ``%b%y``.
    """
    long_df = df.melt(id_vars='cfips', var_name='month', value_name=value_name)

    # Keep only what follows the first occurrence of split_str in the old
    # column name; a plain Python split keeps split_str a literal, not a regex.
    month_text = long_df['month'].map(lambda name: name.split(split_str, 1)[1])

    # Parse the month and snap it to the start of that month. The vectorised
    # .dt accessor replaces a per-element Period.to_timestamp() call.
    parsed = pd.to_datetime(month_text, format='%b%y')
    long_df['month'] = parsed.dt.to_period('M').dt.to_timestamp()

    long_df['row_id'] = (
        long_df['cfips'].astype(str) + '_' + long_df['month'].dt.strftime('%Y-%m-%d')
    )
    return long_df.drop_duplicates()

In [4]:
# Discard rows without a county FIPS code, keep the first row per code, and
# store the code as an integer.
df_indcom = df_indcom[df_indcom['cfips'].notna()].drop_duplicates(subset='cfips')
df_indcom['cfips'] = df_indcom['cfips'].astype(int)

# Per-county attributes: the first five columns plus the last one. Later cells
# read 'cfips', 'county', 'state' and 'groupflag' from this frame.
df_base = pd.concat([df_indcom.iloc[:, :5], df_indcom.iloc[:, -1]], axis=1)


def _cfips_with_columns(start, stop):
    """Return column 0 of df_indcom next to its columns start..stop-1 (by position)."""
    return pd.concat([df_indcom.iloc[:, 0], df_indcom.iloc[:, start:stop]], axis=1)


# Each metric occupies a run of 35 positional columns in df_indcom; column 0
# has to be 'cfips' because preprocess() melts on that name.
df_order = preprocess(_cfips_with_columns(5, 40), 'orders_rank', 'orders_rank_')
df_merch = preprocess(_cfips_with_columns(40, 75), 'merchants_rank', 'merchants_rank_')
df_gmv = preprocess(_cfips_with_columns(75, 110), 'gmv_rank', 'gmv_rank_')
# NOTE(review): positions 110-144 are not used by any slice here - confirm
# that the avg_traffic columns really start at position 145.
df_avg_tr = preprocess(_cfips_with_columns(145, 180), 'avg_traffic', 'avg_traffic_')

In [5]:
# Line the four long-format metric frames up on their shared keys (rows that
# are missing from any one of them are dropped by the inner joins), then
# attach the per-county attributes from df_base.
df_merged = (
    df_order
    .merge(df_merch, on=['cfips', 'month', 'row_id'], how='inner')
    .merge(df_gmv, on=['cfips', 'month', 'row_id'], how='inner')
    .merge(df_avg_tr, on=['cfips', 'month', 'row_id'], how='inner')
    .merge(df_base, on='cfips')
)

In [6]:
# One indicator column per distinct 'groupflag' value; the original column is
# replaced by these indicators in the final frame.
df_encoded = pd.get_dummies(df_merged.groupflag)
df_merged = df_merged.drop('groupflag', axis=1)

# NOTE(review): preprocess() already stores 'month' as timestamps, so this
# conversion looks like a no-op - confirm before removing it.
df_merged['month'] = pd.to_datetime(df_merged['month'], format='%b%y')

df = pd.concat([df_merged, df_encoded], axis=1)

In [8]:
# The 'county' and 'state' columns are removed from the output frame.
df = df.drop(['county', 'state'], axis=1)

In [9]:
def normalized_rank_change(df, column, n):
    """Return the row-to-row change of ``column`` within each county, divided by ``n``.

    For every ``cfips`` group the value of the previous row of that same group
    is subtracted from the current row. Rows are taken in the order they
    appear in ``df``, so the frame must already be in chronological order
    within each county. The first row of each group, and any row whose change
    is undefined, gets 0.

    The result keeps the index of ``df``, so ``df[new] = normalized_rank_change(...)``
    puts every value on the row it was computed from. The earlier
    ``groupby().apply(...).reset_index(drop=True)`` form returned the values in
    group order under fresh 0..N-1 labels, which attached them to the wrong
    rows whenever ``df`` was not already sorted by ``cfips``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cfips`` column and the column named by ``column``.
    column : str
        Name of the numeric column to difference.
    n : int or float
        Divisor applied to every difference.

    Returns
    -------
    pandas.Series
        One value per row of ``df``, indexed like ``df``.
    """
    # diff() is x - x.shift() evaluated inside each group, index preserved.
    change = df.groupby('cfips')[column].diff()
    return change.div(n).fillna(0)
# Divisor for the rank changes: the number of distinct cfips values in df.
n = df['cfips'].nunique()

# One normalised-change column per rank metric.
for rank_column, change_column in [
    ('orders_rank', 'nrc_order'),
    ('merchants_rank', 'nrc_merch'),
    ('gmv_rank', 'nrc_gmv'),
]:
    df[change_column] = normalized_rank_change(df, rank_column, n)

In [11]:
# Write the cleaned frame to disk; the row index is written as the first,
# unnamed column (pandas' default, spelled out here).
df.to_csv('df_cleaned.csv', index=True)