In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
import lightgbm as lgb

# Read the three input tables; the census table is keyed by its `cfips` column.
train_df = pd.read_csv('train.csv')
census = pd.read_csv('census.csv', index_col='cfips')
df_cleaned = pd.read_csv('df_cleaned.csv')

# Working frame: rows present in both tables, matched on (cfips, row_id).
df = train_df.merge(df_cleaned, on=['cfips', 'row_id'])

In [35]:
# Remove the 'Unnamed: 0' column and give the '< 30k pop' / '> 225k pop'
# columns identifier-friendly names.
df = (
    df
    .drop(columns='Unnamed: 0')
    .rename(columns={'< 30k pop': 'l_30k', '> 225k pop': 'g_225k'})
)

In [36]:
# cfips -> state / county lookup, captured before `state` is integer-encoded
# and `county` is dropped below.
# De-duplicate on `cfips` itself: calling drop_duplicates() after cfips has been
# moved into the index compares only the (state, county) values, so two codes
# sharing the same names would collapse into one entry. If a code ever appears
# with more than one name, the latest row wins.
state_dict = (
    df[['cfips', 'state', 'county']]
    .drop_duplicates(subset='cfips', keep='last')
    .set_index('cfips')
    .to_dict()
)

# active / microbusiness_density per row. The division is element-wise, so no
# per-county groupby is needed. Rows where both values are zero (0 / 0), and
# any other NaN result, are reported as 0.
zero_mask = (df['microbusiness_density'] == 0) & (df['active'] == 0)
df['ratio'] = (
    (df['active'] / df['microbusiness_density'])
    .where(~zero_mask)
    .fillna(0)
)

# Replace the state label by its integer category code and drop the county name.
df['state'] = df['state'].astype('category').cat.codes
df = df.drop(columns=['county'])

In [37]:
# Calendar features derived from the month-start timestamp.
df['first_day_of_month'] = pd.to_datetime(df['first_day_of_month'])
df['month'] = df['first_day_of_month'].dt.month
df['year'] = df['first_day_of_month'].dt.year - 2019  # offset from 2019
df['is_new_year'] = (df['month'] == 1).astype('int64')  # 1 for January rows, else 0

# Month number -> season label.
seasons = {
    1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
    12: 'Winter',
}
df['season'] = df['month'].map(lambda month_number: seasons[month_number])

# Per-month means of density ("md") and active count ("a"), taken over all rows
# (national) and within each state.
_monthly_keys = ['year', 'month']
for _short, _source in (('md', 'microbusiness_density'), ('a', 'active')):
    df[f'{_short}_national_avg'] = df.groupby(_monthly_keys)[_source].transform('mean')
    df[f'{_short}_state_avg'] = df.groupby(['state'] + _monthly_keys)[_source].transform('mean')


In [38]:
# One-hot encode the season label (columns prefixed 'season_') and replace the
# original text column with the indicator columns.
season_dummies = pd.get_dummies(df['season'], prefix='season')
df = pd.concat([df.drop(columns='season'), season_dummies], axis=1)

In [39]:
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

# Create a US holiday calendar
cal = USFederalHolidayCalendar()

holidays = cal.holidays(start='2019-01-01', end='2023-06-01')

# Number of federal holidays in each calendar month. This is computed once:
# rebuilding `holidays.to_period('M')` inside a per-row lambda repeats the same
# loop-invariant work for every row of `df`.
holidays_per_month = holidays.to_period('M').value_counts()

# Look the count up for every row; months without any holiday (or outside the
# calendar range above) get 0.
row_periods = pd.to_datetime(df['first_day_of_month'], format='%Y-%m-%d').dt.to_period('M')
df['holidays'] = row_periods.map(holidays_per_month).fillna(0).astype('int64')

In [40]:
# 0-based running position of each row within its county, in current row order.
df['idx'] = df.groupby('cfips').cumcount()

# Attach the census columns; `census` carries cfips as its index, so the merge
# key resolves against that index level.
df = pd.merge(df, census, on='cfips')

In [41]:
import numpy as np

def calculate_log_diff(df, groupby_col, column, mask, new_col_name):
    """
    Add the within-group first difference of ``log(column)`` as a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    groupby_col : str or list of str
        Column(s) identifying the groups the difference is taken within.
    column : str
        Column whose log-difference is computed.
    mask : pandas.Series of bool
        Rows to exclude from the computation; they receive 0 in the output.
    new_col_name : str
        Name of the column to create.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``new_col_name`` added.
    """
    valid = ~mask

    # Log-difference over the non-masked rows only. Assigning the result aligns
    # on the index, which leaves the masked rows as NaN for now.
    df[new_col_name] = (
        df.loc[valid]
        .groupby(groupby_col, group_keys=False)[column]
        .transform(lambda values: np.log(values).diff())
    )

    # The first row of every group has no predecessor; back-fill it from the
    # next available value inside the same group.
    df[new_col_name] = (
        df.loc[valid]
        .groupby(groupby_col, group_keys=False)[new_col_name]
        .bfill()
    )

    # Zero the masked rows LAST. Doing it before the back-fill is ineffective:
    # the back-fill reassigns the whole column from non-masked rows only, which
    # turns the masked rows back into NaN.
    df.loc[mask, new_col_name] = 0
    return df

def create_lags(df, groupby_col, columns, lags):
    """
    Append within-group lagged copies of the given columns.

    For every column / lag pair a new column named ``{column}_lag_{lag}`` is
    written, holding the column shifted by ``lag`` rows inside each
    ``groupby_col`` group. ``df`` is modified in place and returned.
    """
    combos = [(source, step) for source in columns for step in lags]
    for source, step in combos:
        shifted = df.groupby(groupby_col, group_keys=False)[source].shift(step)
        df[f'{source}_lag_{step}'] = shifted
    return df

def create_target(df, groupby_col, column, mask, new_col_name):
    """
    Add the one-step-ahead log change of ``column`` as ``new_col_name``.

    Within each ``groupby_col`` group, and considering only rows where ``mask``
    is False, the value is ``log(next row) - log(this row)``; the last such row
    of a group therefore gets NaN. Rows where ``mask`` is True are set to 0.
    ``df`` is modified in place and returned.
    """
    def _next_step_log_change(values):
        return np.log(values.shift(-1)) - np.log(values)

    unmasked_rows = df.loc[~mask]
    per_group = unmasked_rows.groupby(groupby_col, group_keys=False)[column]
    df[new_col_name] = per_group.apply(_next_step_log_change)
    df.loc[mask, [new_col_name]] = 0
    return df

def fill_target_with_train(df_test, train_df):
    """
    Fill missing ``target`` values in ``df_test`` from ``train_df['target1']``.

    The two frames are matched on their indexes (left join, so every row of
    ``df_test`` is kept). Existing ``target`` values are left untouched; the
    helper ``target1`` column is removed again before returning. A new frame
    is returned; the inputs are not modified.
    """
    lookup = train_df[['target1']]
    joined = df_test.merge(lookup, how='left', left_index=True, right_index=True)
    joined['target'] = joined['target'].fillna(joined['target1'])
    return joined.drop(columns='target1')

# Preprocessing: key the frame by row_id and flag rows where both the density
# and the active count are zero (their logs are undefined).
df = df.set_index('row_id')
mask = df.microbusiness_density.eq(0) & df.active.eq(0)

# Log differences of the two state-level averages, per county
for _avg_col in ('md_state_avg', 'a_state_avg'):
    df = calculate_log_diff(df, 'cfips', _avg_col, mask, f'{_avg_col}_log_diff')

# Lagged copies (1 to 5 rows back, per county) of density and active count
LAGS = [1, 2, 3, 4, 5]
columns_to_lag = ['microbusiness_density', 'active']
df = create_lags(df, 'cfips', columns_to_lag, LAGS)

# Target: next-row log change of the density, per county
df = create_target(df, 'cfips', 'microbusiness_density', mask, 'target')

In [42]:
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

def get_NN_cfips_by_feats(df, N, p, metric, feature):
    """
    Find each county's nearest neighbours from the time series of one feature.

    ``df`` is pivoted to one row per cfips and one column per
    ``first_day_of_month`` (missing cells filled with 0), and a
    ``NearestNeighbors`` model with ``n_neighbors=N`` is fitted on and queried
    with that table.

    Returns a DataFrame indexed by cfips whose cells are neighbour cfips codes.
    The first neighbour column is discarded (presumably the county itself),
    leaving columns labelled 1 .. N-1.
    """
    long_form = df.reset_index()[['cfips', 'first_day_of_month', feature]]
    wide = long_form.pivot(index='cfips', columns='first_day_of_month', values=feature)
    wide.columns.name = None
    wide = wide.fillna(0)

    model = NearestNeighbors(n_neighbors=N, p=p, metric=metric)
    model.fit(wide)
    positions = model.kneighbors(wide, return_distance=False)

    # kneighbors reports row positions; translate them back to cfips labels.
    by_label = pd.DataFrame(positions, index=wide.index).apply(lambda col: wide.index[col])
    return by_label.iloc[:, 1:]

def get_NN_cfips_by_census(census, N, p, metric, cfips=None):
    """
    Get nearest neighbors based on census data.

    Parameters
    ----------
    census : pandas.DataFrame
        Census table indexed by cfips. Rows containing missing values are
        forward-filled across their columns **in place** (existing behaviour,
        kept so callers observe the same side effect).
    N : int
        ``n_neighbors`` for ``NearestNeighbors``. The first neighbour column is
        discarded (presumably the county itself), so ``N - 1`` neighbours are
        returned per county.
    p, metric
        Forwarded unchanged to ``NearestNeighbors``.
    cfips : array-like, optional
        County codes to restrict the search to, in output order. When omitted
        the function falls back to the unique ``cfips`` values of the
        module-level ``df``, which is what it always used before.

    Returns
    -------
    pandas.DataFrame
        Indexed by cfips, with columns ``NN_1`` .. ``NN_{N-1}`` holding the
        cfips codes of the neighbours. For ``N=4`` this is the same
        ``NN_1``/``NN_2``/``NN_3`` layout as before; other values of ``N`` no
        longer fail or leave positional indices unmapped.
    """
    if cfips is None:
        # Backward-compatible fallback on the notebook-level frame.
        cfips = df.cfips.unique()

    # Forward-fill along each incomplete row (left-to-right across columns).
    rows_with_na = census.index[census.isna().any(axis=1)]
    census.loc[rows_with_na] = census.loc[rows_with_na].ffill(axis=1)

    census = census.loc[cfips]
    nn_census = NearestNeighbors(n_neighbors=N, p=p, metric=metric)
    nn_census.fit(census)
    neighbors = nn_census.kneighbors(census, return_distance=False)

    # kneighbors returns row positions; map every kept column back to cfips
    # labels in one vectorised lookup, whatever N is.
    labels = census.index.to_numpy()
    return pd.DataFrame(
        labels[neighbors[:, 1:]],
        index=census.index,
        columns=[f'NN_{rank}' for rank in range(1, neighbors.shape[1])],
    )

def generate_feature(df_nn, df, feature, colname):
    """
    Collect a feature's values from each county's nearest neighbours.

    For every cfips in ``df`` (in order of first appearance) the neighbour
    codes are read from the matching row of ``df_nn``; each neighbour's
    ``feature`` values become one column named ``{colname}_NN_{k}``. The
    per-county tables are stacked vertically and returned with a fresh
    0..n-1 index.
    """
    by_county = df.groupby('cfips')
    frames = []
    for county in tqdm(df.cfips.unique()):
        neighbours = df_nn.loc[county].values.tolist()
        # One row per neighbour, transposed below to one column per neighbour.
        stacked = np.asarray([by_county.get_group(code)[feature].values for code in neighbours])
        labels = [f'{colname}_NN_{k}' for k, _ in enumerate(neighbours)]
        frames.append(pd.DataFrame(stacked.T, columns=labels))
    return pd.concat(frames, axis=0).reset_index(drop=True)

def combine_nn_feats(df, df_nn):
    """
    Attach neighbour features to the main frame.

    ``df``'s index is moved back into a column, ``df_nn`` is concatenated
    column-wise (aligned on the resulting index), the ``first_day_of_month``
    column is dropped, and ``row_id`` is restored as the index.
    """
    flat = df.reset_index()
    widened = pd.concat([flat, df_nn], axis=1)
    return widened.drop(columns='first_day_of_month').set_index('row_id')


def generate_lagged_features(df_nn, df, feature, prefix, lags=(1, 2, 3, 4, 5)):
    """
    Build neighbour features for several lagged versions of one feature.

    Parameters
    ----------
    df_nn : pandas.DataFrame
        Neighbour table, passed through to ``generate_feature``.
    df : pandas.DataFrame
        Frame containing the ``{feature}_lag_{lag}`` columns.
    feature : str
        Base name of the lagged source columns.
    prefix : str
        Base name for the generated columns (``{prefix}_lag_{lag}``).
    lags : iterable of int, optional
        Lags to generate. Defaults to 1..5, the range that used to be
        hard-coded, so existing calls behave exactly as before.

    Returns
    -------
    list
        One ``generate_feature`` result per lag, in the order of ``lags``.
    """
    return [
        generate_feature(df_nn, df, f"{feature}_lag_{lag}", f"{prefix}_lag_{lag}")
        for lag in lags
    ]

KNN = True

if KNN:
    # Neighbour tables: by census columns, by density history, by target history
    df_nn_c = get_NN_cfips_by_census(census, N=4, p=1, metric='manhattan')
    df_nn_md, df_nn_md_log_diff = (
        get_NN_cfips_by_feats(df, N=4, p=1, metric='manhattan', feature=_feat)
        for _feat in ('microbusiness_density', 'target')
    )

    # (neighbour table, column-name tag) pairs, in the order features are built
    neighbour_sets = ((df_nn_c, 'c'), (df_nn_md, '1'), (df_nn_md_log_diff, 'ld'))

    # State-average log-diff of each county's neighbours
    md_state_avg_diff_features = [
        generate_feature(_table, df, 'md_state_avg_log_diff', f'md_state_avg_{_tag}')
        for _table, _tag in neighbour_sets
    ]

    # Lagged density of each county's neighbours
    md_lag_features_c, md_lag_features_md, md_lag_features_md_log_diff = (
        generate_lagged_features(_table, df, 'microbusiness_density', f'md_{_tag}')
        for _table, _tag in neighbour_sets
    )

    # Stack every generated block side by side
    NN_feats_list = (
        md_state_avg_diff_features
        + md_lag_features_c
        + md_lag_features_md
        + md_lag_features_md_log_diff
    )
    NN_feats = pd.concat(NN_feats_list, axis=1)

    # Attach them to the main frame (this also drops first_day_of_month)
    df = combine_nn_feats(df, NN_feats)

else:
    df = df.drop(columns='first_day_of_month')


100%|██████████| 3085/3085 [00:01<00:00, 2015.50it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2102.78it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2189.40it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2217.10it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2135.40it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2120.26it/s]
100%|██████████| 3085/3085 [00:01<00:00, 1756.62it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2227.92it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2216.97it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2155.69it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2068.40it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2417.59it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2480.19it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2249.60it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2143.57it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2126.84it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2253.34it/s]
100%|██████████| 3085/3085 [00:01<00:00, 2074.81it/s]


In [43]:
# Split into train and test
# Recompute the target on the raw training table (as 'target1'), keyed by row_id.
mask = train_df.microbusiness_density.eq(0) & train_df.active.eq(0)
train_df = create_target(train_df, 'cfips', 'microbusiness_density', mask, 'target1').set_index('row_id')

# Rows with a target form the training split; rows without one form the test
# split, whose gaps are then filled from train_df['target1'] where available.
has_target = df.target.notna()
df_train = df[has_target]
df_test = fill_target_with_train(df[~has_target], train_df)

# Replace remaining NaNs by 0 and separate the target column from the features.
df_train = df_train.fillna(0)
target_train = df_train.pop('target')

df_test = df_test.fillna(0)
target_test = df_test.pop('target')

# Persist features and targets.
df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')
target_train.to_csv('target_train.csv')
target_test.to_csv('target_test.csv')