In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

train_df = pd.read_csv('train.csv')
census = pd.read_csv('census.csv', index_col='cfips')
df_cleaned = pd.read_csv('df_cleaned.csv')
df = pd.merge(train_df, df_cleaned, on=['cfips', 'row_id'])
df = df.drop('Unnamed: 0', axis=1)
df = df.rename(columns={'< 30k pop': 'l_30k', '> 225k pop': 'g_225k'})

In [155]:
df.columns

Index(['row_id', 'cfips', 'county', 'state', 'first_day_of_month',
       'microbusiness_density', 'active', 'orders_rank', 'merchants_rank',
       'gmv_rank', 'avg_traffic', 'avg_lifespan_mths', '30k - 225k pop',
       'l_30k', 'g_225k', 'nrc_order', 'nrc_merch', 'nrc_gmv', 'confirmed',
       'deaths', 'people_vaccinated', 'people_fully_vaccinated',
       'school_closing', 'workplace_closing', 'cancel_events',
       'gatherings_restrictions', 'transport_closing',
       'stay_home_restrictions', 'internal_movement_restrictions',
       'international_movement_restrictions', 'information_campaigns',
       'testing_policy', 'contact_tracing', 'facial_coverings',
       'vaccination_policy', 'elderly_people_protection',
       'government_response_index', 'stringency_index',
       'containment_health_index', 'economic_support_index',
       'ESTIMATESBASE2020', 'POPESTIMATE2020', 'POPESTIMATE2021',
       'NPOPCHG2020', 'NPOPCHG2021', 'BIRTHS2020', 'BIRTHS2021', 'DEATHS2020',
    

In [156]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

def sliding_window_mean(df, feature, start, end):
    return df[feature].values[start:end + 1].mean()

def calculate_sliding_window_means(df, features, seq_len):
    means = {feature: [] for feature in features}
    for sub_seq_start in range(len(df) - seq_len + 1):
        sub_seq_end = sub_seq_start + seq_len - 1
        for feature in features:
            mean = sliding_window_mean(df, feature, sub_seq_start, sub_seq_end)
            means[feature].append(mean)
    return means

seq_len = 25

# Specify the features for which you want to calculate the sliding window mean
features_to_mean = ['microbusiness_density','active', 'confirmed', 'deaths', 'DGS10_last', 'total_pop']

train_seq_all = []
for cfips_value in tqdm(df.cfips.unique()):
    indices = (df['cfips'] == cfips_value)
    tmp = df.loc[indices].copy().reset_index(drop=True)

    sub_seq_all = []
    for sub_seq_start in range(len(tmp) - seq_len + 1):
        sub_seq_end = sub_seq_start + seq_len - 1
        sub_seq = tmp.loc[sub_seq_start:sub_seq_end, 'microbusiness_density'].reset_index(drop=True)
        sub_seq_all.append(sub_seq)
    sub_seq_all = pd.concat(sub_seq_all, axis=1).transpose()

    sub_seq_all['cfips'] = tmp.cfips.values[0]
    sub_seq_all['state'] = tmp.state.values[0]

    # Calculate the sliding window mean for each subsequence and feature
    means = calculate_sliding_window_means(tmp, features_to_mean, seq_len)
    for feature in features_to_mean:
        sub_seq_all[f"{feature}_average"] = means[feature]

    # Fit the scaler on your training data (excluding 'cfips' and 'state' columns)
    feature_average_columns = ["confirmed_average", "deaths_average"]
    feature_averages = sub_seq_all[feature_average_columns]
    scaler.fit(feature_averages)

    # Transform the feature averages
    feature_averages_scaled = scaler.transform(feature_averages)

    # Replace original feature averages with the scaled feature averages
    sub_seq_all[feature_average_columns] = feature_averages_scaled

    sub_seq_all['start_month'] = pd.to_datetime(tmp.first_day_of_month.values[0: sub_seq_start + 1])
    sub_seq_all['population_average'] = np.round(np.mean(tmp.active.values/tmp.microbusiness_density.values * 100))
    train_seq_all.append(sub_seq_all)

train_seq_all = pd.concat(train_seq_all, axis=0).reset_index(drop=True)

train_seq_all = train_seq_all.rename(columns={24: 'target'})


  sub_seq_all['population_average'] = np.round(np.mean(tmp.active.values/tmp.microbusiness_density.values * 100))
100%|██████████| 3085/3085 [00:11<00:00, 279.64it/s]


In [157]:
print(train_seq_all.columns.tolist())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 'target', 'cfips', 'state', 'microbusiness_density_average', 'active_average', 'confirmed_average', 'deaths_average', 'DGS10_last_average', 'total_pop_average', 'start_month', 'population_average']


In [158]:
train_seq_all[train_seq_all.cfips == 1001]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,cfips,state,microbusiness_density_average,active_average,confirmed_average,deaths_average,DGS10_last_average,total_pop_average,start_month,population_average
0,3.007682,2.88487,3.055843,2.993233,2.993233,2.96909,2.909326,2.933231,3.000167,3.004948,...,1001,Alabama,3.070867,1286.24,0.0,0.0,1.228,2264340.16,2019-08-01,42020.0
1,2.88487,3.055843,2.993233,2.993233,2.96909,2.909326,2.933231,3.000167,3.004948,3.019292,...,1001,Alabama,3.078029,1290.04,0.070541,0.07025,1.1992,2263180.64,2019-09-01,42020.0
2,3.055843,2.993233,2.993233,2.96909,2.909326,2.933231,3.000167,3.004948,3.019292,3.083837,...,1001,Alabama,3.090767,1296.16,0.147981,0.154177,1.2,2261922.52,2019-10-01,42020.0
3,2.993233,2.993233,2.96909,2.909326,2.933231,3.000167,3.004948,3.019292,3.083837,3.174679,...,1001,Alabama,3.096571,1299.4,0.226501,0.240039,1.1948,2260629.36,2019-11-01,42020.0
4,2.993233,2.96909,2.909326,2.933231,3.000167,3.004948,3.019292,3.083837,3.174679,3.205756,...,1001,Alabama,3.108294,1305.12,0.309698,0.331503,1.1844,2259285.28,2019-12-01,42020.0
5,2.96909,2.909326,2.933231,3.000167,3.004948,3.019292,3.083837,3.174679,3.205756,3.193804,...,1001,Alabama,3.120436,1311.44,0.407569,0.425256,1.174,2257909.32,2020-01-01,42020.0
6,2.909326,2.933231,3.000167,3.004948,3.019292,3.083837,3.174679,3.205756,3.193804,3.038416,...,1001,Alabama,3.13505,1318.44,0.51494,0.518525,1.1688,2256906.04,2020-02-01,42020.0
7,2.933231,3.000167,3.004948,3.019292,3.083837,3.174679,3.205756,3.193804,3.038416,3.002558,...,1001,Alabama,3.152148,1326.48,0.636316,0.637773,1.1816,2256460.24,2020-03-01,42020.0
8,3.000167,3.004948,3.019292,3.083837,3.174679,3.205756,3.193804,3.038416,3.002558,2.947244,...,1001,Alabama,3.169702,1334.72,0.755111,0.757225,1.2292,2256618.48,2020-04-01,42020.0
9,3.004948,3.019292,3.083837,3.174679,3.205756,3.193804,3.038416,3.002558,2.947244,3.106106,...,1001,Alabama,3.182226,1340.84,0.878685,0.880994,1.3168,2257218.68,2020-05-01,42020.0


In [159]:
train_seq_all.to_csv('1d_cnn_train.csv')