In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
import warnings
import time
import sys
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / (1024 ** 2)    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / (1024 ** 2)
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
def binarize(df):
    for col in ('authorized_flag', 'category_1'):
        df[col] = df[col].map({'Y': 1, 'N': 0})
    return df

In [4]:
#np.iinfo?

In [5]:
def read_data(input_file_patch):
    df = pd.read_csv(input_file_patch)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    df['elapsed_time'] = (datetime.date(2018, 2, 1) - df['first_active_month'].dt.date).dt.days
    return df

In [76]:
def aggregate_transactions(history):
    history.loc[:, 'purchase_date'] = pd.DatetimeIndex(history['purchase_date']).astype(np.int64) * 1e-9
    print("\n after history.loc\n", history.head())
    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_month': ['mean', 'max', 'min', 'std'],
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['mean', 'max', 'min', 'std'],
        'month_diff': ['mean']
    }
    agg_history = history.groupby(['card_id']).agg(agg_func)
    print("\n 1 agg_history:\n", agg_history.head())
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    print("\n 2 agg_history:\n", agg_history.head())
    agg_history.reset_index(inplace=True)
    print("\n 3 agg_history:\n", agg_history.head())
    df = (history.groupby('card_id').size().reset_index(name='transactions_count'))
    print("\n 4 df:\n", agg_history.head())
    agg_history = pd.merge(df, agg_history, on='card_id', how='left')
    print("\n 5 agg_history:\n", agg_history.head())
    return agg_history

In [68]:
def aggregate_per_month(history):
    grouped = history.groupby(['card_id', 'month_lag'])
    
    agg_func = {
        'purchase_amount': ['count', 'sum', 'mean', 'min', 'max', 'std'],
        'installments': ['count', 'sum', 'mean', 'min', 'max', 'std'],
    }
    
    intermediate_group = grouped.agg(agg_func)
    intermediate_group = ['_'.join(col).strip() for col in intermediate_group.columns.values]
    intermediate_group.reset_index(inplace=True)
    
    final_group = intermediate_group.groupby('card_id').agg(['mean', 'std'])
    final_group = ['_'.join(col).strip() for col in final_group.columns.values]
    final_group.reset_index(inplace=True)
    
    return final_group

In [6]:
new_transactions_df = pd.read_csv('../input/new_merchant_transactions.csv', parse_dates=['purchase_date'])

In [7]:
historical_transactions_df = pd.read_csv('../input/historical_transactions.csv', parse_dates=['purchase_date'])

In [8]:
new_transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [9]:
historical_transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37


In [10]:
new_transactions_df = binarize(new_transactions_df)

In [11]:
historical_transactions_df = binarize(historical_transactions_df)

In [19]:
print(historical_transactions_df[historical_transactions_df['authorized_flag'] == 1].shape[0])
print(historical_transactions_df[historical_transactions_df['authorized_flag'] == 0].shape[0])
print(historical_transactions_df[historical_transactions_df['category_1'] == 1].shape[0])
print(historical_transactions_df[historical_transactions_df['category_1'] == 0].shape[0])

26595452
2516909
2084029
27028332


In [23]:
train = read_data('../input/train.csv')

In [24]:
test = read_data('../input/test.csv')

In [25]:
target = train['target']
del train['target']

In [26]:
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,elapsed_time
0,2017-06-01,C_ID_92a2005557,5,2,1,245
1,2017-01-01,C_ID_3d0044924f,4,1,0,396
2,2016-08-01,C_ID_d639edf6cd,2,2,0,549
3,2017-09-01,C_ID_186d6a6901,4,3,0,153
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,92


In [27]:
historical_transactions_df['month_diff'] = (
    (datetime.datetime.today() - historical_transactions_df['purchase_date']).dt.days
) // 30
historical_transactions_df['month_diff'] += historical_transactions_df['month_lag']

new_transactions_df['month_diff'] = ((datetime.datetime.today() - new_transactions_df['purchase_date']).dt.days) // 30
new_transactions_df['month_diff'] += new_transactions_df['month_lag']

In [28]:
historical_transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_diff
0,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,11
1,1,C_ID_4e6213e9bc,88,0,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,11
2,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,12
3,1,C_ID_4e6213e9bc,88,0,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,12
4,1,C_ID_4e6213e9bc,88,0,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,12


In [30]:
historical_transactions_df = pd.get_dummies(historical_transactions_df, columns=['category_2', 'category_3'])

In [31]:
new_transactions_df = pd.get_dummies(new_transactions_df, columns=['category_2', 'category_3'])

In [32]:
historical_transactions_df.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,month_diff,category_2_1.0,category_2_2.0,category_2_3.0,category_2_4.0,category_2_5.0,category_3_A,category_3_B,category_3_C
0,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,16,37,11,1,0,0,0,0,1,0,0
1,1,C_ID_4e6213e9bc,88,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,16,16,11,1,0,0,0,0,1,0,0
2,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,16,37,12,1,0,0,0,0,1,0,0
3,1,C_ID_4e6213e9bc,88,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,16,34,12,1,0,0,0,0,1,0,0
4,1,C_ID_4e6213e9bc,88,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,16,37,12,1,0,0,0,0,1,0,0


In [33]:
historical_transactions_df = reduce_mem_usage(historical_transactions_df)

Mem. usage decreased to 1332.66 Mb (57.1% reduction)


In [34]:
new_transactions_df = reduce_mem_usage(new_transactions_df)

Mem. usage decreased to 86.12 Mb (58.9% reduction)


In [47]:
agg_fun = {'authorized_flag' : ('mean',)}
auth_mean = historical_transactions_df.groupby('card_id').agg(agg_fun)

In [52]:
#print(type(auth_mean))

In [53]:
#auth_mean.columns

In [54]:
#auth_mean.head()

In [50]:
auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns.values]

In [56]:
auth_mean.head()

Unnamed: 0_level_0,authorized_flag_mean
card_id,Unnamed: 1_level_1
C_ID_00007093c1,0.765101
C_ID_0001238066,0.97561
C_ID_0001506ef0,0.939394
C_ID_0001793786,0.875
C_ID_000183fdda,0.951389


In [57]:
auth_mean.reset_index(inplace=True)

In [58]:
auth_mean.head()

Unnamed: 0,card_id,authorized_flag_mean
0,C_ID_00007093c1,0.765101
1,C_ID_0001238066,0.97561
2,C_ID_0001506ef0,0.939394
3,C_ID_0001793786,0.875
4,C_ID_000183fdda,0.951389


In [59]:
auth_mean.index

RangeIndex(start=0, stop=325540, step=1)

In [60]:
auth_mean.columns

Index(['card_id', 'authorized_flag_mean'], dtype='object')

In [61]:
authorized_transactions_df = historical_transactions_df[historical_transactions_df['authorized_flag'] == 1]
historical_transactions_df = historical_transactions_df[historical_transactions_df['authorized_flag'] == 0]

In [64]:
print(authorized_transactions_df.shape)
print(historical_transactions_df.shape)

(26595452, 21)
(2516909, 21)


In [66]:
historical_transactions_df['purchase_month'] = historical_transactions_df['purchase_date'].dt.month
authorized_transactions_df['purchase_month'] = authorized_transactions_df['purchase_date'].dt.month
new_transactions_df['purchase_month'] = new_transactions_df['purchase_date'].dt.month

In [82]:
history = aggregate_transactions(historical_transactions_df)


 after history.loc
      authorized_flag          card_id  city_id  category_1  installments  \
115                0  C_ID_4e6213e9bc       88           0             0   
132                0  C_ID_4e6213e9bc       88           0             0   
148                0  C_ID_4e6213e9bc       88           0             0   
168                0  C_ID_4e6213e9bc      333           0             0   
213                0  C_ID_4e6213e9bc       88           0             0   

     merchant_category_id      merchant_id  month_lag  purchase_amount  \
115                   842  M_ID_22c9cfa265        -10        -0.730379   
132                   367  M_ID_86ec983688         -5        -0.723782   
148                   367  M_ID_86ec983688         -5        -0.723782   
168                   605  M_ID_c2ae34c2ef          0        -0.664262   
213                   560  M_ID_e6d5ae8ea6         -7        -0.738132   

     purchase_date  state_id  subsector_id  month_diff  category_2_1.0  \
115


 4 df:
            card_id  category_1_sum  category_1_mean  category_2_1.0_mean  \
0  C_ID_00007093c1             4.0         0.114286             0.000000   
1  C_ID_0001238066             0.0         0.000000             0.333333   
2  C_ID_0001506ef0             0.0         0.000000             0.000000   
3  C_ID_0001793786             2.0         0.074074             0.111111   
4  C_ID_000183fdda             0.0         0.000000             0.000000   

   category_2_2.0_mean  category_2_3.0_mean  category_2_4.0_mean  \
0             0.000000             0.885714                  0.0   
1             0.000000             0.000000                  0.0   
2             0.000000             1.000000                  0.0   
3             0.296296             0.111111                  0.0   
4             0.000000             1.000000                  0.0   

   category_2_5.0_mean  category_3_A_mean  category_3_B_mean  \
0             0.000000           0.000000           0.828571 

In [78]:
history.columns = ['hist_' + c if c != 'card_id' else c for c in history.columns]

In [79]:
history.head()

Unnamed: 0,card_id,hist_transactions_count,hist_category_1_sum,hist_category_1_mean,hist_category_2_1.0_mean,hist_category_2_2.0_mean,hist_category_2_3.0_mean,hist_category_2_4.0_mean,hist_category_2_5.0_mean,hist_category_3_A_mean,hist_category_3_B_mean,hist_category_3_C_mean,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_state_id_nunique,hist_city_id_nunique,hist_subsector_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_std,hist_installments_sum,hist_installments_mean,hist_installments_max,hist_installments_min,hist_installments_std,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_month_std,hist_purchase_date_ptp,hist_purchase_date_min,hist_purchase_date_max,hist_month_lag_mean,hist_month_lag_max,hist_month_lag_min,hist_month_lag_std,hist_month_diff_mean
0,C_ID_00007093c1,35,4.0,0.114286,0.0,0.0,0.885714,0.0,0.0,0.0,0.828571,0.171429,11,8,2,3,7,-14.401965,-0.411485,1.507069,-0.728876,0.430059,45,1.285714,3,1,0.667367,5.914286,12,1,3.071419,0.0,0.0,0.0,-6.028571,0,-11,3.535415,11.514286
1,C_ID_0001238066,3,0.0,0.0,0.333333,0.0,0.0,0.0,0.666667,0.0,1.0,0.0,2,1,2,2,1,-1.789928,-0.596643,-0.596643,-0.596643,0.0,3,1.0,1,1,0.0,8.666667,12,2,5.773503,0.0,0.0,0.0,-1.333333,0,-2,1.154701,11.0
2,C_ID_0001506ef0,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.75,0.25,0.0,4,4,1,1,4,-2.905782,-0.726445,-0.70859,-0.740491,0.016203,1,0.25,1,0,0.5,5.5,12,2,4.725816,0.0,0.0,0.0,-2.5,0,-8,3.785939,11.5
3,C_ID_0001793786,27,2.0,0.074074,0.111111,0.296296,0.111111,0.0,0.0,0.814815,0.185185,0.0,17,14,4,8,11,-11.825338,-0.437975,1.236592,-0.745405,0.49815,5,0.185185,1,0,0.395847,6.481481,9,3,2.375684,0.0,0.0,0.0,-3.518519,-1,-7,2.375684,15.444444
4,C_ID_000183fdda,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.428571,0.571429,4,4,1,1,4,-0.941381,-0.134483,0.334099,-0.714541,0.452741,19,2.714286,6,1,2.288689,8.285714,10,8,0.755929,0.0,0.0,0.0,-5.714286,-4,-6,0.755929,11.285714


In [80]:
authorized = aggregate_transactions(authorized_transactions_df)


 after history.loc
    authorized_flag          card_id  city_id  category_1  installments  \
0                1  C_ID_4e6213e9bc       88           0             0   
1                1  C_ID_4e6213e9bc       88           0             0   
2                1  C_ID_4e6213e9bc       88           0             0   
3                1  C_ID_4e6213e9bc       88           0             0   
4                1  C_ID_4e6213e9bc       88           0             0   

   merchant_category_id      merchant_id  month_lag  purchase_amount  \
0                    80  M_ID_e020e9b302         -8        -0.703331   
1                   367  M_ID_86ec983688         -7        -0.733128   
2                    80  M_ID_979ed661fc         -6        -0.720386   
3                   560  M_ID_e6d5ae8ea6         -5        -0.735352   
4                    80  M_ID_e020e9b302        -11        -0.722865   

   purchase_date  state_id  subsector_id  month_diff  category_2_1.0  \
0   1.498405e+09        16   


 4 df:
            card_id  category_1_sum  category_1_mean  category_2_1.0_mean  \
0  C_ID_00007093c1            24.0         0.210526             0.000000   
1  C_ID_0001238066             2.0         0.016667             0.783333   
2  C_ID_0001506ef0             0.0         0.000000             0.032258   
3  C_ID_0001793786             0.0         0.000000             0.042328   
4  C_ID_000183fdda             4.0         0.029197             0.051095   

   category_2_2.0_mean  category_2_3.0_mean  category_2_4.0_mean  \
0             0.000000             0.780702                  0.0   
1             0.000000             0.000000                  0.0   
2             0.000000             0.967742                  0.0   
3             0.359788             0.063492                  0.0   
4             0.007299             0.905109                  0.0   

   category_2_5.0_mean  category_3_A_mean  category_3_B_mean  \
0             0.008772                0.0           0.842105 

In [83]:
authorized.columns = ['auth_' + c if c != 'card_id' else c for c in authorized.columns]

In [84]:
authorized.head()

Unnamed: 0,card_id,auth_transactions_count,auth_category_1_sum,auth_category_1_mean,auth_category_2_1.0_mean,auth_category_2_2.0_mean,auth_category_2_3.0_mean,auth_category_2_4.0_mean,auth_category_2_5.0_mean,auth_category_3_A_mean,auth_category_3_B_mean,auth_category_3_C_mean,auth_merchant_id_nunique,auth_merchant_category_id_nunique,auth_state_id_nunique,auth_city_id_nunique,auth_subsector_id_nunique,auth_purchase_amount_sum,auth_purchase_amount_mean,auth_purchase_amount_max,auth_purchase_amount_min,auth_purchase_amount_std,auth_installments_sum,auth_installments_mean,auth_installments_max,auth_installments_min,auth_installments_std,auth_purchase_month_mean,auth_purchase_month_max,auth_purchase_month_min,auth_purchase_month_std,auth_purchase_date_ptp,auth_purchase_date_min,auth_purchase_date_max,auth_month_lag_mean,auth_month_lag_max,auth_month_lag_min,auth_month_lag_std,auth_month_diff_mean
0,C_ID_00007093c1,114,24.0,0.210526,0.0,0.0,0.780702,0.0,0.008772,0.0,0.842105,0.157895,28,18,3,4,13,-62.443077,-0.547746,1.206539,-0.728876,0.237426,147,1.289474,6,1,0.795159,6.517544,12,1,3.37149,32627654.0,1487081000.0,1519708000.0,-5.798246,0,-12,3.441495,11.596491
1,C_ID_0001238066,120,2.0,0.016667,0.783333,0.0,0.0,0.0,0.15,0.0,0.708333,0.266667,65,29,6,18,17,-70.657272,-0.588811,0.768095,-0.734887,0.192614,195,1.625,10,-1,1.50105,7.275,12,1,4.895483,13110825.0,1506638000.0,1519748000.0,-1.825,0,-5,1.294218,11.266667
2,C_ID_0001506ef0,62,0.0,0.0,0.032258,0.0,0.967742,0.0,0.0,1.0,0.0,0.0,28,19,2,3,12,-31.696098,-0.511227,1.493545,-0.740491,0.484575,0,0.0,0,0,0.0,6.887097,12,1,4.538017,34460275.0,1484411000.0,1518871000.0,-4.983871,0,-13,4.248402,11.645161
3,C_ID_0001793786,189,0.0,0.0,0.042328,0.359788,0.063492,0.0,0.0,1.0,0.0,0.0,114,45,4,9,22,-24.960674,-0.132067,4.554145,-0.737892,0.867916,0,0.0,0,0,0.0,6.698413,10,1,2.301491,24487497.0,1484994000.0,1509481000.0,-3.301587,0,-9,2.301491,15.349206
4,C_ID_000183fdda,137,4.0,0.029197,0.051095,0.007299,0.905109,0.0,0.007299,0.0,0.729927,0.240876,71,34,7,9,20,-67.896553,-0.495595,2.764788,-0.737892,0.52376,245,1.788321,10,-1,2.108912,6.810219,12,1,4.538289,15148616.0,1504444000.0,1519592000.0,-2.284672,0,-5,1.782055,11.445255


In [85]:
new = aggregate_transactions(new_transactions_df)


 after history.loc
    authorized_flag          card_id  city_id  category_1  installments  \
0                1  C_ID_415bb3a509      107           0             1   
1                1  C_ID_415bb3a509      140           0             1   
2                1  C_ID_415bb3a509      330           0             1   
3                1  C_ID_415bb3a509       -1           1             1   
4                1  C_ID_ef55cf8d4b       -1           1             1   

   merchant_category_id      merchant_id  month_lag  purchase_amount  \
0                   307  M_ID_b0c793002c          1        -0.557617   
1                   307  M_ID_88920c89e8          1        -0.569336   
2                   507  M_ID_ad5237ef6b          2        -0.551270   
3                   661  M_ID_9e84cda3b1          1        -0.671875   
4                   166  M_ID_3c86fa3831          1        -0.659668   

   purchase_date  state_id  subsector_id  month_diff  category_2_1.0  \
0   3.000000e-09         9   

KeyError: "Column 'purchase_month' does not exist!"