In [1]:
## Importing libraries
import boto3
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt 
import variable_engineering as ve
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', 200)

In [None]:
## Loading the MiceForest-imputed train and test sets (train first, then test)
train, test = [pd.read_csv(csv_path) for csv_path in (
    '/home/ec2-user/SageMaker/Analytics_Data_Science/American_Express/Evan/amex_train_cleaned.csv',
    '/home/ec2-user/SageMaker/Analytics_Data_Science/American_Express/Evan/amex_test_cleaned.csv',
)]

## Showing the first five rows of the train set
train.head(5)

In [2]:
## Connecting to the S3 bucket that stores the competition files
s3 = boto3.resource('s3')
bucket_name = 'evan-callaghan-bucket'
bucket = s3.Bucket(bucket_name)

## Object keys: train features first, train labels second
file_key = 'Kaggle-American-Express-Default/amex_train_data.csv'
file_key2 = 'Kaggle-American-Express-Default/amex_train_labels.csv'

## Requesting both objects (features, then labels)
bucket_object, bucket_object2 = bucket.Object(file_key), bucket.Object(file_key2)
file_object, file_object2 = bucket_object.get(), bucket_object2.get()

## Keeping the 'Body' entry of each response
file_content_stream, file_content_stream2 = file_object.get('Body'), file_object2.get('Body')

## Creating data-type dictionary for reading the train data-frame
dtype_dict = {'customer_ID': "object", 'S_2': "object", 'P_2': 'float16', 'D_39': 'float16', 'B_1': 'float16','B_2': 'float16', 'R_1': 'float16','S_3': 'float16','D_41': 'float16','B_3': 'float16','D_42': 'float16','D_43': 'float16','D_44': 'float16', 'B_4': 'float16','D_45': 'float16','B_5': 'float16','R_2': 'float16','D_46': 'float16','D_47': 'float16','D_48': 'float16', 'D_49': 'float16','B_6': 'float16','B_7': 'float16','B_8': 'float16','D_50': 'float16','D_51': 'float16','B_9': 'float16', 'R_3': 'float16','D_52': 'float16','P_3': 'float16','B_10': 'float16','D_53': 'float16','S_5': 'float16','B_11': 'float16', 'S_6': 'float16','D_54': 'float16','R_4': 'float16','S_7': 'float16','B_12': 'float16','S_8': 'float16','D_55': 'float16', 'D_56': 'float16','B_13': 'float16','R_5': 'float16','D_58': 'float16','S_9': 'float16','B_14': 'float16','D_59': 'float16', 'D_60': 'float16','D_61': 'float16','B_15': 'float16','S_11': 'float16','D_62': 'float16','D_63': 'object','D_64': 'object', 'D_65': 'float16','B_16': 'float16','B_17': 'float16','B_18': 'float16','B_19': 'float16','D_66': 'float16','B_20': 'float16', 'D_68': 'float16','S_12': 'float16','R_6': 'float16','S_13': 'float16','B_21': 'float16','D_69': 'float16','B_22': 'float16', 'D_70': 'float16','D_71': 'float16','D_72': 'float16','S_15': 'float16','B_23': 'float16','D_73': 'float16','P_4': 'float16', 'D_74': 'float16','D_75': 'float16','D_76': 'float16','B_24': 'float16','R_7': 'float16','D_77': 'float16','B_25': 'float16', 'B_26': 'float16','D_78': 'float16','D_79': 'float16','R_8': 'float16','R_9': 'float16','S_16': 'float16','D_80': 'float16', 'R_10': 'float16','R_11': 'float16','B_27': 'float16','D_81': 'float16','D_82': 'float16','S_17': 'float16','R_12': 'float16', 'B_28': 'float16','R_13': 'float16','D_83': 'float16','R_14': 'float16','R_15': 'float16','D_84': 'float16','R_16': 'float16', 'B_29': 'float16','B_30': 'float16','S_18': 'float16','D_86': 'float16','D_87': 'float16','R_17': 'float16','R_18': 
'float16', 'D_88': 'float16','B_31': 'int64','S_19': 'float16','R_19': 'float16','B_32': 'float16','S_20': 'float16','R_20': 'float16', 'R_21': 'float16','B_33': 'float16','D_89': 'float16','R_22': 'float16','R_23': 'float16','D_91': 'float16','D_92': 'float16', 'D_93': 'float16','D_94': 'float16','R_24': 'float16','R_25': 'float16','D_96': 'float16','S_22': 'float16','S_23': 'float16', 'S_24': 'float16','S_25': 'float16','S_26': 'float16','D_102': 'float16','D_103': 'float16','D_104': 'float16','D_105': 'float16', 'D_106': 'float16','D_107': 'float16','B_36': 'float16','B_37': 'float16', 'R_26': 'float16','R_27': 'float16','B_38': 'float16', 'D_108': 'float16','D_109': 'float16','D_110': 'float16','D_111': 'float16','B_39': 'float16','D_112': 'float16','B_40': 'float16', 'S_27': 'float16','D_113': 'float16','D_114': 'float16','D_115': 'float16','D_116': 'float16','D_117': 'float16','D_118': 'float16', 'D_119': 'float16','D_120': 'float16','D_121': 'float16','D_122': 'float16','D_123': 'float16','D_124': 'float16','D_125': 'float16', 'D_126': 'float16','D_127': 'float16','D_128': 'float16','D_129': 'float16','B_41': 'float16','B_42': 'float16','D_130': 'float16', 'D_131': 'float16','D_132': 'float16','D_133': 'float16','R_28': 'float16','D_134': 'float16','D_135': 'float16','D_136': 'float16', 'D_137': 'float16','D_138': 'float16','D_139': 'float16','D_140': 'float16','D_141': 'float16','D_142': 'float16','D_143': 'float16', 'D_144': 'float16','D_145': 'float16'}

## Payment and Spend variables (plus the customer key) kept for this analysis
selected_columns = ['customer_ID', 'P_2', 'P_3', 'P_4', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_17',
                    'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']

## Reading the data
## Only the selected columns are parsed, so the remaining columns of the file are never held in memory
train = pd.read_csv(file_content_stream, dtype = dtype_dict, usecols = selected_columns)

## The labels file is read with its own dtype map (only the key needs to be forced to a string)
labels = pd.read_csv(file_content_stream2, dtype = {'customer_ID': 'object'})

## usecols keeps the column order of the file, so the data-frame is re-ordered to the selection order
train = train[selected_columns]

## Appending target variables
train = pd.merge(train, labels, on = 'customer_ID', how = 'left')

## Printing the first five observations
train.head()

Unnamed: 0,customer_ID,P_2,P_3,P_4,S_3,S_5,S_6,S_7,S_8,S_9,S_11,S_12,S_13,S_15,S_16,S_17,S_18,S_19,S_20,S_22,S_23,S_24,S_25,S_26,S_27,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.938477,0.736328,0.007553,0.124023,0.023376,0.008324,0.161377,0.922852,0.065735,0.401611,0.271973,0.515137,0.108276,0.002272,0.008034,0.005722,0.002537,0.009705,0.894043,0.13562,0.911133,0.974609,0.001244,0.676758,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.936523,0.720703,0.004833,0.126709,0.030594,0.002481,0.140991,0.919434,0.093933,0.40625,0.188965,0.509277,0.101013,0.009811,0.000761,0.007584,0.00843,0.009926,0.902344,0.136353,0.919922,0.975586,0.004562,0.822266,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.954102,0.738281,0.006561,0.123962,0.04837,0.00053,0.112244,1.001953,0.084778,0.406738,0.495361,0.679199,0.10321,0.009361,0.004055,0.005901,0.007328,0.008446,0.939453,0.134888,0.958496,0.974121,0.011734,0.853516,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.960449,0.741699,0.00956,0.117188,0.03006,0.000783,0.102844,0.704102,0.04837,0.405273,0.508789,0.515137,0.206421,0.004875,0.006969,0.00252,0.007053,0.006615,0.913086,0.140015,0.92627,0.975586,0.007572,0.844727,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.947266,0.691895,0.008156,0.11731,0.05423,0.006699,0.094299,0.916992,0.039246,0.487549,0.216553,0.507812,0.106018,0.007446,0.00177,0.000155,0.007729,0.005512,0.920898,0.131592,0.933594,0.978027,0.018204,0.811035,0


### Variable Engineering:

#### Payment Variables

In [None]:
## Listing the Payment variables (column names starting with 'P_')
p_variables = train.columns
list(filter(lambda column: column.startswith('P_'), p_variables))

In [None]:
## Box-plot of every Payment variable by target on a 2 x 2 grid
## (three variables, so the bottom-right panel stays empty)
fig, axes = plt.subplots(2, 2, figsize = (18, 12))

for panel, variable in zip(axes.flat, ['P_2', 'P_3', 'P_4']):
    sns.boxplot(ax = panel, x = 'target', y = variable, hue = 'target', data = train)

In [None]:
## Per-customer totals of the 'P' variables, joined with the customer's target
p_columns = ['P_2', 'P_3', 'P_4']
sum_value = train.groupby('customer_ID')[p_columns].sum().reset_index(drop = False)
target = train.groupby('customer_ID')['target'].max().reset_index(drop = False)
target_sums = sum_value.merge(target, how = 'left', on = 'customer_ID')
target_sums.columns = ['customer_ID'] + [column + '_sum' for column in p_columns] + ['target']

## Box-plot of every total by target (bottom-right panel stays empty)
fig, axes = plt.subplots(2, 2, figsize = (18, 12))

for panel, column in zip(axes.flat, p_columns):
    sns.boxplot(ax = panel, x = 'target', y = column + '_sum', hue = 'target', data = target_sums)

In [4]:
## Engineering the Payment features with the variable_engineering.py file
## (ve.create replaces the earlier one-statistic-at-a-time ve.create_var calls for P_2 and P_3)
P_2_vars, P_3_vars = [ve.create(train, variable) for variable in ('P_2', 'P_3')]

In [5]:
## Printing the first five engineered P_2 observations
P_2_vars.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_median,P_2_sum,P_2_min,P_2_max
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.933594,0.938477,12.140625,0.868652,0.960449
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.899902,0.904785,11.695312,0.861328,0.929199
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.878418,0.884766,11.421875,0.797852,0.904297
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.599121,0.598145,7.785156,0.567383,0.623535
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.891602,0.879395,11.59375,0.805176,0.94043


In [3]:
## Printing the first five observations
train.head()

Unnamed: 0,customer_ID,P_2,P_3,P_4,S_3,S_5,S_6,S_7,S_8,S_9,S_11,S_12,S_13,S_15,S_16,S_17,S_18,S_19,S_20,S_22,S_23,S_24,S_25,S_26,S_27,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.938477,0.736328,0.007553,0.124023,0.023376,0.008324,0.161377,0.922852,0.065735,0.401611,0.271973,0.515137,0.108276,0.002272,0.008034,0.005722,0.002537,0.009705,0.894043,0.13562,0.911133,0.974609,0.001244,0.676758,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.936523,0.720703,0.004833,0.126709,0.030594,0.002481,0.140991,0.919434,0.093933,0.40625,0.188965,0.509277,0.101013,0.009811,0.000761,0.007584,0.00843,0.009926,0.902344,0.136353,0.919922,0.975586,0.004562,0.822266,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.954102,0.738281,0.006561,0.123962,0.04837,0.00053,0.112244,1.001953,0.084778,0.406738,0.495361,0.679199,0.10321,0.009361,0.004055,0.005901,0.007328,0.008446,0.939453,0.134888,0.958496,0.974121,0.011734,0.853516,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.960449,0.741699,0.00956,0.117188,0.03006,0.000783,0.102844,0.704102,0.04837,0.405273,0.508789,0.515137,0.206421,0.004875,0.006969,0.00252,0.007053,0.006615,0.913086,0.140015,0.92627,0.975586,0.007572,0.844727,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.947266,0.691895,0.008156,0.11731,0.05423,0.006699,0.094299,0.916992,0.039246,0.487549,0.216553,0.507812,0.106018,0.007446,0.00177,0.000155,0.007729,0.005512,0.920898,0.131592,0.933594,0.978027,0.018204,0.811035,0


In [None]:
## Scratch helpers for prototyping per-customer aggregations.
## The original fragments referenced undefined names (data, variable, x) and assigned to a
## list literal, so the cell could not run; the free names are now function parameters.

def max_by_customer(data, variable):
    """
    Maximum of one variable per customer.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'customer_ID' column and the column named by `variable`.
    variable : str
        Name of the column to aggregate.

    Returns
    -------
    pd.DataFrame
        One row per customer_ID with the columns ['customer_ID', variable].

    Example: max_temp = max_by_customer(train, 'P_2')
    """
    return pd.DataFrame(data.groupby(['customer_ID'])[variable].max()).reset_index(drop = False)


def avg_pct_change(x, variable = 'D_65'):
    """
    Mean row-over-row percentage change of x[variable].

    Parameters
    ----------
    x : pd.DataFrame
        Observations of a single customer, in the order they should be compared.
    variable : str
        Name of the column to analyse (defaults to the 'D_65' used in the original fragment).

    Returns
    -------
    0 when there is exactly one observation (no change can be computed), otherwise the
    mean of pandas' pct_change over the values. A zero value followed by a non-zero one
    produces an infinite change, which is not filtered here.
    """
    if x[variable].shape[0] == 1:
        return 0

    ## to_list() drops the original index so pct_change compares consecutive rows
    return pd.Series(x[variable].to_list()).pct_change().mean()

In [7]:
def new_features(data):
    
    temp = {}
    temp[name + '_mean'] = data[name].mean()
    temp[name + '_median'] = data[name].median()
    temp[name + '_min'] = data[name].min()
    temp[name + '_max'] = data[name].max()
    temp[name + '_range'] = np.where(data[name].shape[0] == 1, 0, data[name].max() - data[name].min())
    temp[name + '_iqr'] = np.where(data[name].shape[0] == 1, 0, np.percentile(data[name], 75) - np.percentile(data[name], 25))
    temp[name + '_std'] = np.where(data[name].shape[0] == 1, 0, np.std(data[name], ddof = 1))
#     d['D_65_negative_count'] = np.sum(x['D_65'] < 0) 
#     d['D_65_positive_count'] = np.sum(x['D_65'] > 0)
#     d['D_65_pct_values_above_mean'] = np.where(x[name].shape[0] == 1, 0, np.sum(x[name] > x[name].mean())/x[name].shape[0])
#     d['D_65_avg_pct_change'] = np.where(x[name].shape[0] == 1, 0, pd.Series(x[name].to_list()).pct_change().mean())
    
#     return pd.Series(d, index = ['D_65_mean', 'D_65_median', 'D_65_min', 'D_65_max', 'D_65_range', 'D_65_IQR', 'D_65_std', 'D_65_pct_values_above_mean', 'D_65_avg_pct_change'])

    return temp

In [9]:
data_out = train.groupby('customer_ID').apply(new_features, **kwargs = 'P_2')
data_out['customer_ID'] = data_out.index
data_out = data_out.reset_index(drop = True)

NameError: name 'name' is not defined

In [None]:
## Combining the Payment feature data-frames into a single object
## P_2_vars and P_3_vars come from ve.create (one row per customer_ID). The individual
## P_2_mean / P_2_median / ... frames are no longer created, so the mean, median and sum
## features are taken from those two data-frames instead.
## NOTE(review): assumes P_3_vars follows the same '<variable>_<statistic>' column naming shown for P_2_vars
payment_stats = ['mean', 'median', 'sum']

payment_vars = P_2_vars[['customer_ID'] + ['P_2_' + stat for stat in payment_stats]].merge(
    P_3_vars[['customer_ID'] + ['P_3_' + stat for stat in payment_stats]],
    on = 'customer_ID', how = 'inner', validate = 'one_to_one')

payment_vars.head()

#### Spend Variables

In [None]:
## Listing the Spend variables (column names starting with 'S_')
s_variables = train.columns
list(filter(lambda column: column.startswith('S_'), s_variables))

In [None]:
## Box-plot of every Spend variable by target on an 11 x 2 grid, filled row by row
## (21 variables, so the last panel stays empty)
fig, axes = plt.subplots(11, 2, figsize = (18, 100))

spend_columns = ['S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16',
                 'S_17', 'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']

for panel, variable in zip(axes.flat, spend_columns):
    sns.boxplot(ax = panel, x = 'target', y = variable, hue = 'target', data = train)

In [None]:
## Per-customer totals of the 'S' variables, joined with the customer's target
s_columns = ['S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9',
             'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_17',
             'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24',
             'S_25', 'S_26', 'S_27']

sum_value = train.groupby('customer_ID')[s_columns].sum().reset_index(drop = False)
target = train.groupby('customer_ID')['target'].max().reset_index(drop = False)
target_sums = sum_value.merge(target, how = 'left', on = 'customer_ID')

## Renaming: key, one '<variable>_sum' per Spend variable, target
target_sums.columns = ['customer_ID'] + [column + '_sum' for column in s_columns] + ['target']

In [None]:
## Box-plot of every per-customer Spend total by target on an 11 x 2 grid, filled row by row
## (21 totals, so the last panel stays empty)
fig, axes = plt.subplots(11, 2, figsize = (18, 100))

spend_sum_columns = [variable + '_sum' for variable in
                     ['S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16',
                      'S_17', 'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']]

for panel, column in zip(axes.flat, spend_sum_columns):
    sns.boxplot(ax = panel, x = 'target', y = column, hue = 'target', data = target_sums)

#### Selections: S_3, S_3_sum, S_6_sum, S_7, S_7_sum, S_8, S_8_sum, S_13_sum, S_15, S_15_sum

In [None]:
## Engineering the selected Spend features with the variable_engineering.py file
## Variables picked for mean / median / sum get all three statistics; S_6 and S_13 only get the sum
all_stats = ('mean', 'median', 'sum')

S_3_mean, S_3_median, S_3_sum = [ve.create_var(train, 'S_3', stat, 'S_3_' + stat) for stat in all_stats]

S_6_sum = ve.create_var(train, 'S_6', 'sum', 'S_6_sum')

S_7_mean, S_7_median, S_7_sum = [ve.create_var(train, 'S_7', stat, 'S_7_' + stat) for stat in all_stats]

S_8_mean, S_8_median, S_8_sum = [ve.create_var(train, 'S_8', stat, 'S_8_' + stat) for stat in all_stats]

S_13_sum = ve.create_var(train, 'S_13', 'sum', 'S_13_sum')

S_15_mean, S_15_median, S_15_sum = [ve.create_var(train, 'S_15', stat, 'S_15_' + stat) for stat in all_stats]

## Joining side by side: the first data-frame is kept whole, every other one contributes its second column only
spend_frames = [S_3_mean, S_3_median, S_3_sum, S_6_sum, S_7_mean, S_7_median, S_7_sum,
                S_8_mean, S_8_median, S_8_sum, S_13_sum, S_15_mean, S_15_median, S_15_sum]
spend_vars = pd.concat(spend_frames[:1] + [frame.iloc[:, 1] for frame in spend_frames[1:]], axis = 1)

spend_vars.head()

In [None]:
## Looking up the customer_ID stored at index label 2
spend_vars.loc[2, 'customer_ID']

In [None]:
## Inspecting all rows of a single customer
train.loc[train['customer_ID'] == '00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1']

#### Merging newly created variables into single data-frame:

In [None]:
## Placing the Payment and Spend features side by side
## (the first column of spend_vars is left out of the result)
spend_features = spend_vars.iloc[:, 1:]
cleaned = pd.concat([payment_vars, spend_features], axis = 1)

cleaned.head(5)

In [None]:
## Checking the dimensions (rows, columns) of the merged data-frame
cleaned.shape

### Variable Importance

In [None]:
## Using the train data-frame to rank the raw Payment and Spend variables by importance

## Defining the input and target variables
X = train[['P_2', 'P_3', 'P_4', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9', 'S_11', 'S_12', 'S_13', 'S_15', 
           'S_16', 'S_17', 'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27']]
Y = train['target']

## Number of split / fit repetitions whose importances are averaged
n_repeats = 5

## Defining a list to store the importance scores of every repetition
importance_runs = []

## Repeating the process n_repeats times
for i in tqdm(range(0, n_repeats)):
    
    ## Splitting the data (20% is used for fitting); seeded with the repetition number so the
    ## ranking is reproducible while every repetition still sees a different split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.8, stratify = Y, random_state = i)
    
    ## Building the model
    rf_md = RandomForestClassifier(max_depth = 3, n_estimators = 100, random_state = i).fit(X_train, Y_train)
    
    ## Extracting feature importance scores
    importance_runs.append(rf_md.feature_importances_)
    
## Averaging the scores of every variable across repetitions
mean_importance = pd.DataFrame(importance_runs, columns = X.columns).mean(axis = 0)

## Sorting variables by importance
results = pd.DataFrame({'Feature': mean_importance.index, 'Importance': mean_importance.values}).sort_values(by = 'Importance', ascending = False)

results