## Import Statements

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip3 install sklearn-deap
!pip3 install sklearn-genetic-opt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import operator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.feature_selection import SelectKBest, f_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn_genetic import GASearchCV
from sklearn_genetic import ExponentialAdapter
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

## Loading Datasets

In [5]:
df_train = pd.read_csv('/content/drive/MyDrive/SCS_TRAIN.csv')
df_test = pd.read_csv('/content/drive/MyDrive/SCS_TEST.csv')

In [6]:
print(df_train.shape)
print(df_test.shape)

(76020, 371)
(75818, 370)


In [7]:
df_train[['TARGET']]

Unnamed: 0,TARGET
0,0
1,0
2,0
3,0
4,0
...,...
76015,0
76016,0
76017,0
76018,0


## declaring y_train

In [8]:
y_train = df_train[['TARGET']].copy()
df_train.drop('TARGET',axis=1,inplace=True)

In [9]:
y_train

Unnamed: 0,TARGET
0,0
1,0
2,0
3,0
4,0
...,...
76015,0
76016,0
76017,0
76018,0


## Feature Selection and Hyperparameter tuning using smaller data points - 6000 Data points

In [184]:
# Data points size=6000
train6k = df_train[:6000]

In [143]:
ytrain6k = y_train[:6000]

In [155]:
mainFeatures = list(df_train.columns.values)

In [138]:
rfc = RandomForestClassifier()
params = {'min_weight_fraction_leaf': Continuous(0.01, 0.5, distribution='log-uniform'),
          'bootstrap': Categorical([True, False]),
          'max_depth': Integer(2, 30),
          'max_leaf_nodes': Integer(2, 35),
          'n_estimators': Integer(100, 300)}

In [139]:
mutation_adapter = ExponentialAdapter(initial_value=0.8, end_value=0.2, adaptive_rate=0.1)
crossover_adapter = ExponentialAdapter(initial_value=0.2, end_value=0.8, adaptive_rate=0.1)

In [140]:
cv = StratifiedKFold(n_splits=3, shuffle=True)

In [141]:
evolved_estimator = GASearchCV(estimator=rfc,
                               cv=cv,
                               scoring='accuracy',
                               population_size=20,
                               generations=2,
                               mutation_probability=mutation_adapter,
                               crossover_probability=crossover_adapter,
                               param_grid=params,
                               n_jobs=-1)

In [145]:
evolved_estimator.fit(train6k, ytrain6k)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.959167	2.22045e-16	0.959167   	0.959167   
1  	40    	0.959167	2.22045e-16	0.959167   	0.959167   
2  	40    	0.959167	2.22045e-16	0.959167   	0.959167   


GASearchCV(crossover_probability=<sklearn_genetic.schedules.schedulers.ExponentialAdapter object at 0x7f3ffc70ae50>,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
           estimator=RandomForestClassifier(bootstrap=False, max_depth=12,
                                            max_leaf_nodes=13,
                                            min_weight_fraction_leaf=0.055915906832559294,
                                            n_estimators=245),
           generations=2,
           mutation_probabil...
                       'max_depth': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70aa90>,
                       'max_leaf_nodes': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70a910>,
                       'min_weight_fraction_leaf': <sklearn_genetic.space.space.Continuous object at 0x7f3ffc73e7d0>,
                       'n_estimators': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70ac50>},
           population_size=20,

In [149]:
bestparams6k = evolved_estimator.best_params_
bestparams6k

{'min_weight_fraction_leaf': 0.055915906832559294,
 'bootstrap': False,
 'max_depth': 12,
 'max_leaf_nodes': 13,
 'n_estimators': 245}

In [150]:
rfc = RandomForestClassifier(**bestparams6k)

In [151]:
rfc.fit(train6k, ytrain6k)

RandomForestClassifier(bootstrap=False, max_depth=12, max_leaf_nodes=13,
                       min_weight_fraction_leaf=0.055915906832559294,
                       n_estimators=245)

In [156]:
featureimp6k = rfc.feature_importances_
RFfeatureImpValue6k = dict(zip(mainFeatures,rfc.feature_importances_))
RFfeatureImpValue6k = OrderedDict(sorted(RFfeatureImpValue6k.items(),key=operator.itemgetter(1),reverse=True))
rfcFullFeatureImportance6k = list(RFfeatureImpValue6k.keys())

In [181]:
rfcFullFeatureImportance6k = rfcFullFeatureImportance6k[:15]

## Feature Selection and Hyperparameter tuning using smaller data points - 12000 Data points

In [185]:
# Data points size=12000
train12k = df_train[:12000]

In [159]:
ytrain12k = y_train[:12000]

In [160]:
evolved_estimator.fit(train12k, ytrain12k)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.960833	1.11022e-16	0.960833   	0.960833   
1  	40    	0.960833	1.11022e-16	0.960833   	0.960833   
2  	40    	0.960833	1.11022e-16	0.960833   	0.960833   


GASearchCV(crossover_probability=<sklearn_genetic.schedules.schedulers.ExponentialAdapter object at 0x7f3ffc70ae50>,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
           estimator=RandomForestClassifier(bootstrap=False, max_depth=26,
                                            max_leaf_nodes=10,
                                            min_weight_fraction_leaf=0.18081166385002856,
                                            n_estimators=298),
           generations=2,
           mutation_probabili...
                       'max_depth': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70aa90>,
                       'max_leaf_nodes': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70a910>,
                       'min_weight_fraction_leaf': <sklearn_genetic.space.space.Continuous object at 0x7f3ffc73e7d0>,
                       'n_estimators': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70ac50>},
           population_size=20,

In [162]:
bestparams12k = evolved_estimator.best_params_
bestparams12k

{'min_weight_fraction_leaf': 0.18081166385002856,
 'bootstrap': False,
 'max_depth': 26,
 'max_leaf_nodes': 10,
 'n_estimators': 298}

In [163]:
rfc = RandomForestClassifier(**bestparams12k)

In [164]:
rfc.fit(train12k, ytrain12k)

RandomForestClassifier(bootstrap=False, max_depth=26, max_leaf_nodes=10,
                       min_weight_fraction_leaf=0.18081166385002856,
                       n_estimators=298)

In [165]:
featureimp12k = rfc.feature_importances_
RFfeatureImpValue12k = dict(zip(mainFeatures,rfc.feature_importances_))
RFfeatureImpValue12k = OrderedDict(sorted(RFfeatureImpValue12k.items(),key=operator.itemgetter(1),reverse=True))
rfcFullFeatureImportance12k = list(RFfeatureImpValue12k.keys())

In [180]:
rfcFullFeatureImportance12k = rfcFullFeatureImportance12k[:15]

## Feature Selection and Hyperparameter tuning using smaller data points - 18000 Data points

In [186]:
# Data points size=18000
train18k = df_train[:18000]

In [168]:
ytrain18k = y_train[:18000]

In [169]:
evolved_estimator.fit(train18k, ytrain18k)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.959444	1.11022e-16	0.959444   	0.959444   
1  	40    	0.959444	1.11022e-16	0.959444   	0.959444   
2  	40    	0.959444	1.11022e-16	0.959444   	0.959444   


GASearchCV(crossover_probability=<sklearn_genetic.schedules.schedulers.ExponentialAdapter object at 0x7f3ffc70ae50>,
           cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=True),
           estimator=RandomForestClassifier(bootstrap=False, max_depth=10,
                                            max_leaf_nodes=20,
                                            min_weight_fraction_leaf=0.02070717958400921,
                                            n_estimators=197),
           generations=2,
           mutation_probabili...
                       'max_depth': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70aa90>,
                       'max_leaf_nodes': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70a910>,
                       'min_weight_fraction_leaf': <sklearn_genetic.space.space.Continuous object at 0x7f3ffc73e7d0>,
                       'n_estimators': <sklearn_genetic.space.space.Integer object at 0x7f3ffc70ac50>},
           population_size=20,

In [170]:
bestparams18k = evolved_estimator.best_params_
bestparams18k

{'min_weight_fraction_leaf': 0.02070717958400921,
 'bootstrap': False,
 'max_depth': 10,
 'max_leaf_nodes': 20,
 'n_estimators': 197}

In [171]:
rfc = RandomForestClassifier(**bestparams18k)

In [172]:
rfc.fit(train18k, ytrain18k)

RandomForestClassifier(bootstrap=False, max_depth=10, max_leaf_nodes=20,
                       min_weight_fraction_leaf=0.02070717958400921,
                       n_estimators=197)

In [173]:
featureimp18k = rfc.feature_importances_
RFfeatureImpValue18k = dict(zip(mainFeatures,rfc.feature_importances_))
RFfeatureImpValue18k = OrderedDict(sorted(RFfeatureImpValue18k.items(),key=operator.itemgetter(1),reverse=True))
rfcFullFeatureImportance18k = list(RFfeatureImpValue18k.keys())

In [179]:
rfcFullFeatureImportance18k = rfcFullFeatureImportance18k[:25]

## Common Features between 6k, 12k and 18k data points

In [182]:
commonfinalk = list(set(rfcFullFeatureImportance6k) & set(rfcFullFeatureImportance12k) & set(rfcFullFeatureImportance18k))

In [183]:
commonfinalk

['saldo_var30',
 'num_var35',
 'var38',
 'saldo_var5',
 'saldo_var42',
 'var15',
 'saldo_medio_var5_ult3',
 'num_meses_var5_ult3',
 'ind_var30',
 'num_var4',
 'num_var5',
 'num_var30',
 'saldo_medio_var5_hace2']

## Train-Test Split

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(df_train, y_train, test_size=0.33, random_state=42)

In [11]:
originalFeatures = list(X_train.columns.values)
originalFeatures

['ID',
 'var3',
 'var15',
 'imp_ent_var16_ult1',
 'imp_op_var39_comer_ult1',
 'imp_op_var39_comer_ult3',
 'imp_op_var40_comer_ult1',
 'imp_op_var40_comer_ult3',
 'imp_op_var40_efect_ult1',
 'imp_op_var40_efect_ult3',
 'imp_op_var40_ult1',
 'imp_op_var41_comer_ult1',
 'imp_op_var41_comer_ult3',
 'imp_op_var41_efect_ult1',
 'imp_op_var41_efect_ult3',
 'imp_op_var41_ult1',
 'imp_op_var39_efect_ult1',
 'imp_op_var39_efect_ult3',
 'imp_op_var39_ult1',
 'imp_sal_var16_ult1',
 'ind_var1_0',
 'ind_var1',
 'ind_var2_0',
 'ind_var2',
 'ind_var5_0',
 'ind_var5',
 'ind_var6_0',
 'ind_var6',
 'ind_var8_0',
 'ind_var8',
 'ind_var12_0',
 'ind_var12',
 'ind_var13_0',
 'ind_var13_corto_0',
 'ind_var13_corto',
 'ind_var13_largo_0',
 'ind_var13_largo',
 'ind_var13_medio_0',
 'ind_var13_medio',
 'ind_var13',
 'ind_var14_0',
 'ind_var14',
 'ind_var17_0',
 'ind_var17',
 'ind_var18_0',
 'ind_var18',
 'ind_var19',
 'ind_var20_0',
 'ind_var20',
 'ind_var24_0',
 'ind_var24',
 'ind_var25_cte',
 'ind_var26_0',
 '

## Random Forest feature selection with Genetic Algorithm

In [12]:
rfc = RandomForestClassifier()
params = {'min_weight_fraction_leaf': Continuous(0.01, 0.5, distribution='log-uniform'),
          'bootstrap': Categorical([True, False]),
          'max_depth': Integer(2, 30),
          'max_leaf_nodes': Integer(2, 35),
          'n_estimators': Integer(100, 300)}

In [13]:
mutation_adapter = ExponentialAdapter(initial_value=0.8, end_value=0.2, adaptive_rate=0.1)
crossover_adapter = ExponentialAdapter(initial_value=0.2, end_value=0.8, adaptive_rate=0.1)

In [14]:
cv = StratifiedKFold(n_splits=3, shuffle=True)

In [15]:
evolved_estimator = GASearchCV(estimator=rfc,
                               cv=cv,
                               scoring='accuracy',
                               population_size=20,
                               generations=2,
                               mutation_probability=mutation_adapter,
                               crossover_probability=crossover_adapter,
                               param_grid=params,
                               n_jobs=-1)

In [16]:
rfc.fit(X_train, Y_train)

RandomForestClassifier()

In [17]:
RFfeatureImpValue = dict(zip(originalFeatures,rfc.feature_importances_))
RFfeatureImpValue = OrderedDict(sorted(RFfeatureImpValue.items(),key=operator.itemgetter(1),reverse=True))
rfcFullFeatureImportance = list(RFfeatureImpValue.keys())

In [18]:
RFfeatureImpValue

OrderedDict([('ID', 0.2455405357084641),
             ('var38', 0.20480214987717144),
             ('var15', 0.15301987118247),
             ('saldo_medio_var5_ult3', 0.02311917736617181),
             ('saldo_medio_var5_hace3', 0.021932113574790663),
             ('num_var45_ult3', 0.0191689577469822),
             ('num_var45_hace3', 0.015306776681034867),
             ('saldo_var30', 0.014883008276719224),
             ('saldo_medio_var5_hace2', 0.01401041438520931),
             ('num_var22_ult3', 0.013647862505574912),
             ('saldo_var42', 0.013198439475944148),
             ('num_var45_hace2', 0.012888602513410143),
             ('saldo_medio_var5_ult1', 0.012273564793723222),
             ('saldo_var5', 0.010946468270784222),
             ('num_var45_ult1', 0.009739657900870089),
             ('num_var22_hace3', 0.009237403251693931),
             ('num_med_var45_ult3', 0.009235604168771134),
             ('num_var22_hace2', 0.009024087340504574),
             ('var36', 

In [19]:
# Top 25 Features
rfcFeatureImportance25 = rfcFullFeatureImportance[:25]
print(rfcFeatureImportance25)

['ID', 'var38', 'var15', 'saldo_medio_var5_ult3', 'saldo_medio_var5_hace3', 'num_var45_ult3', 'num_var45_hace3', 'saldo_var30', 'saldo_medio_var5_hace2', 'num_var22_ult3', 'saldo_var42', 'num_var45_hace2', 'saldo_medio_var5_ult1', 'saldo_var5', 'num_var45_ult1', 'num_var22_hace3', 'num_med_var45_ult3', 'num_var22_hace2', 'var36', 'num_var22_ult1', 'num_meses_var39_vig_ult3', 'num_med_var22_ult3', 'num_meses_var5_ult3', 'imp_op_var41_ult1', 'num_var35']


In [20]:
# Top 50 Features
rfcFeatureImportance50 = rfcFullFeatureImportance[:50]
print(rfcFeatureImportance50)

['ID', 'var38', 'var15', 'saldo_medio_var5_ult3', 'saldo_medio_var5_hace3', 'num_var45_ult3', 'num_var45_hace3', 'saldo_var30', 'saldo_medio_var5_hace2', 'num_var22_ult3', 'saldo_var42', 'num_var45_hace2', 'saldo_medio_var5_ult1', 'saldo_var5', 'num_var45_ult1', 'num_var22_hace3', 'num_med_var45_ult3', 'num_var22_hace2', 'var36', 'num_var22_ult1', 'num_meses_var39_vig_ult3', 'num_med_var22_ult3', 'num_meses_var5_ult3', 'imp_op_var41_ult1', 'num_var35', 'imp_op_var39_ult1', 'imp_op_var41_comer_ult3', 'imp_op_var41_efect_ult3', 'imp_op_var39_comer_ult3', 'imp_op_var39_efect_ult3', 'num_var4', 'imp_trans_var37_ult1', 'num_op_var41_ult3', 'imp_op_var41_comer_ult1', 'var3', 'imp_op_var39_comer_ult1', 'num_var30', 'num_op_var39_ult3', 'saldo_var37', 'imp_ent_var16_ult1', 'imp_op_var41_efect_ult1', 'imp_op_var39_efect_ult1', 'num_op_var39_comer_ult3', 'num_op_var41_comer_ult3', 'num_op_var39_hace2', 'num_op_var39_ult1', 'num_op_var41_ult1', 'num_op_var41_hace2', 'imp_var43_emit_ult1', 'num_op

## Random Forest with top 25 Features

In [21]:
rfk25_train = pd.DataFrame()
for i in rfcFeatureImportance25:
  rfk25_train[i] = df_train[i]
rfk25_train

Unnamed: 0,ID,var38,var15,saldo_medio_var5_ult3,saldo_medio_var5_hace3,num_var45_ult3,num_var45_hace3,saldo_var30,saldo_medio_var5_hace2,num_var22_ult3,...,num_var22_hace3,num_med_var45_ult3,num_var22_hace2,var36,num_var22_ult1,num_meses_var39_vig_ult3,num_med_var22_ult3,num_meses_var5_ult3,imp_op_var41_ult1,num_var35
0,1,39205.170,23,0.00,0.00,0,0,0.00,0.00,0,...,0,0,0,99,0,2,0,0,0.0,0
1,3,49278.030,34,0.00,88.89,0,0,300.00,0.00,0,...,0,0,0,3,0,2,0,1,0.0,3
2,4,67333.770,23,2.07,0.18,0,0,3.00,3.00,0,...,0,0,0,99,0,1,0,3,0.0,3
3,8,64007.970,37,138.84,0.00,48,3,70.62,186.09,3,...,0,15,3,2,0,1,0,2,195.0,9
4,10,117310.979,39,13501.47,0.30,0,0,135003.00,3.00,9,...,0,0,3,1,6,2,3,3,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,151829,60926.490,48,0.00,0.00,0,0,0.00,0.00,0,...,0,0,0,99,0,2,0,0,0.0,0
76016,151830,118634.520,39,0.00,0.00,48,3,48191.22,130.65,24,...,0,15,24,2,0,2,6,1,0.0,3
76017,151835,74028.150,23,3.00,0.00,0,0,3.00,3.00,0,...,0,0,0,99,0,1,0,2,0.0,3
76018,151836,84278.160,25,2.58,1.74,0,0,3.00,3.00,0,...,0,0,0,99,0,2,0,3,0.0,3


In [22]:
RF25_X_train, RF25_X_test, RF25_Y_train, RF25_Y_test = train_test_split(rfk25_train, y_train, test_size=0.33, random_state=42)

In [23]:
evolved_estimator.fit(RF25_X_train, RF25_Y_train)
print('Best Parameters:',evolved_estimator.best_params_)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.960399	0          	0.960399   	0.960399   
1  	40    	0.960399	0          	0.960399   	0.960399   
2  	40    	0.960399	0          	0.960399   	0.960399   
Best Parameters: {'min_weight_fraction_leaf': 0.040629958355022186, 'bootstrap': True, 'max_depth': 17, 'max_leaf_nodes': 21, 'n_estimators': 300}


In [24]:
print("Stats achieved in each generation: ", evolved_estimator.history)
print("Best k solutions: ", evolved_estimator.hof)

Stats achieved in each generation:  {'gen': [0, 1, 2], 'fitness': [0.9603989558455023, 0.9603989558455023, 0.9603989558455023], 'fitness_std': [0.0, 0.0, 0.0], 'fitness_max': [0.9603989558455023, 0.9603989558455023, 0.9603989558455023], 'fitness_min': [0.9603989558455023, 0.9603989558455023, 0.9603989558455023]}
Best k solutions:  {0: {'min_weight_fraction_leaf': 0.040629958355022186, 'bootstrap': True, 'max_depth': 17, 'max_leaf_nodes': 21, 'n_estimators': 300}}


In [25]:
rfk25_bestParams = evolved_estimator.best_params_
rfk25_bestParams

{'min_weight_fraction_leaf': 0.040629958355022186,
 'bootstrap': True,
 'max_depth': 17,
 'max_leaf_nodes': 21,
 'n_estimators': 300}

In [26]:
rfk25 = RandomForestClassifier(**rfk25_bestParams)

In [27]:
rfk25.fit(RF25_X_train, RF25_Y_train)

RandomForestClassifier(max_depth=17, max_leaf_nodes=21,
                       min_weight_fraction_leaf=0.040629958355022186,
                       n_estimators=300)

In [28]:
# Train Accuracy

RF25_X_Pred = rfk25.predict(RF25_X_train)
print(accuracy_score(RF25_Y_train, RF25_X_Pred))

0.9603989554905464


In [29]:
# Test Accuracy

RF25_Y_Pred = rfk25.predict(RF25_X_test)
print(accuracy_score(RF25_Y_test, RF25_Y_Pred))

0.9604974688085463


## Random Forest with top 50 Features

In [30]:
rfk50_train = pd.DataFrame()
for i in rfcFeatureImportance50:
  rfk50_train[i] = df_train[i]
rfk50_train

Unnamed: 0,ID,var38,var15,saldo_medio_var5_ult3,saldo_medio_var5_hace3,num_var45_ult3,num_var45_hace3,saldo_var30,saldo_medio_var5_hace2,num_var22_ult3,...,imp_op_var41_efect_ult1,imp_op_var39_efect_ult1,num_op_var39_comer_ult3,num_op_var41_comer_ult3,num_op_var39_hace2,num_op_var39_ult1,num_op_var41_ult1,num_op_var41_hace2,imp_var43_emit_ult1,num_op_var41_efect_ult3
0,1,39205.170,23,0.00,0.00,0,0,0.00,0.00,0,...,0.0,0.0,0,0,0,0,0,0,0.0,0
1,3,49278.030,34,0.00,88.89,0,0,300.00,0.00,0,...,0.0,0.0,0,0,0,0,0,0,0.0,0
2,4,67333.770,23,2.07,0.18,0,0,3.00,3.00,0,...,0.0,0.0,0,0,0,0,0,0,0.0,0
3,8,64007.970,37,138.84,0.00,48,3,70.62,186.09,3,...,0.0,0.0,9,9,0,9,9,0,0.0,0
4,10,117310.979,39,13501.47,0.30,0,0,135003.00,3.00,9,...,0.0,0.0,0,0,0,0,0,0,135003.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,151829,60926.490,48,0.00,0.00,0,0,0.00,0.00,0,...,0.0,0.0,0,0,0,0,0,0,0.0,0
76016,151830,118634.520,39,0.00,0.00,48,3,48191.22,130.65,24,...,0.0,0.0,0,0,0,0,0,0,0.0,0
76017,151835,74028.150,23,3.00,0.00,0,0,3.00,3.00,0,...,0.0,0.0,0,0,0,0,0,0,0.0,0
76018,151836,84278.160,25,2.58,1.74,0,0,3.00,3.00,0,...,0.0,0.0,0,0,0,0,0,0,0.0,0


In [31]:
RF50_X_train, RF50_X_test, RF50_Y_train, RF50_Y_test = train_test_split(rfk50_train, y_train, test_size=0.33, random_state=42)

In [32]:
evolved_estimator.fit(RF50_X_train, RF50_Y_train)
print('Best Parameters:',evolved_estimator.best_params_)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.960399	0          	0.960399   	0.960399   
1  	40    	0.960399	0          	0.960399   	0.960399   
2  	40    	0.960399	0          	0.960399   	0.960399   
Best Parameters: {'min_weight_fraction_leaf': 0.17720174580896128, 'bootstrap': True, 'max_depth': 29, 'max_leaf_nodes': 30, 'n_estimators': 287}


In [33]:
print("Stats achieved in each generation: ", evolved_estimator.history)
print("Best k solutions: ", evolved_estimator.hof)

Stats achieved in each generation:  {'gen': [0, 1, 2], 'fitness': [0.9603989558455023, 0.9603989558455023, 0.9603989558455023], 'fitness_std': [0.0, 0.0, 0.0], 'fitness_max': [0.9603989558455023, 0.9603989558455023, 0.9603989558455023], 'fitness_min': [0.9603989558455023, 0.9603989558455023, 0.9603989558455023]}
Best k solutions:  {0: {'min_weight_fraction_leaf': 0.17720174580896128, 'bootstrap': True, 'max_depth': 29, 'max_leaf_nodes': 30, 'n_estimators': 287}}


In [34]:
rfk50_bestParams = evolved_estimator.best_params_
rfk50_bestParams

{'min_weight_fraction_leaf': 0.17720174580896128,
 'bootstrap': True,
 'max_depth': 29,
 'max_leaf_nodes': 30,
 'n_estimators': 287}

In [35]:
rfk50 = RandomForestClassifier(**rfk50_bestParams)

In [36]:
rfk50.fit(RF50_X_train, RF50_Y_train)

RandomForestClassifier(max_depth=29, max_leaf_nodes=30,
                       min_weight_fraction_leaf=0.17720174580896128,
                       n_estimators=287)

In [37]:
# Train Accuracy

RF50_X_Pred = rfk50.predict(RF50_X_train)
print(accuracy_score(RF50_Y_train, RF50_X_Pred))

0.9603989554905464


In [38]:
# Test Accuracy

RF50_Y_Pred = rfk50.predict(RF50_X_test)
print(accuracy_score(RF50_Y_test, RF50_Y_Pred))

0.9604974688085463


## Decision Tree feature selection with Genetic Algorithm

In [39]:
dtc = DecisionTreeClassifier()
params = {
    "criterion": Categorical(['gini','entropy']),
    "max_depth": Integer(1,10),
    "min_samples_split": Integer(1,10),
    "min_samples_leaf": Integer(1,5),
}

In [40]:
mutation_adapter = ExponentialAdapter(initial_value=0.8, end_value=0.2, adaptive_rate=0.1)
crossover_adapter = ExponentialAdapter(initial_value=0.2, end_value=0.8, adaptive_rate=0.1)

In [41]:
cv = StratifiedKFold(n_splits=3, shuffle=True)

In [42]:
evolved_estimator1 = GASearchCV(estimator=dtc,
                               cv=cv,
                               scoring='accuracy',
                               population_size=15,
                               generations=2,
                               mutation_probability=mutation_adapter,
                               crossover_probability=crossover_adapter,
                               param_grid=params,
                               n_jobs=-1)

In [43]:
dtc.fit(X_train, Y_train)

DecisionTreeClassifier()

In [44]:
DTfeatureImpValue = dict(zip(originalFeatures,dtc.feature_importances_))
DTfeatureImpValue = dict(sorted(DTfeatureImpValue.items(),key=operator.itemgetter(1),reverse=True))
dtcFullFeatureImportance = list(DTfeatureImpValue.keys())

In [45]:
# Top 25 Features
dtcFeatureImportance25 = dtcFullFeatureImportance[:25]
print(dtcFeatureImportance25)

['ID', 'var38', 'var15', 'saldo_var30', 'saldo_medio_var5_ult3', 'saldo_medio_var5_hace3', 'num_var45_ult3', 'saldo_medio_var5_hace2', 'num_var45_hace3', 'num_var45_hace2', 'num_var22_ult3', 'num_var22_hace3', 'num_var45_ult1', 'num_meses_var39_vig_ult3', 'num_var22_hace2', 'var36', 'saldo_var5', 'saldo_var42', 'saldo_medio_var5_ult1', 'num_med_var45_ult3', 'num_var41_0', 'var3', 'imp_op_var39_efect_ult1', 'num_var39_0', 'num_var22_ult1']


In [46]:
# Top 50 Features
dtcFeatureImportance50 = dtcFullFeatureImportance[:50]
print(dtcFeatureImportance50)

['ID', 'var38', 'var15', 'saldo_var30', 'saldo_medio_var5_ult3', 'saldo_medio_var5_hace3', 'num_var45_ult3', 'saldo_medio_var5_hace2', 'num_var45_hace3', 'num_var45_hace2', 'num_var22_ult3', 'num_var22_hace3', 'num_var45_ult1', 'num_meses_var39_vig_ult3', 'num_var22_hace2', 'var36', 'saldo_var5', 'saldo_var42', 'saldo_medio_var5_ult1', 'num_med_var45_ult3', 'num_var41_0', 'var3', 'imp_op_var39_efect_ult1', 'num_var39_0', 'num_var22_ult1', 'num_op_var39_ult1', 'imp_var43_emit_ult1', 'saldo_medio_var8_ult1', 'imp_op_var39_comer_ult3', 'ind_var41_0', 'imp_ent_var16_ult1', 'saldo_var37', 'ind_var39_0', 'num_op_var39_hace2', 'num_op_var39_comer_ult1', 'imp_op_var39_comer_ult1', 'ind_var8_0', 'imp_trans_var37_ult1', 'num_var37_med_ult2', 'num_meses_var5_ult3', 'imp_op_var41_efect_ult3', 'num_med_var22_ult3', 'imp_op_var39_efect_ult3', 'imp_op_var41_efect_ult1', 'imp_op_var41_comer_ult1', 'num_var5_0', 'imp_op_var39_ult1', 'num_var35', 'num_op_var41_hace2', 'num_meses_var8_ult3']


## Decision Tree with top 25 Features

In [187]:
dtc25_train = pd.DataFrame()
for i in dtcFeatureImportance25:
  dtc25_train[i] = df_train[i]

In [48]:
DT25_X_train, DT25_X_test, DT25_Y_train, DT25_Y_test = train_test_split(dtc25_train, y_train, test_size=0.33, random_state=42)

In [49]:
evolved_estimator1.fit(DT25_X_train, DT25_Y_train)
print('Best Parameters:',evolved_estimator1.best_params_)

gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	15    	nan    	nan        	nan        	nan        
1  	30    	nan    	nan        	nan        	nan        
2  	30    	nan    	nan        	nan        	nan        
Best Parameters: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 4}


In [50]:
print("Stats achieved in each generation: ", evolved_estimator1.history)
print("Best k solutions: ", evolved_estimator1.hof)

Stats achieved in each generation:  {'gen': [0, 1, 2], 'fitness': [nan, nan, nan], 'fitness_std': [nan, nan, nan], 'fitness_max': [nan, nan, nan], 'fitness_min': [nan, nan, nan]}
Best k solutions:  {0: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 4}}


In [51]:
dtk25_bestParams = evolved_estimator1.best_params_
dtk25_bestParams

{'criterion': 'gini',
 'max_depth': 4,
 'min_samples_split': 5,
 'min_samples_leaf': 4}

In [52]:
dtc25 = DecisionTreeClassifier(**dtk25_bestParams)

In [53]:
dtc25.fit(DT25_X_train, DT25_Y_train)

DecisionTreeClassifier(max_depth=4, min_samples_leaf=4, min_samples_split=5)

In [54]:
# Train Accuracy

DT25_X_Pred = dtc25.predict(DT25_X_train)
print(accuracy_score(DT25_Y_train, DT25_X_Pred))

0.9604578563995838


In [55]:
# Test Accuracy

DT25_Y_Pred = dtc25.predict(DT25_X_test)
print(accuracy_score(DT25_Y_test, DT25_Y_Pred))

0.9605373300912823


## Decision Tree with top 50 Features

In [188]:
dtc50_train = pd.DataFrame()
for i in dtcFeatureImportance50:
  dtc50_train[i] = df_train[i]

In [57]:
DT50_X_train, DT50_X_test, DT50_Y_train, DT50_Y_test = train_test_split(dtc50_train, y_train, test_size=0.33, random_state=42)

In [58]:
evolved_estimator1.fit(DT50_X_train, DT50_Y_train)
print('Best Parameters:',evolved_estimator1.best_params_)

gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	15    	0.95993	0.00109557 	0.960399   	0.95606    
1  	30    	0.960399	2.88472e-10	0.960399   	0.960399   
2  	30    	0.960399	1.11022e-16	0.960399   	0.960399   
Best Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 5}


In [59]:
print("Stats achieved in each generation: ", evolved_estimator1.history)
print("Best k solutions: ", evolved_estimator1.hof)

Stats achieved in each generation:  {'gen': [0, 1, 2], 'fitness': [0.9599303674189875, 0.9603989559225996, 0.9603989558455022], 'fitness_std': [0.0010955686987732711, 2.8847223186370636e-10, 1.1102230246251565e-16], 'fitness_max': [0.960398957001964, 0.960398957001964, 0.9603989558455023], 'fitness_min': [0.9560599299194829, 0.9603989558455023, 0.9603989558455023]}
Best k solutions:  {0: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 5}}


In [60]:
dtk50_bestParams = evolved_estimator1.best_params_
dtk50_bestParams

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_split': 7,
 'min_samples_leaf': 5}

In [61]:
dtc50 = DecisionTreeClassifier(**dtk50_bestParams)

In [62]:
dtc50.fit(DT50_X_train, DT50_Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5,
                       min_samples_split=7)

In [63]:
# Train Accuracy

DT50_X_Pred = dtc50.predict(DT50_X_train)
print(accuracy_score(DT50_Y_train, DT50_X_Pred))

0.9604185891268922


In [64]:
# Test Accuracy

DT50_Y_Pred = dtc50.predict(DT50_X_test)
print(accuracy_score(DT50_Y_test, DT50_Y_Pred))

0.9603380236776019


## Select K-Best Feature selection k=25

In [65]:
selector = SelectKBest(score_func=f_classif, k=25)
Selectk25_train = selector.fit_transform(df_train, y_train)

In [66]:
Selectk25_train.shape

(76020, 25)

In [67]:
selected_featuresk25 = list(df_train.columns[(selector.get_support())])
selected_featuresk25

['var15',
 'ind_var5',
 'ind_var8_0',
 'ind_var12_0',
 'ind_var12',
 'ind_var13_0',
 'ind_var13_corto_0',
 'ind_var13_corto',
 'ind_var13',
 'ind_var30',
 'ind_var39_0',
 'num_var4',
 'num_var5',
 'num_var8_0',
 'num_var13_0',
 'num_var13_corto_0',
 'num_var13_corto',
 'num_var13',
 'num_var30',
 'num_var35',
 'num_var42',
 'saldo_var30',
 'var36',
 'num_meses_var5_ult3',
 'num_meses_var13_corto_ult3']

## Select K-Best Feature selection k=50

In [68]:
selector = SelectKBest(score_func=f_classif, k=50)
Selectk50_train = selector.fit_transform(df_train, y_train)

In [69]:
Selectk50_train.shape

(76020, 50)

In [70]:
selected_featuresk50 = list(df_train.columns[(selector.get_support())])
selected_featuresk50

['var15',
 'imp_op_var41_efect_ult1',
 'imp_op_var41_ult1',
 'imp_op_var39_efect_ult1',
 'imp_op_var39_ult1',
 'ind_var5_0',
 'ind_var5',
 'ind_var8_0',
 'ind_var8',
 'ind_var12_0',
 'ind_var12',
 'ind_var13_0',
 'ind_var13_corto_0',
 'ind_var13_corto',
 'ind_var13',
 'ind_var24_0',
 'ind_var24',
 'ind_var30',
 'ind_var39_0',
 'ind_var41_0',
 'num_var4',
 'num_var5_0',
 'num_var5',
 'num_var8_0',
 'num_var8',
 'num_var12_0',
 'num_var12',
 'num_var13_0',
 'num_var13_corto_0',
 'num_var13_corto',
 'num_var13',
 'num_var24_0',
 'num_var24',
 'num_var30_0',
 'num_var30',
 'num_var35',
 'num_var39_0',
 'num_var41_0',
 'num_var42',
 'saldo_var13_corto',
 'saldo_var13',
 'saldo_var30',
 'var36',
 'num_aport_var13_hace3',
 'num_var22_ult1',
 'num_meses_var5_ult3',
 'num_meses_var8_ult3',
 'num_meses_var12_ult3',
 'num_meses_var13_corto_ult3',
 'saldo_medio_var13_corto_ult1']

## Common Features between Random Forest, Decision Tree and Select K-Best where k=25

In [71]:
# rfcFeatureImportance25, dtk25_bestParams, selected_featuresk25
commonFeatures25 = list( set(rfcFeatureImportance25) | set(dtcFeatureImportance25) | set(selected_featuresk25))
print(commonFeatures25)
print(len(commonFeatures25))

['num_var8_0', 'num_var13', 'num_med_var22_ult3', 'imp_op_var39_efect_ult1', 'num_var22_ult3', 'var36', 'num_var22_hace3', 'num_var45_hace2', 'ind_var13', 'num_var13_0', 'num_meses_var39_vig_ult3', 'num_var41_0', 'saldo_medio_var5_hace3', 'ind_var13_corto', 'ind_var13_0', 'num_var22_ult1', 'ind_var39_0', 'num_meses_var13_corto_ult3', 'ind_var12_0', 'saldo_var30', 'saldo_medio_var5_ult1', 'ind_var8_0', 'var3', 'num_var39_0', 'ind_var12', 'saldo_var5', 'num_var45_ult1', 'num_var45_ult3', 'var15', 'ID', 'var38', 'saldo_medio_var5_hace2', 'num_var4', 'ind_var30', 'num_var30', 'ind_var13_corto_0', 'num_var13_corto', 'ind_var5', 'num_var35', 'num_var42', 'num_var22_hace2', 'saldo_var42', 'num_var13_corto_0', 'saldo_medio_var5_ult3', 'num_meses_var5_ult3', 'imp_op_var41_ult1', 'num_var5', 'num_med_var45_ult3', 'num_var45_hace3']
49


In [90]:
commonData25 = pd.DataFrame()
commonTest25 = pd.DataFrame()
for i in commonFeatures25:
  commonData25[i] = df_train[i]
for i in commonFeatures25:
  commonTest25[i] = df_test[i]
commonData25

Unnamed: 0,num_var8_0,num_var13,num_med_var22_ult3,imp_op_var39_efect_ult1,num_var22_ult3,var36,num_var22_hace3,num_var45_hace2,ind_var13,num_var13_0,...,num_var42,num_var22_hace2,saldo_var42,num_var13_corto_0,saldo_medio_var5_ult3,num_meses_var5_ult3,imp_op_var41_ult1,num_var5,num_med_var45_ult3,num_var45_hace3
0,0,0,0,0.0,0,99,0,0,0,0,...,0,0,0.00,0,0.00,0,0.0,0,0,0
1,0,3,0,0.0,0,3,0,0,1,3,...,0,0,0.00,3,0.00,1,0.0,0,0,0
2,0,0,0,0.0,0,99,0,0,0,0,...,3,0,3.00,0,2.07,3,0.0,3,0,0
3,0,0,0,0.0,3,2,0,27,0,0,...,3,3,70.62,0,138.84,2,195.0,3,15,3
4,0,0,3,0.0,9,1,0,0,0,0,...,3,3,135003.00,0,13501.47,3,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,0,0,0,0.0,0,99,0,0,0,0,...,0,0,0.00,0,0.00,0,0.0,0,0,0
76016,0,0,6,0.0,24,2,0,36,0,0,...,3,24,48191.22,0,0.00,1,0.0,0,15,3
76017,0,0,0,0.0,0,99,0,0,0,0,...,3,0,3.00,0,3.00,2,0.0,3,0,0
76018,0,0,0,0.0,0,99,0,0,0,0,...,3,0,3.00,0,2.58,3,0.0,3,0,0


In [91]:
commonTest25

Unnamed: 0,num_var8_0,num_var13,num_med_var22_ult3,imp_op_var39_efect_ult1,num_var22_ult3,var36,num_var22_hace3,num_var45_hace2,ind_var13,num_var13_0,...,num_var42,num_var22_hace2,saldo_var42,num_var13_corto_0,saldo_medio_var5_ult3,num_meses_var5_ult3,imp_op_var41_ult1,num_var5,num_med_var45_ult3,num_var45_hace3
0,0,0,0,0.0,3,3,3,0,0,0,...,3,0,6.0,0,4.80,3,0.0,3,0,0
1,0,0,0,0.0,3,3,3,0,0,0,...,3,0,3.0,0,2.85,3,0.0,3,0,6
2,0,0,0,60.0,0,3,0,3,0,0,...,3,0,30.0,0,66.15,3,60.0,3,3,6
3,0,0,0,0.0,0,99,0,0,0,0,...,0,0,0.0,0,0.00,0,0.0,0,0,0
4,0,0,0,0.0,0,3,0,0,0,0,...,3,0,30.0,0,21.30,3,0.0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75813,0,0,0,0.0,0,99,0,0,0,0,...,0,0,0.0,0,0.00,0,0.0,0,0,6
75814,0,0,0,0.0,0,3,0,0,0,0,...,3,0,3.0,0,2.40,3,0.0,3,0,0
75815,0,0,0,0.0,0,99,0,0,0,0,...,3,0,90.0,0,69.69,3,0.0,3,0,0
75816,0,0,0,0.0,0,3,0,0,0,0,...,3,0,3.0,0,2.64,3,0.0,3,0,0


In [76]:
Common25_X_train, Common25_X_test, Common25_Y_train, Common25_Y_test = train_test_split(commonData25, y_train, test_size=0.33, random_state=42)

## Random Forest on common features where k=25

In [77]:
evolved_estimator.fit(Common25_X_train, Common25_Y_train)
print('Best Parameters:',evolved_estimator.best_params_)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.960399	0          	0.960399   	0.960399   
1  	40    	0.960399	0          	0.960399   	0.960399   
2  	40    	0.960399	0          	0.960399   	0.960399   
Best Parameters: {'min_weight_fraction_leaf': 0.04191075201124449, 'bootstrap': False, 'max_depth': 11, 'max_leaf_nodes': 3, 'n_estimators': 152}


In [78]:
RFcommonk25_bestParams = evolved_estimator.best_params_
RFcommonk25_bestParams

{'min_weight_fraction_leaf': 0.04191075201124449,
 'bootstrap': False,
 'max_depth': 11,
 'max_leaf_nodes': 3,
 'n_estimators': 152}

In [79]:
RFcommonk25 = RandomForestClassifier(**RFcommonk25_bestParams)

In [80]:
RFcommonk25.fit(Common25_X_train, Common25_Y_train)

RandomForestClassifier(bootstrap=False, max_depth=11, max_leaf_nodes=3,
                       min_weight_fraction_leaf=0.04191075201124449,
                       n_estimators=152)

In [81]:
# Train Accuracy

RFCommon25_X_Pred = RFcommonk25.predict(Common25_X_train)
print(accuracy_score(Common25_Y_train, RFCommon25_X_Pred))

0.9603989554905464


In [82]:
# Test Accuracy

RFCommon25_Y_Pred = RFcommonk25.predict(Common25_X_test)
print(accuracy_score(Common25_Y_test, RFCommon25_Y_Pred))

0.9604974688085463


In [103]:
RF25Preds = RFcommonk25.predict(commonTest25)
Pred_clf = []

for vals in RF25Preds:
    Pred_clf.append(vals)

    
Pred_clf = pd.DataFrame(Pred_clf,columns=['TARGET'])
Pred_clf['ID'] = df_test['ID']
Pred_clf = Pred_clf[['ID', 'TARGET']]
Pred_clf.to_csv("/content/drive/MyDrive/A4-2/RFCommon25_Y_Pred.csv", index=False)

## Decision Tree on common features where k=25

In [104]:
evolved_estimator.fit(Common25_X_train, Common25_Y_train)
print('Best Parameters:',evolved_estimator.best_params_)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.960399	0          	0.960399   	0.960399   
1  	40    	0.960399	0          	0.960399   	0.960399   
2  	40    	0.960399	0          	0.960399   	0.960399   
Best Parameters: {'min_weight_fraction_leaf': 0.02610968807109814, 'bootstrap': True, 'max_depth': 5, 'max_leaf_nodes': 9, 'n_estimators': 129}


In [105]:
DTcommonk25_bestParams = evolved_estimator.best_params_
DTcommonk25_bestParams.pop('bootstrap')
DTcommonk25_bestParams

{'min_weight_fraction_leaf': 0.02610968807109814,
 'max_depth': 5,
 'max_leaf_nodes': 9,
 'n_estimators': 129}

In [107]:
DTcommonk25_bestParams.pop('n_estimators')
DTcommonk25 = DecisionTreeClassifier(**DTcommonk25_bestParams)

In [108]:
DTcommonk25.fit(Common25_X_train, Common25_Y_train)

DecisionTreeClassifier(max_depth=5, max_leaf_nodes=9,
                       min_weight_fraction_leaf=0.02610968807109814)

In [109]:
# Train Accuracy

DTCommon25_X_Pred = DTcommonk25.predict(Common25_X_train)
print(accuracy_score(Common25_Y_train, DTCommon25_X_Pred))

0.9603989554905464


In [110]:
# Test Accuracy

DTCommon25_Y_Pred = DTcommonk25.predict(Common25_X_test)
print(accuracy_score(Common25_Y_test, DTCommon25_Y_Pred))

0.9604974688085463


In [112]:
DT25Preds = DTcommonk25.predict(commonTest25)
Pred_clf = []

for vals in DT25Preds:
    Pred_clf.append(vals)

    
Pred_clf = pd.DataFrame(Pred_clf,columns=['TARGET'])
Pred_clf['ID'] = df_test['ID']
Pred_clf = Pred_clf[['ID', 'TARGET']]

Pred_clf.to_csv("/content/drive/MyDrive/A4-2/DTCommon25_Y_Pred.csv", index=False)

## Common Features between Random Forest, Decision Tree and Select K-Best where k=50

In [113]:
# rfcFeatureImportance50, dtcFeatureImportance50, selected_featuresk50
commonFeatures50 = list( set(rfcFeatureImportance50) | set(dtcFeatureImportance50) | set(selected_featuresk50))
print(commonFeatures50)
print(len(commonFeatures50))

['num_var8_0', 'num_var5_0', 'num_var22_ult3', 'var36', 'num_var45_hace2', 'ind_var13', 'num_op_var41_ult1', 'num_meses_var39_vig_ult3', 'num_var12', 'num_op_var39_ult1', 'num_op_var39_hace2', 'saldo_medio_var5_hace3', 'ind_var13_corto', 'ind_var13_0', 'num_var22_ult1', 'ind_var39_0', 'saldo_var13_corto', 'saldo_var30', 'imp_op_var41_efect_ult1', 'num_var39_0', 'num_var45_ult1', 'num_op_var41_efect_ult3', 'var38', 'saldo_medio_var5_hace2', 'ind_var30', 'num_var30', 'ind_var13_corto_0', 'num_var13_corto', 'ind_var5', 'ind_var5_0', 'imp_var43_emit_ult1', 'num_var42', 'num_var22_hace2', 'saldo_var42', 'num_var13_corto_0', 'saldo_medio_var5_ult3', 'ind_var41_0', 'num_meses_var5_ult3', 'num_op_var41_comer_ult3', 'num_var13', 'num_var12_0', 'num_var5', 'num_med_var45_ult3', 'num_var24', 'imp_op_var41_comer_ult3', 'num_med_var22_ult3', 'saldo_var37', 'imp_op_var39_efect_ult1', 'saldo_medio_var13_corto_ult1', 'imp_op_var39_ult1', 'num_var22_hace3', 'saldo_var13', 'num_var13_0', 'num_var41_0', 

In [123]:
commonData50 = pd.DataFrame()
commonTest50 = pd.DataFrame()
for i in commonFeatures50:
  commonData50[i] = df_train[i]
for i in commonFeatures50:
  commonTest50[i] = df_test[i]
commonData50

Unnamed: 0,num_var8_0,num_var5_0,num_var22_ult3,var36,num_var45_hace2,ind_var13,num_op_var41_ult1,num_meses_var39_vig_ult3,num_var12,num_op_var39_ult1,...,num_aport_var13_hace3,num_var35,imp_op_var41_comer_ult1,saldo_medio_var8_ult1,num_op_var41_ult3,imp_op_var41_ult1,imp_trans_var37_ult1,num_var37_med_ult2,num_op_var39_comer_ult1,num_var45_hace3
0,0,3,0,99,0,0,0,2,0,0,...,0,0,0.0,0.0,0,0.0,0.0,0,0,0
1,0,3,0,3,0,1,0,2,0,0,...,3,3,0.0,0.0,0,0.0,0.0,0,0,0
2,0,3,0,99,0,0,0,1,0,0,...,0,3,0.0,0.0,0,0.0,0.0,0,0,0
3,0,3,3,2,27,0,9,1,0,9,...,0,9,195.0,0.0,9,195.0,0.0,6,9,3
4,0,3,9,1,0,0,0,2,3,0,...,0,3,0.0,0.0,0,0.0,270003.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,0,3,0,99,0,0,0,2,0,0,...,0,0,0.0,0.0,0,0.0,0.0,0,0,0
76016,0,3,24,2,36,0,0,2,3,0,...,0,3,0.0,0.0,0,0.0,0.0,0,0,3
76017,0,3,0,99,0,0,0,1,0,0,...,0,3,0.0,0.0,0,0.0,0.0,0,0,0
76018,0,3,0,99,0,0,0,2,0,0,...,0,3,0.0,0.0,0,0.0,0.0,0,0,0


In [124]:
Common50_X_train, Common50_X_test, Common50_Y_train, Common50_Y_test = train_test_split(commonData50, y_train, test_size=0.33, random_state=42)

## Random Forest on common features where k=50

In [116]:
evolved_estimator.fit(Common50_X_train, Common50_Y_train)
print('Best Parameters:',evolved_estimator.best_params_)

gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.960399	0          	0.960399   	0.960399   
1  	40    	0.960399	0          	0.960399   	0.960399   
2  	40    	0.960399	0          	0.960399   	0.960399   
Best Parameters: {'min_weight_fraction_leaf': 0.02119155124615207, 'bootstrap': True, 'max_depth': 13, 'max_leaf_nodes': 12, 'n_estimators': 232}


In [117]:
RFcommonk50_bestParams = evolved_estimator.best_params_
RFcommonk50_bestParams

{'min_weight_fraction_leaf': 0.02119155124615207,
 'bootstrap': True,
 'max_depth': 13,
 'max_leaf_nodes': 12,
 'n_estimators': 232}

In [118]:
RFcommonk50 = RandomForestClassifier(**RFcommonk50_bestParams)

In [119]:
RFcommonk50.fit(Common50_X_train, Common50_Y_train)

RandomForestClassifier(max_depth=13, max_leaf_nodes=12,
                       min_weight_fraction_leaf=0.02119155124615207,
                       n_estimators=232)

In [120]:
# Train Accuracy

RFCommon50_X_Pred = RFcommonk50.predict(Common50_X_train)
print(accuracy_score(Common50_Y_train, RFCommon50_X_Pred))

0.9603989554905464


In [121]:
# Test Accuracy

RFCommon50_Y_Pred = RFcommonk50.predict(Common50_X_test)
print(accuracy_score(Common50_Y_test, RFCommon50_Y_Pred))

0.9604974688085463


In [126]:
RF50Preds = RFcommonk50.predict(commonTest50)
Pred_clf = []

for vals in RF50Preds:
    Pred_clf.append(vals)

    
Pred_clf = pd.DataFrame(Pred_clf,columns=['TARGET'])
Pred_clf['ID'] = df_test['ID']
Pred_clf = Pred_clf[['ID', 'TARGET']]

Pred_clf.to_csv("/content/drive/MyDrive/A4-2/RFCommon50_Y_Pred.csv", index=False)

## Decision Tree on common features where k=50

In [None]:
evolved_estimator1.fit(Common50_X_train, Common50_Y_train)
print('Best Parameters:',evolved_estimator1.best_params_)

In [129]:
DTcommonk50_bestParams = evolved_estimator1.best_params_
DTcommonk50_bestParams

{'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_split': 7,
 'min_samples_leaf': 5}

In [130]:
DTcommonk50 = DecisionTreeClassifier(**DTcommonk50_bestParams)

In [131]:
DTcommonk50.fit(Common50_X_train, Common50_Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5,
                       min_samples_split=7)

In [132]:
# Train Accuracy

DTCommon50_X_Pred = DTcommonk50.predict(Common50_X_train)
print(accuracy_score(Common50_Y_train, DTCommon50_X_Pred))

0.9604185891268922


In [133]:
# Test Accuracy

DTCommon50_Y_Pred = DTcommonk50.predict(Common50_X_test)
print(accuracy_score(Common50_Y_test, DTCommon50_Y_Pred))

0.9603380236776019


In [134]:
DT50Preds = DTcommonk50.predict(commonTest50)
Pred_clf = []

for vals in DT50Preds:
    Pred_clf.append(vals)
   
Pred_clf = pd.DataFrame(Pred_clf,columns=['TARGET'])
Pred_clf['ID'] = df_test['ID']
Pred_clf = Pred_clf[['ID', 'TARGET']]

Pred_clf.to_csv("/content/drive/MyDrive/A4-2/DTCommon50_Y_Pred.csv", index=False)