In [27]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropCorrelatedFeatures
# possible change to pytorch  
from sklearn.neural_network import MLPClassifier

In [34]:
def loadData(path):
    df= pd.read_csv(path)
    return pd.DataFrame(df)


In [65]:
def dropDuplicatedRowAndColumn(data):
    data = data.drop_duplicates()
    return data.T.drop_duplicates().T


In [99]:
def quasiConstantRemoval(data, threshold):
    constant_filter = VarianceThreshold(threshold= threshold)
    constant_filter.fit(data)
    return pd.DataFrame(constant_filter.transform(data))  

In [100]:
def dropCorrelatedFeatures(data):
    drop_correlated = DropCorrelatedFeatures(
    variables=None, method='pearson', threshold=0.9)
    drop_correlated.fit(data)
    return pd.DataFrame(drop_correlated.transform(data))


In [81]:
def dropColumnWithName(data, name):
    column_to_drop = [c for c in data if c.startswith(name)]
    data = data.drop(columns = column_to_drop )


In [56]:
def normalise(data):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(data))
  

In [106]:
def main():
    df_train = loadData("train.csv")
    df_test = loadData("test.csv")
    
    ID_train = df_train["ID"].copy()
    ID_test = df_test["ID"].copy()
    df_train = df_train.drop(columns = "ID")
    df_test = df_test.drop(columns = "ID")
    
    df_train = dropDuplicatedRowAndColumn(df_train)
    
    y_train = df_train['TARGET'].copy()
    df_train_x = df_train.drop(columns="TARGET")
#     print(df_train_x)
    df_train_x = quasiConstantRemoval(df_train_x, 0.01)
    

    df_train_x  = dropCorrelatedFeatures(df_train_x)

   
    
    df_train_x = normalise(df_train_x)
    print(df_train_x)

In [107]:
main()

            0         1         2         3         4         5         6    \
0      0.038772 -0.805834 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
1      0.038772  0.033327 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
2      0.038772 -0.805834 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
3      0.038772  0.262189 -0.055166  0.336397  0.119616 -0.039477 -0.043503   
4      0.038772  0.414763 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
...         ...       ...       ...       ...       ...       ...       ...   
71208  0.038772  0.414763 -0.055166  0.853451  0.492882 -0.039477 -0.043503   
71209  0.038772  1.101350 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
71210  0.038772  0.414763 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
71211  0.038772 -0.805834 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
71212  0.038772 -0.653259 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   

            7         8        9    ...       151  

In [30]:
# dropping duplicate rows: PS: we dont have any 
df_train.drop_duplicates(inplace=True)
y_train = df_train['TARGET'].copy()
df_train_x = df_train.drop(columns="TARGET")


In [24]:
print("number of rows", len(df_train))
print("number of colomns", len (df_train_x.T))

number of rows 76020
number of colomns 370


In [10]:
# dropping duplicated columns PS: there are no duplicate columns
df_train_x.T.drop_duplicates(inplace=True)
df_train_x.shape

(76020, 370)

In [31]:
# dropping columns with variance 0 PS: drops 34 columns
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(df_train_x)
df_train_x = pd.DataFrame(constant_filter.transform(df_train_x))
# df_train_x = df_train_x.drop(df_train_x.var()[df.var() == 0].index.values, axis=1)

In [33]:
drop_correlated = DropCorrelatedFeatures(
    variables=None, method='pearson', threshold=0.9)
drop_correlated.fit(df_train_x)
df_train_x =drop_correlated.transform(df_train_x)
df_train_x.shape

(76020, 160)

In [5]:
# dropping columns starting with ind_* they cover the same information
column_to_drop = [name for name in df_train_x if name.startswith("ind_")]
df_train_x = df_train_x.drop(columns = column_to_drop )


In [8]:
# creating a dataframe for the features, not including id and normalising  them
# can we use MinMaxScaler

copy_x = df_train_x.iloc[:,[i for i in range(270) if i != 0]].copy()
c = list(copy_x.columns.values)
scaler = StandardScaler()
# transform data
print()
normalised_features = pd.DataFrame(scaler.fit_transform(copy_x),columns = c)





In [10]:
normalised_features

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var8_ult1,saldo_medio_var8_ult3,saldo_medio_var12_hace2,saldo_medio_var12_hace3,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3
0,0.039074,-0.788249,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,-0.123436,-0.123948,-0.138071,-0.077435,-0.152173,-0.150862
1,0.039074,0.060753,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,-0.123436,-0.123948,-0.126690,-0.060419,-0.142765,-0.141447
2,0.039074,-0.788249,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,-0.123436,-0.123948,-0.138071,-0.077435,-0.152173,-0.150862
3,0.039074,0.292298,-0.053388,0.361427,0.138158,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,-0.123436,-0.123948,-0.138071,-0.077435,-0.152173,-0.150862
4,0.039074,0.446662,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,1.727165,2.284085,-0.138071,-0.077435,-0.152173,-0.150862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,0.039074,1.141300,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,-0.123436,-0.123948,-0.138071,-0.077435,-0.152173,-0.150862
76016,0.039074,0.446662,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,0.304488,-0.066023,0.919276,0.772694,-0.138071,-0.077435,-0.152173,-0.150862
76017,0.039074,-0.788249,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,-0.123436,-0.123948,-0.138071,-0.077435,-0.152173,-0.150862
76018,0.039074,-0.633885,-0.053388,-0.213263,-0.218813,-0.038206,-0.042103,-0.013493,-0.015538,-0.033177,...,-0.056511,-0.056853,-0.105817,-0.066023,-0.123436,-0.123948,-0.138071,-0.077435,-0.152173,-0.150862
