In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [3]:
data_train = pd.read_csv('santander-train.csv')

In [4]:
data_test = pd.read_csv('santander-test.csv')

In [5]:
data = [data_train, data_test]
data = pd.concat(data)

In [6]:
data = data.head(20000)

In [7]:
data

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000,0.0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000,0.0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000,0.0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000,0.0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,40055,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0.0
19996,40058,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86339.040000,0.0
19997,40059,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74521.200000,0.0
19998,40065,2,73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138092.430000,0.0


In [8]:
X = data.drop(['TARGET'], axis =1)
y = data['TARGET']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =0, test_size = 0.2, stratify= y)

## Remove Constant, Quasi Constant, and Duplicated

In [10]:
constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)

X_train_quasi_filter = constant_filter.transform(X_train)
X_test_quasi_filter = constant_filter.transform(X_test)

In [11]:
X_train_T = X_train_quasi_filter.T
X_test_T = X_test_quasi_filter.T

X_train_T = pd.DataFrame(X_train_T)
X_test_T = pd.DataFrame(X_test_T)

duplicated_features = X_train_T.duplicated()


features_to_keep = [not index for index in duplicated_features] ## ambil yg bukan duplicated

X_train_unique = X_train_T[features_to_keep].T ## jgn lupa ditranspore lagi
X_test_unique = X_test_T[features_to_keep].T

In [12]:
X_train_unique.shape, X_test_unique.shape

((16000, 227), (4000, 227))

## Remove Multicolinearity Features

In [13]:
corrmat = X_train_unique.corr()

In [14]:
def get_correlation(data, threshold):
    
    corr_col = set() ## pakai set biar tetap unique data
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j]) > threshold: ## ambil correlation feature yg di atas threshold
                colname = corrmat.columns[i] ## ambil nama columnya
                corr_col.add(colname)
            
    return corr_col

corr_features = get_correlation(X_train_unique, 0.7)
print('correlated features: ', len(set(corr_features)))

correlated features:  148


In [15]:
X_train_uncorr = X_train_unique.drop(labels = corr_features, axis =1)
X_test_uncorr = X_test_unique.drop(labels = corr_features, axis =1)

In [16]:
X_train_uncorr.shape, X_test_uncorr.shape

((16000, 79), (4000, 79))

## Feature Selection Using LDA

In [19]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [20]:
lda = LDA(n_components = 1)

In [22]:
X_train_lda = lda.fit_transform(X_train_uncorr, y_train)

In [23]:
X_train_lda

array([[-0.85882526],
       [ 0.65214263],
       [-1.01475523],
       ...,
       [ 1.72799436],
       [-0.96410962],
       [-0.76560697]])

In [24]:
X_test_lda = lda.transform(X_test_uncorr) ##transform lgsg karena udah di fit

In [25]:
def run_randomForest(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators = 100, random_state = 0, n_jobs = -1 )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Accuracy on test set: ')
    print(accuracy_score(y_test, y_pred))

In [26]:
%%time 
run_randomForest(X_train, X_test, y_train, y_test)

Accuracy on test set: 
0.9585
Wall time: 1.96 s


In [27]:
%%time 
run_randomForest(X_train_lda, X_test_lda, y_train, y_test)

Accuracy on test set: 
0.93025
Wall time: 1.11 s


### Feature Reduction Using PCA

In [28]:
from sklearn.decomposition import PCA

In [29]:
pca = PCA(n_components = 2, random_state = 42)

In [30]:
pca.fit(X_test_uncorr)

PCA(n_components=2, random_state=42)

In [33]:
X_train_pca = pca.transform(X_train_uncorr)
X_test_pca = pca.transform(X_test_uncorr)

In [34]:
%%time 
run_randomForest(X_train_pca, X_test_pca, y_train, y_test)

Accuracy on test set: 
0.95925
Wall time: 876 ms


In [35]:
for component in range(1,79):
    pca = PCA(n_components = component, random_state = 42)
    pca.fit(X_test_uncorr)
    X_train_pca = pca.transform(X_train_uncorr)
    X_test_pca = pca.transform(X_test_uncorr)
    print('Selected Component: ', component)
    run_randomForest(X_train_pca, X_test_pca, y_train, y_test)
    print()

Selected Component:  1
Accuracy on test set: 
0.95925

Selected Component:  2
Accuracy on test set: 
0.95925

Selected Component:  3
Accuracy on test set: 
0.95925

Selected Component:  4
Accuracy on test set: 
0.959

Selected Component:  5
Accuracy on test set: 
0.9585

Selected Component:  6
Accuracy on test set: 
0.95875

Selected Component:  7
Accuracy on test set: 
0.95825

Selected Component:  8
Accuracy on test set: 
0.9195

Selected Component:  9
Accuracy on test set: 
0.9535

Selected Component:  10
Accuracy on test set: 
0.95625

Selected Component:  11
Accuracy on test set: 
0.95575

Selected Component:  12
Accuracy on test set: 
0.95625

Selected Component:  13
Accuracy on test set: 
0.95625

Selected Component:  14
Accuracy on test set: 
0.9565

Selected Component:  15
Accuracy on test set: 
0.9565

Selected Component:  16
Accuracy on test set: 
0.95675

Selected Component:  17
Accuracy on test set: 
0.95675

Selected Component:  18
Accuracy on test set: 
0.9575

Selected 