In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ShuffleSplit, StratifiedShuffleSplit
from sklearn.metrics import matthews_corrcoef, accuracy_score, make_scorer
from sklearn.utils import resample
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, SelectKBest, chi2
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

In [2]:
# When True, roughly 30% of each training frame is held out and used in place
# of the provided testing files; when False the testing files are used as-is.
VALIDATION = False
# Seed for the hold-out shuffle so validation results can be reproduced.
SPLIT_SEED = 0


def _holdout_split(frame, train_frac=0.7, seed=SPLIT_SEED):
    """Shuffle `frame` row-wise and return `(train_part, heldout_part)`.

    The first `int(n_rows * train_frac)` shuffled rows form the training part,
    the remaining rows form the held-out part. The index is reset after the
    shuffle.
    """
    cut = int(frame.shape[0] * train_frac)
    shuffled = frame.sample(frac=1, random_state=seed).reset_index(drop=True)
    return shuffled[:cut], shuffled[cut:]


# Every CSV is read with its first column as the index and then transposed.
train1_df = pd.read_csv('PRML_Datacontest_MKN_JUL_2021/Dataset_1_Training.csv',index_col=0).T
train2_df = pd.read_csv('PRML_Datacontest_MKN_JUL_2021/Dataset_2_Training.csv',index_col=0).T

test1_df = pd.read_csv('PRML_Datacontest_MKN_JUL_2021/Dataset_1_Testing.csv',index_col=0).T
test2_df = pd.read_csv('PRML_Datacontest_MKN_JUL_2021/Dataset_2_Testing.csv',index_col=0).T

# Working names used by the rest of the notebook; the *_df frames stay untouched.
train1 = train1_df
train2 = train2_df
test1 = test1_df
test2 = test2_df

if VALIDATION:
    # Replace the testing frames with a seeded hold-out slice of the training data.
    train1, test1 = _holdout_split(train1)
    train2, test2 = _holdout_split(train2)

In [3]:
# Report (rows, columns) of each working frame, in train/test order per dataset.
for frame in (train1, test1, train2, test2):
    print(frame.shape)

(130, 22285)
(100, 22283)
(340, 54679)
(214, 54675)


In [4]:
# Target (label) column names: the first list is predicted from dataset 1,
# the second from dataset 2. The combined list is what pred() drops from the
# feature matrix.
test_cols1 = ['CO: 1', 'CO: 2']
test_cols2 = ['CO: 3', 'CO: 4', 'CO: 5', 'CO: 6']
test_cols = [*test_cols1, *test_cols2]

In [5]:
# Default search space for the random-forest pipeline. Keys use the
# `<step>__<param>` form: `model__*` addresses the "model" pipeline step and
# `select__k` the "select" step.
random_grid = {
    'model__n_estimators': [10, 25, 50, 100, 250, 500],
    'model__max_depth': [3, 5, 10, 25, None],
    'select__k': [500, 1000, 1500, 2000, 2500],
}

# Alternative search space for an SVC, kept for reference (disabled):
# random_grid = {'C': [0.25, 0.5, 1, 2.5, 5],
#                'kernel': ['linear', 'poly', 'rbf'],
#                'gamma': ['scale','auto']}

In [6]:
# Defaults are bound when `pred` is defined: the grid is whatever `random_grid`
# refers to at this point, and the estimator is a seeded random forest.
default_param = random_grid
default_model = RandomForestClassifier(random_state=0)
def pred(train,test,col,model=default_model,params=default_param):
    """Grid-search a scale -> select -> model pipeline for one target column.

    Side effects (notebook-level lists that must exist before the call):
      * `best_params` gets the winning parameter dict appended.
      * `ypred` is extended with the predictions for `test`.
      * `ytrue` is extended with `test[col]` when `VALIDATION` is true.
    The best parameters, the predicted class counts and `col` are printed;
    in validation mode the hold-out score and accuracy are printed as well.

    Parameters
    ----------
    train, test : DataFrame
        Training and prediction frames. Any column listed in `test_cols` is
        dropped from both before fitting/predicting.
    col : str
        Name of the target column in `train` (and in `test` when validating).
    model : estimator
        Final pipeline step, addressed as ``model__*`` in `params`.
    params : dict
        Parameter grid passed to `GridSearchCV`.

    Returns
    -------
    DataFrame
        The (up to) ten best rows of `cv_results_`, ordered by
        `rank_test_score`.
    """
    X = train.drop(test_cols, axis=1, errors='ignore')
    y = train[col].to_numpy()
    Xtest = test.drop(test_cols, axis=1, errors='ignore')

    if VALIDATION:
        ytest = test[col]
        ytrue.extend(list(ytest))

    # SelectKBest is used with its default scoring function; only `k` is tuned.
    pipe = Pipeline(steps=[("scale", StandardScaler()),("select", SelectKBest()), ("model", model)])

    # Kept under a separate name so the `model` argument is not shadowed.
    splitter = ShuffleSplit(n_splits=5,test_size=0.3,random_state=0)
    search = GridSearchCV(pipe,params,verbose=1,n_jobs=4,cv=splitter)
    search.fit(X,y)
    best_params.append(search.best_params_)
    print(search.best_params_)

    # Predict once and reuse: every predict() call re-runs the whole pipeline.
    predictions = search.predict(Xtest)
    print(np.unique(predictions,return_counts=True))
    ypred.extend(predictions)

    if VALIDATION:
        print(col,search.score(Xtest,ytest),accuracy_score(ytest,predictions))
    else:
        print(col)

    return pd.DataFrame(search.cv_results_).sort_values(by=['rank_test_score']).head(10)

In [7]:
# Fresh accumulators that every pred() call appends to.
ypred, ytrue, best_params = [], [], []

# Pinned hyperparameters for target 'CO: 1' (single-candidate grid).
random_grid = {
    'model__n_estimators': [25],
    'model__max_depth': [3],
    'select__k': [2500],
}
# To search the full default grid instead, call: pred(train1,test1,'CO: 1')
pred(train=train1, test=test1, col='CO: 1',
     model=RandomForestClassifier(random_state=0), params=random_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    3.5s finished


{'model__max_depth': 3, 'model__n_estimators': 25, 'select__k': 2500}
(array([0., 1.]), array([74, 26], dtype=int64))
CO: 1


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.583576,0.070572,0.159165,0.010823,3,25,2500,"{'model__max_depth': 3, 'model__n_estimators':...",0.641026,0.846154,0.846154,0.666667,0.74359,0.748718,0.086422,1


In [8]:
# Pinned hyperparameters for target 'CO: 2' (single-candidate grid).
random_grid = {
    'model__n_estimators': [50],
    'model__max_depth': [3],
    'select__k': [500],
}
# To search the full default grid instead, call: pred(train1,test1,'CO: 2')
pred(train=train1, test=test1, col='CO: 2', params=random_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    1.6s finished


{'model__max_depth': 3, 'model__n_estimators': 50, 'select__k': 500}
(array([0., 1.]), array([65, 35], dtype=int64))
CO: 2


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.647231,0.017434,0.186781,0.01995,3,50,500,"{'model__max_depth': 3, 'model__n_estimators':...",0.769231,0.74359,0.641026,0.717949,0.666667,0.707692,0.047557,1


In [9]:
# Pinned hyperparameters for target 'CO: 3' (single-candidate grid).
# Note: n_estimators=200 is not one of the values in the default grid.
random_grid = {
    'model__n_estimators': [200],
    'model__max_depth': [5],
    'select__k': [2500],
}
# To search the full default grid instead, call: pred(train2,test2,'CO: 3')
pred(train=train2, test=test2, col='CO: 3', params=random_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    5.5s finished


{'model__max_depth': 5, 'model__n_estimators': 200, 'select__k': 2500}
(array([0., 1.]), array([196,  18], dtype=int64))
CO: 3


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.70706,0.4903,0.318426,0.045525,5,200,2500,"{'model__max_depth': 5, 'model__n_estimators':...",0.666667,0.735294,0.823529,0.813725,0.745098,0.756863,0.057301,1


In [10]:
# Pinned hyperparameters for target 'CO: 4' (single-candidate grid).
random_grid = {
    'model__n_estimators': [25],
    'model__max_depth': [5],
    'select__k': [1500],
}
# To search the full default grid instead, call: pred(train2,test2,'CO: 4')
pred(train=train2, test=test2, col='CO: 4', params=random_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    3.9s finished


{'model__max_depth': 5, 'model__n_estimators': 25, 'select__k': 1500}
(array([0., 1.]), array([207,   7], dtype=int64))
CO: 4


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.826826,0.3662,0.382204,0.081694,5,25,1500,"{'model__max_depth': 5, 'model__n_estimators':...",0.803922,0.803922,0.882353,0.852941,0.872549,0.843137,0.033391,1


In [11]:
# Pinned hyperparameters for target 'CO: 5' (single-candidate grid).
random_grid = {
    'model__n_estimators': [50],
    'model__max_depth': [5],
    'select__k': [500],
}
# To search the full default grid instead, call: pred(train2,test2,'CO: 5')
pred(train=train2, test=test2, col='CO: 5', params=random_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    3.9s finished


{'model__max_depth': 5, 'model__n_estimators': 50, 'select__k': 500}
(array([0., 1.]), array([128,  86], dtype=int64))
CO: 5


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__n_estimators,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.849647,0.334896,0.33681,0.062077,5,50,500,"{'model__max_depth': 5, 'model__n_estimators':...",0.901961,0.852941,0.921569,0.911765,0.862745,0.890196,0.027311,1


In [12]:
# Target 'CO: 6' uses an SVC as the final pipeline step instead of the default
# random forest, with a single pinned candidate.
svc_grid = {
    'model__kernel': ['poly'],
    'model__C': [0.25],
    'select__k': [1500],
}
pred(train=train2, test=test2, col='CO: 6', model=SVC(random_state=0), params=svc_grid)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    3.9s finished


{'model__C': 0.25, 'model__kernel': 'poly', 'select__k': 1500}
(array([0., 1.]), array([ 15, 199], dtype=int64))
CO: 6


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__kernel,param_select__k,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.851841,0.315024,0.407531,0.069975,0.25,poly,1500,"{'model__C': 0.25, 'model__kernel': 'poly', 's...",0.607843,0.54902,0.578431,0.54902,0.637255,0.584314,0.0343,1


In [13]:
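# Full default-grid search for every target (disabled: the cells above pin a
# single hyperparameter combination per target instead).
# pred(train1,test1,'CO: 1')
# pred(train1,test1,'CO: 2')

# pred(train2,test2,'CO: 3')
# pred(train2,test2,'CO: 4')
# pred(train2,test2,'CO: 5')
# pred(train2,test2,'CO: 6')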

In [14]:
# Show the winning parameter dict recorded by each pred() call, in call order.
print(best_params)

[{'model__max_depth': 3, 'model__n_estimators': 25, 'select__k': 2500}, {'model__max_depth': 3, 'model__n_estimators': 50, 'select__k': 500}, {'model__max_depth': 5, 'model__n_estimators': 200, 'select__k': 2500}, {'model__max_depth': 5, 'model__n_estimators': 25, 'select__k': 1500}, {'model__max_depth': 5, 'model__n_estimators': 50, 'select__k': 500}, {'model__C': 0.25, 'model__kernel': 'poly', 'select__k': 1500}]


In [15]:
if VALIDATION:
    # Hold-out mode: score all accumulated predictions against the held-out labels.
    print(matthews_corrcoef(ytrue,ypred))
else:
    # Submission mode: fill the template's 'Predicted' column and save it.
    submission = pd.read_csv('dummy_submission.csv')
    if len(ypred) != len(submission):
        # ypred only grows; re-running any pred() cell without resetting the
        # accumulators leaves extra predictions behind.
        raise ValueError(
            f"ypred has {len(ypred)} entries but the submission template has "
            f"{len(submission)} rows; reset ypred and re-run each pred() cell once."
        )
    # Bracket assignment always writes a column; attribute assignment would only
    # set a Python attribute if the template had no 'Predicted' column.
    submission['Predicted'] = np.array(ypred,dtype=int)
    submission.to_csv('Final.csv',index=False)
    print(submission.shape)

(1056, 2)


In [16]:
# Load a saved predictions file; later cells compare it with this run's 'Final.csv'.
# NOTE(review): 'IGiveUp.csv' is not written anywhere in this notebook —
# presumably an earlier submission; confirm its origin.
df = pd.read_csv('IGiveUp.csv')
df

Unnamed: 0,Id,Predicted
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
1051,1051,1
1052,1052,1
1053,1053,1
1054,1054,1


In [17]:
# Class counts of `df['Predicted']` over six consecutive row ranges
# (two of 100 rows, then four of 214 rows).
# NOTE(review): presumably one range per target, laid out like Final.csv — confirm.
ref_bounds = [0, 100, 200, 414, 628, 842, 1056]
for ref_start, ref_stop in zip(ref_bounds[:-1], ref_bounds[1:]):
    print(np.unique(df['Predicted'][ref_start:ref_stop],return_counts=True))

(array([0, 1], dtype=int64), array([74, 26], dtype=int64))
(array([0, 1], dtype=int64), array([65, 35], dtype=int64))
(array([0, 1], dtype=int64), array([196,  18], dtype=int64))
(array([0, 1], dtype=int64), array([207,   7], dtype=int64))
(array([0, 1], dtype=int64), array([128,  86], dtype=int64))
(array([0, 1], dtype=int64), array([ 15, 199], dtype=int64))


In [18]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the reference predictions (`df`) against the file written by
# this run; off-diagonal zeros mean the two files agree on every row.
df1 = pd.read_csv('Final.csv')
confusion_matrix(y_true=df['Predicted'], y_pred=df1['Predicted'])

array([[685,   0],
       [  0, 371]], dtype=int64)

In [19]:
# Class counts of `df1['Predicted']` over the six consecutive row ranges that
# the pred() calls filled in order (two of 100 rows, then four of 214 rows).
final_bounds = [0, 100, 200, 414, 628, 842, 1056]
for seg_start, seg_stop in zip(final_bounds[:-1], final_bounds[1:]):
    print(np.unique(df1['Predicted'][seg_start:seg_stop],return_counts=True))

(array([0, 1], dtype=int64), array([74, 26], dtype=int64))
(array([0, 1], dtype=int64), array([65, 35], dtype=int64))
(array([0, 1], dtype=int64), array([196,  18], dtype=int64))
(array([0, 1], dtype=int64), array([207,   7], dtype=int64))
(array([0, 1], dtype=int64), array([128,  86], dtype=int64))
(array([0, 1], dtype=int64), array([ 15, 199], dtype=int64))
