In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ShuffleSplit, StratifiedShuffleSplit
from sklearn.metrics import matthews_corrcoef, accuracy_score, make_scorer
from sklearn.utils import resample
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, SelectKBest, chi2

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') 

In [2]:
VALIDATION = True

train1_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Training.csv',index_col=0).T
train2_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Training.csv',index_col=0).T

test1_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Testing.csv',index_col=0).T
test2_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Testing.csv',index_col=0).T

train1 = train1_df
train2 = train2_df
test1 = test1_df
test2 = test2_df

if(VALIDATION):
    #Shuffling the dataset
    valSize = int(train1.shape[0]*0.7)
    train1 = train1.sample(frac=1).reset_index(drop=True)
    test1 = train1[valSize:]
    train1 = train1[:valSize]
    
    valSize = int(train2.shape[0]*0.7)
    train2 = train2.sample(frac=1).reset_index(drop=True)
    test2 = train2[valSize:]
    train2 = train2[:valSize]

In [3]:
print(train1.shape)
print(test1.shape)
print(train2.shape)
print(test2.shape)

(91, 22285)
(39, 22285)
(237, 54679)
(103, 54679)


In [4]:
test_cols1 = ['CO: 1','CO: 2']
test_cols2 = ['CO: 3','CO: 4','CO: 5','CO: 6']
test_cols = ['CO: 1','CO: 2','CO: 3','CO: 4','CO: 5','CO: 6']

In [5]:
models = [RandomForestClassifier(n_jobs=-1, random_state=0),
          DecisionTreeClassifier(random_state=0),]
#           AdaBoostClassifier(random_state=0),]


def feat_imp(train,test,col):
    X = train.drop(test_cols, axis=1, errors='ignore')
    y = np.array(train[[col]])
    y = y.reshape(y.shape[0],)
    Xtest = test.drop(test_cols, axis=1, errors='ignore')
    
    scaler = preprocessing.StandardScaler()
    scaledX = scaler.fit_transform(X)
    
    scaledXtest = scaler.transform(Xtest)
    
    feat_set = set()
    
    for model in models:
        model.fit(X,y)
        feat_set.update(model.feature_importances_.nonzero()[0])
    
    print(col,len(feat_set))
    return feat_set

In [6]:
feats = {}
for col in test_cols1:
    feats[col] = feat_imp(train1,test1,col)
for col in test_cols2:
    feats[col] = feat_imp(train2,test2,col)

CO: 1 633
CO: 2 702
CO: 3 1559
CO: 4 1247
CO: 5 1607
CO: 6 1834


In [7]:
default_model = SVC()
def pred(train,test,col,model=default_model):
    
#     print(train.iloc[:,list(feats)].shape)
    
    X = train.iloc[:,list(feats[col])].drop(test_cols, axis=1, errors='ignore')
    y = np.array(train[[col]])
    y = y.reshape(y.shape[0],)
    Xtest = test.iloc[:,list(feats[col])].drop(test_cols, axis=1, errors='ignore')

    if(VALIDATION): 
        ytest = test[[col]]
        ytrue.extend(list(ytest[col]))

    scaler = preprocessing.StandardScaler()
    scaledX = scaler.fit_transform(X)
    
    scaledXtest = scaler.transform(Xtest)
    
#     model = XGBClassifier()
    model.fit(scaledX,y)
      
    ypred.extend(model.predict(scaledXtest))   

    if(VALIDATION): print(col,matthews_corrcoef(ytest,model.predict(scaledXtest)),accuracy_score(ytest,model.predict(scaledXtest)))
    else: print(col)
        
    return matthews_corrcoef(ytest,model.predict(scaledXtest))

In [8]:
ypred = []
ytrue = []
best_params = []
#pred(train1,test1,'CO: 1',params=random_grid)
pred(train1,test1,'CO: 1')

CO: 1 0.31137995761748233 0.7948717948717948


0.31137995761748233

In [9]:
#pred(train1,test1,'CO: 2',params=random_grid)
pred(train1,test1,'CO: 2')

CO: 2 0.4258672210719905 0.6666666666666666


0.4258672210719905

In [10]:
#pred(train2,test2,'CO: 3',params=random_grid)
pred(train2,test2,'CO: 3')

CO: 3 0.17039568891083096 0.7572815533980582


0.17039568891083096

In [11]:
#pred(train2,test2,'CO: 4',params=random_grid)
pred(train2,test2,'CO: 4')

CO: 4 0.0 0.8640776699029126


0.0

In [12]:
#pred(train2,test2,'CO: 5',params=random_grid)
pred(train2,test2,'CO: 5')

CO: 5 0.648991593536552 0.8252427184466019


0.648991593536552

In [13]:
#pred(train2,test2,'CO: 6',params=random_grid)
pred(train2,test2,'CO: 6')

CO: 6 0.07667114370239701 0.5631067961165048


0.07667114370239701

In [14]:
if(VALIDATION):
    print(matthews_corrcoef(ytrue,ypred))
else:
#     submission = pd.DataFrame(ypred,columns=['Predicted'])
#     submission.index.name = 'Id'
    submission = pd.read_csv('dummy_submission.csv')
    submission.Predicted = np.array(ypred,dtype=int)
    submission.to_csv('IGiveUp.csv',index=False)
    print(submission.shape)

0.4423164051628146


In [15]:
# models = [RandomForestClassifier(n_estimators=750, n_jobs=-1, random_state=0),
#           AdaBoostClassifier(random_state=0),
#           DecisionTreeClassifier(random_state=0),
#           LogisticRegression(solver='liblinear',penalty='l1'),
#           SVC(),
#           XGBClassifier()]

models = [
    KNeighborsClassifier(3),
    SVC(random_state=0),
    SVC(kernel="linear", C=0.025,random_state=0),
#     GaussianProcessClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    AdaBoostClassifier(random_state=0),
    GaussianNB(),
#     QuadraticDiscriminantAnalysis(),
    
    LogisticRegression(solver='liblinear',penalty='l1',random_state=0),
    LogisticRegression(random_state=0),
    XGBClassifier(random_state=0),
]

names = [
    'KNeighborsClassifier(3)',
    'SVC(random_state=0)',
    'SVC(kernel="linear", C=0.025,random_state=0)',
#     GaussianProcessClassifier(random_state=0),
    'DecisionTreeClassifier(random_state=0)',
    'RandomForestClassifier(random_state=0)',
    'AdaBoostClassifier(random_state=0)',
    'GaussianNB()',
#     QuadraticDiscriminantAnalysis(),
    
    'LogisticRegression(solver="liblinear",penalty="l1",random_state=0)',
    'LogisticRegression(random_state=0)',
    'XGBClassifier(random_state=0)',
]

df = pd.DataFrame({'model':[],'1':[],'2':[],'3':[],'4':[],'5':[],'6':[],'total':[]})

for i in range(len(models)):
    model = models[i]
    name = names[i]
    print(model)
    ytrue = []
    ypred = []
    
    m1 = pred(train1,test1,'CO: 1',model)
    m2 = pred(train1,test1,'CO: 2',model)

    m3 = pred(train2,test2,'CO: 3',model)
    m4 = pred(train2,test2,'CO: 4',model)
    m5 = pred(train2,test2,'CO: 5',model)
    m6 = pred(train2,test2,'CO: 6',model)
    
    tot = matthews_corrcoef(ytrue,ypred)
    print(tot)
    
    df.loc[len(df)] = [name,m1,m2,m3,m4,m5,m6,tot]
    
df

KNeighborsClassifier(n_neighbors=3)
CO: 1 0.4753323681541725 0.8205128205128205
CO: 2 0.4147575310031266 0.6923076923076923
CO: 3 0.11455671257489745 0.7475728155339806
CO: 4 0.0 0.8640776699029126
CO: 5 0.39570075607600547 0.7087378640776699
CO: 6 -0.13462409458259064 0.47572815533980584
0.3322417128093336
SVC(random_state=0)
CO: 1 0.31137995761748233 0.7948717948717948
CO: 2 0.4258672210719905 0.6666666666666666
CO: 3 0.17039568891083096 0.7572815533980582
CO: 4 0.0 0.8640776699029126
CO: 5 0.648991593536552 0.8252427184466019
CO: 6 0.07667114370239701 0.5631067961165048
0.4423164051628146
SVC(C=0.025, kernel='linear', random_state=0)
CO: 1 0.13538259026847058 0.7435897435897436
CO: 2 0.23809523809523808 0.6153846153846154
CO: 3 0.33033552678742245 0.7766990291262136
CO: 4 0.06691003295964197 0.8446601941747572
CO: 5 0.7062141420904177 0.8543689320388349
CO: 6 0.04552946561764083 0.5436893203883495
0.42796241597638196
DecisionTreeClassifier(random_state=0)
CO: 1 0.3246172270321178 0.

Unnamed: 0,model,1,2,3,4,5,6,total
0,KNeighborsClassifier(3),0.475332,0.414758,0.114557,0.0,0.395701,-0.134624,0.332242
1,SVC(random_state=0),0.31138,0.425867,0.170396,0.0,0.648992,0.076671,0.442316
2,"SVC(kernel=""linear"", C=0.025,random_state=0)",0.135383,0.238095,0.330336,0.06691,0.706214,0.045529,0.427962
3,DecisionTreeClassifier(random_state=0),0.324617,0.284961,-0.002743,-0.100023,0.552868,0.103978,0.305285
4,RandomForestClassifier(random_state=0),0.375008,0.35169,0.165165,0.0,0.721094,0.035221,0.450551
5,AdaBoostClassifier(random_state=0),0.135383,0.040218,0.299858,0.042233,0.749366,-0.121117,0.371423
6,GaussianNB(),0.475332,0.295217,0.22367,0.29917,0.626125,0.184503,0.421551
7,"LogisticRegression(solver=""liblinear"",penalty=...",0.258809,0.545545,0.166758,0.042233,0.763904,0.197283,0.467116
8,LogisticRegression(random_state=0),0.258809,0.263932,0.30561,0.174053,0.72768,-0.018354,0.434742
9,XGBClassifier(random_state=0),0.31138,0.458486,0.248885,0.149495,0.815976,0.059202,0.486372
