In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ShuffleSplit, StratifiedShuffleSplit
from sklearn.metrics import matthews_corrcoef, accuracy_score, make_scorer
from sklearn.utils import resample
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, SelectKBest, chi2

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') 

In [2]:
VALIDATION = True

train1_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Training.csv',index_col=0).T
train2_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Training.csv',index_col=0).T

test1_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Testing.csv',index_col=0).T
test2_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Testing.csv',index_col=0).T

train1 = train1_df
train2 = train2_df
test1 = test1_df
test2 = test2_df

if(VALIDATION):
    #Shuffling the dataset
    valSize = int(train1.shape[0]*0.7)
    train1 = train1.sample(frac=1).reset_index(drop=True)
    test1 = train1[valSize:]
    train1 = train1[:valSize]
    
    valSize = int(train2.shape[0]*0.7)
    train2 = train2.sample(frac=1).reset_index(drop=True)
    test2 = train2[valSize:]
    train2 = train2[:valSize]

In [3]:
print(train1.shape)
print(test1.shape)
print(train2.shape)
print(test2.shape)

(91, 22285)
(39, 22285)
(237, 54679)
(103, 54679)


In [4]:
test_cols1 = ['CO: 1','CO: 2']
test_cols2 = ['CO: 3','CO: 4','CO: 5','CO: 6']
test_cols = ['CO: 1','CO: 2','CO: 3','CO: 4','CO: 5','CO: 6']

In [5]:
models = [RandomForestClassifier(n_jobs=-1, random_state=0),
          DecisionTreeClassifier(random_state=0),]
#           AdaBoostClassifier(random_state=0),]


def feat_imp(train,test,col):
    X = train.drop(test_cols, axis=1, errors='ignore')
    y = np.array(train[[col]])
    y = y.reshape(y.shape[0],)
    Xtest = test.drop(test_cols, axis=1, errors='ignore')
    
    scaler = preprocessing.StandardScaler()
    scaledX = scaler.fit_transform(X)
    
    scaledXtest = scaler.transform(Xtest)
    
    feat_set = set()
    
    for model in models:
        model.fit(X,y)
        feat_set.update(model.feature_importances_.nonzero()[0])
    
    print(col,len(feat_set))
    return feat_set

In [6]:
feats = {}
for col in test_cols1:
    feats[col] = feat_imp(train1,test1,col)
for col in test_cols2:
    feats[col] = feat_imp(train2,test2,col)

CO: 1 593
CO: 2 707
CO: 3 1537
CO: 4 1254
CO: 5 1534
CO: 6 1902


In [7]:
default_model = SVC()
def pred(train,test,col,model=default_model):
    
#     print(train.iloc[:,list(feats)].shape)
    
    X = train.iloc[:,list(feats[col])].drop(test_cols, axis=1, errors='ignore')
    #X = train.drop(test_cols, axis=1, errors='ignore')
    
    y = np.array(train[[col]])
    y = y.reshape(y.shape[0],)
    Xtest = test.iloc[:,list(feats[col])].drop(test_cols, axis=1, errors='ignore')
    #Xtest = test.drop(test_cols, axis=1, errors='ignore')
    

    if(VALIDATION): 
        ytest = test[[col]]
        ytrue.extend(list(ytest[col]))

    scaler = preprocessing.StandardScaler()
    scaledX = scaler.fit_transform(X)
    
    scaledXtest = scaler.transform(Xtest)
    
#     model = XGBClassifier()
    model.fit(scaledX,y)
      
    ypred.extend(model.predict(scaledXtest))   

    if(VALIDATION): print(col,matthews_corrcoef(ytest,model.predict(scaledXtest)),accuracy_score(ytest,model.predict(scaledXtest)))
    else: print(col)
        
    if VALIDATION: return matthews_corrcoef(ytest,model.predict(scaledXtest))
    else: return

In [8]:
ypred = []
ytrue = []
best_params = []
#pred(train1,test1,'CO: 1',params=random_grid)
pred(train1,test1,'CO: 1')

CO: 1 0.12969963148830185 0.7435897435897436


0.12969963148830185

In [9]:
#pred(train1,test1,'CO: 2',params=random_grid)
pred(train1,test1,'CO: 2')

CO: 2 0.10309826235529031 0.5641025641025641


0.10309826235529031

In [10]:
#pred(train2,test2,'CO: 3',params=random_grid)
pred(train2,test2,'CO: 3')

CO: 3 0.15301409312053074 0.8058252427184466


0.15301409312053074

In [11]:
#pred(train2,test2,'CO: 4',params=random_grid)
pred(train2,test2,'CO: 4')

CO: 4 0.0 0.8737864077669902


0.0

In [12]:
#pred(train2,test2,'CO: 5',params=random_grid)
pred(train2,test2,'CO: 5')

CO: 5 0.5300165593559218 0.7475728155339806


0.5300165593559218

In [13]:
#pred(train2,test2,'CO: 6',params=random_grid)
pred(train2,test2,'CO: 6')

CO: 6 0.23319921927262705 0.6893203883495146


0.23319921927262705

In [14]:
# if(VALIDATION):
#     print(matthews_corrcoef(ytrue,ypred))
# else:
# #     submission = pd.DataFrame(ypred,columns=['Predicted'])
# #     submission.index.name = 'Id'
#     submission = pd.read_csv('dummy_submission.csv')
#     submission.Predicted = np.array(ypred,dtype=int)
#     submission.to_csv('IGiveUp.csv',index=False)
#     print(submission.shape)

In [15]:
# models = [
#     KNeighborsClassifier(3),
#     SVC(random_state=0),
#     SVC(kernel="linear", C=0.025,random_state=0),
# #     GaussianProcessClassifier(random_state=0),
#     DecisionTreeClassifier(random_state=0),
#     RandomForestClassifier(random_state=0),
#     AdaBoostClassifier(random_state=0),
#     GaussianNB(),
#     BernoulliNB(),
# #     MultinomialNB(),
# #     QuadraticDiscriminantAnalysis(),
    
#     LogisticRegression(solver='liblinear',penalty='l1',random_state=0),
#     LogisticRegression(random_state=0),
#     XGBClassifier(random_state=0),
# ]

# names = [
#     'KNeighborsClassifier(3)',
#     'SVC(random_state=0)',
#     'SVC(kernel="linear", C=0.025,random_state=0)',
# #     GaussianProcessClassifier(random_state=0),
#     'DecisionTreeClassifier(random_state=0)',
#     'RandomForestClassifier(random_state=0)',
#     'AdaBoostClassifier(random_state=0)',
#     'GaussianNB()',
#     'BernoulliNB()',
# #     'MultinomialNB()',
# #     QuadraticDiscriminantAnalysis(),
    
#     'LogisticRegression(solver="liblinear",penalty="l1",random_state=0)',
#     'LogisticRegression(random_state=0)',
#     'XGBClassifier(random_state=0)',
# ]

# df = pd.DataFrame({'model':[],'1':[],'2':[],'3':[],'4':[],'5':[],'6':[],'total':[]})

# for i in range(len(models)):
#     model = models[i]
#     name = names[i]
#     print(model)
#     ytrue = []
#     ypred = []
    
#     m1 = pred(train1,test1,'CO: 1',model)
#     m2 = pred(train1,test1,'CO: 2',model)

#     m3 = pred(train2,test2,'CO: 3',model)
#     m4 = pred(train2,test2,'CO: 4',model)
#     m5 = pred(train2,test2,'CO: 5',model)
#     m6 = pred(train2,test2,'CO: 6',model)
    
#     tot = matthews_corrcoef(ytrue,ypred)
#     print(tot)
    
#     df.loc[len(df)] = [name,m1,m2,m3,m4,m5,m6,tot]
    
# df

In [16]:
ytrue = []
ypred = []
m1 = pred(train1,test1,'CO: 1',BernoulliNB())
m2 = pred(train1,test1,'CO: 2',GaussianNB())

m3 = pred(train2,test2,'CO: 3',LogisticRegression(random_state=0))
m4 = pred(train2,test2,'CO: 4',RandomForestClassifier(random_state=0))

# m1 = pred(train1,test1,'CO: 1',RandomForestClassifier(random_state=0))
# m2 = pred(train1,test1,'CO: 2',RandomForestClassifier(random_state=0))

# m3 = pred(train2,test2,'CO: 3',RandomForestClassifier(random_state=0))
# m4 = pred(train2,test2,'CO: 4',RandomForestClassifier(random_state=0))

m5 = pred(train2,test2,'CO: 5',RandomForestClassifier(random_state=0))
m6 = pred(train2,test2,'CO: 6',RandomForestClassifier(random_state=0))

if VALIDATION:
    tot = matthews_corrcoef(ytrue,ypred)
    print(tot)
else:
    submission = pd.read_csv('dummy_submission.csv')
    submission.Predicted = np.array(ypred,dtype=int)
    submission.to_csv('BGLRRR.csv',index=False)
    print(submission.shape)

CO: 1 0.3752393871932282 0.7692307692307693
CO: 2 0.3273268353539886 0.6666666666666666
CO: 3 0.25920549480721095 0.7669902912621359
CO: 4 0.2605250528594531 0.883495145631068
CO: 5 0.8243560325647531 0.912621359223301
CO: 6 0.17798193287014294 0.6699029126213593
0.5610497043083994
