In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, accuracy_score
from sklearn.utils import resample
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, SelectKBest, chi2

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

In [2]:
VALIDATION = True

train1 = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Training.csv',index_col=0).T
train2 = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Training.csv',index_col=0).T

test1 = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Testing.csv',index_col=0).T
test2 = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Testing.csv',index_col=0).T

if(VALIDATION):
    #Shuffling the dataset
    valSize = int(train1.shape[0]*0.75)
    train1 = train1.sample(frac=1).reset_index(drop=True)
    test1 = train1[valSize:]
    train1 = train1[:valSize]
    
    valSize = int(train2.shape[0]*0.75)
    train2 = train2.sample(frac=1).reset_index(drop=True)
    test2 = train2[valSize:]
    train2 = train2[:valSize]

In [3]:
print(train1.shape)
print(test1.shape)
print(train2.shape)
print(test2.shape)

(97, 22285)
(33, 22285)
(255, 54679)
(85, 54679)


In [4]:
test_cols1 = ['CO: 1','CO: 2']
test_cols2 = ['CO: 3','CO: 4','CO: 5','CO: 6']

In [5]:
#Try SMOTE later
# def upsample(train,test_cols):
#     train_data = train
#     for col in test_cols:
#         # separate minority and majority classes
#         negative = train_data[train_data[col]==0]
#         positive = train_data[train_data[col]==1]
#         # upsample minority
#         pos_upsampled = resample(positive,
#          replace=True, # sample with replacement
#          n_samples=len(negative), # match number in majority class
#          random_state=27) # reproducible results
#         # combine majority and upsampled minority
#         upsampled = pd.concat([negative, pos_upsampled])
#         train_data = upsampled
#     return train_data

# train1 = upsample(train1,test_cols1)
# train2 = upsample(train2,test_cols2)

In [6]:
scaler1 = preprocessing.StandardScaler()
scaler1.fit(train1.drop(test_cols1, axis=1, errors='ignore'))

scaler2 = preprocessing.StandardScaler()
scaler2.fit(train2.drop(test_cols2, axis=1, errors='ignore'))

StandardScaler()

In [23]:
ypred = []
ytrue = []
models = [RandomForestClassifier(n_estimators=750, n_jobs=-1, random_state=0),
          AdaBoostClassifier(),
          DecisionTreeClassifier(random_state=0),
          LogisticRegression(solver='liblinear',penalty='l1'),
          SVC(probability=True)]

#models = [XGBClassifier()]

def pred(test_cols,train,test):
    for col in test_cols:
        X = train.drop(test_cols, axis=1, errors='ignore')
        y = np.array(train[[col]])
        y = y.reshape(y.shape[0],)
        Xtest = test.drop(test_cols, axis=1, errors='ignore')
        
        if(VALIDATION): 
            ytest = test[[col]]
            ytrue.extend(list(ytest[col]))

        scaler = preprocessing.StandardScaler()
        scaledX = scaler.fit_transform(X)
        pca = PCA()
        scaledX = pca.fit_transform(scaledX)
#         selector = SelectPercentile(percentile=10)
#         scaledX = selector.fit_transform(scaledX,y)
#         print(scaledX.shape)
        
        scaledXtest = scaler.transform(Xtest)
        scaledXtest = pca.transform(scaledXtest)
#         scaledXtest = selector.transform(scaledXtest)

        #model = RandomForestClassifier(n_estimators=750, n_jobs=-1, random_state=0)
        #model = AdaBoostClassifier() #Not performing well
        #model = LogisticRegression(solver='liblinear',penalty='l1') #Fast
        #model = DecisionTreeClassifier(random_state=0) #Not performing well
        #model = XGBClassifier()
        
        stackedX = []
        stackedXtest = []
        for model in models:
            model.fit(scaledX,y)
            stackedX.append(model.predict_proba(scaledX)[:,1])
            stackedXtest.append(model.predict_proba(scaledXtest)[:,1])
            
        stackedX = np.array(stackedX).T
        stackedXtest = np.array(stackedXtest).T
        print(stackedX.shape)
        model = DecisionTreeClassifier(random_state=0)
        #model.fit(scaledX,y)
        model.fit(stackedX,y)  
        
        #ypred.extend(model.predict(scaledXtest))
        ypred.extend(model.predict(stackedXtest))
        
        #if(VALIDATION): print(col,model.score(scaledXtest,ytest))
        if(VALIDATION): print(col,matthews_corrcoef(ytest,model.predict(stackedXtest)),model.score(stackedXtest,ytest))
        else: print(col)

In [24]:
pred(test_cols1,train1,test1)

(97, 5)
CO: 1 0.1942057042751279 0.7575757575757576
(97, 5)
CO: 2 0.24479280153992708 0.6363636363636364


In [25]:
if(VALIDATION):
    print(matthews_corrcoef(ytrue,ypred))

0.2025090917415868


In [26]:
pred(test_cols2,train2,test2)

(255, 5)
CO: 3 0.23410341035473953 0.8235294117647058
(255, 5)
CO: 4 -0.03516899742266649 0.8941176470588236
(255, 5)
CO: 5 0.03669371373282348 0.5411764705882353


KeyboardInterrupt: 

In [None]:
if(VALIDATION):
    print(matthews_corrcoef(ytrue,ypred))
else:
#     submission = pd.DataFrame(ypred,columns=['Predicted'])
#     submission.index.name = 'Id'
    submission = pd.read_csv('dummy_submission.csv')
    submission.Predicted = np.array(ypred,dtype=int)
    submission.to_csv('RF2000.csv',index=False)
    print(submission.shape)