# Introduction to machine learning 2022 term project

In [1]:
# imports
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import cross_validate,cross_val_score, train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,BayesianRidge
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

In [2]:
# Read the two CSV files from the working directory into DataFrames:
# `npf` from npf_train.csv and `test` from npf_test_hidden.csv.
npf, test = (
    pd.read_csv(csv_name)
    for csv_name in ("npf_train.csv", "npf_test_hidden.csv")
)

### Data preprocessing

A binary column "class2" is added: it is "event" when "class4" is Ia, Ib or II, and "nonevent" otherwise.

In [3]:
# Lookup table for the binary label: position 0 -> "nonevent", position 1 -> "event".
class2 = np.array(["nonevent", "event"])
# Rows whose class4 is exactly "nonevent" pick position 0; every other class4
# value picks position 1, collapsing the multi-class label into two classes.
npf["class2"] = class2[np.where(npf["class4"] == "nonevent", 0, 1)]


In [4]:
# Preview the first two rows of the training frame, including the derived "class2" column.
npf.head(n=2)

Unnamed: 0,id,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,...,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std,class2
0,1,2000-01-17,Ib,False,368.771711,0.310309,368.665658,0.305127,369.371184,0.333606,...,0.250952,-0.899393,0.271648,2.492491,1.31088,0.031587,0.018122,0.000243,3.5e-05,event
1,2,2000-02-28,nonevent,False,378.197295,1.001493,378.083089,1.025472,378.671311,1.017208,...,0.31346,2.033191,0.27109,0.295937,0.177836,0.00514,0.003552,0.003658,0.00094,nonevent


In [5]:
# Hand-picked subset of the ".mean" feature columns, one name per line.
labels = [
    "CO2168.mean",
    "Glob.mean",
    "H2O168.mean",
    "NET.mean",
    "NO168.mean",
    "NOx168.mean",
    "O3168.mean",
    "Pamb0.mean",
    "PAR.mean",
    "PTG.mean",
    "RGlob.mean",
    "RHIRGA168.mean",
    "RPAR.mean",
    "SO2168.mean",
    "SWS.mean",
    "T168.mean",
    "UV_A.mean",
    "UV_B.mean",
    "CS.mean",
]

In [6]:
# Feature matrix: every column whose name ends in the literal suffix ".mean".
# The dot is escaped and the pattern anchored with "$"; an unescaped "." is a
# regex wildcard and would also select any column that merely contains
# "<any character>mean" somewhere in its name.
# Alternative feature set: `npf[labels]` restricts X to the hand-picked subset.
X = npf.filter(regex=r"\.mean$")
# Target vector: the binary "class2" label.
y = npf["class2"]

In [7]:
# Fixed seed passed to every estimator that draws random numbers, so that the
# cross-validation scores are identical after Restart Kernel -> Run All.
SEED = 42

# Display label -> unfitted estimator. Keeping both in one ordered mapping
# guarantees that a label can never drift out of step with its model, which the
# previous pair of parallel lists could not.
# NOTE(review): "dummyRegressor" and "logisticRegressor" are classifiers, and
# "GaussianNaiveBays" is a misspelling of "Bayes". The labels are left
# unchanged because they form the index of `res`.
_classifiers = {
    "dummyRegressor": DummyClassifier(),
    "logisticRegressor": LogisticRegression(solver="saga", random_state=SEED),
    "GaussianNaiveBays": GaussianNB(),
    "SVC": SVC(),
    "SVC-sigmoid": SVC(kernel="sigmoid"),
    "SVC-poly": SVC(kernel="poly"),
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
    "NearestNeighbour": KNeighborsClassifier(3),
    "GaussianProcess": GaussianProcessClassifier(1.0 * RBF(1.0), random_state=SEED),
    "QDA": QuadraticDiscriminantAnalysis(),
    "DecisionTree": DecisionTreeClassifier(max_depth=5, random_state=SEED),
}

# Same public names and ordering as before: two aligned lists plus an empty
# results frame with one row per classifier label.
Classifier_Names = list(_classifiers)
models = list(_classifiers.values())
res = pd.DataFrame(index=Classifier_Names)

In [8]:
# Score each model with 10-fold cross-validation and keep the mean of the fold
# scores. The means are collected in the order of `models`, which is the order
# of the rows of `res`, and written to the "cv" column in one assignment.
cv_means = []
for estimator in models:
    fold_scores = cross_val_score(estimator, X, y, cv=10)
    cv_means.append(fold_scores.mean())
res["cv"] = cv_means



In [9]:
# Display the results table: one row per classifier label, "cv" = mean cross-validation score.
res

Unnamed: 0,cv
dummyRegressor,0.470074
logisticRegressor,0.842831
GaussianNaiveBays,0.806383
SVC,0.773913
SVC-sigmoid,0.763182
SVC-poly,0.780389
RandomForest,0.874931
AdaBoost,0.8537
NearestNeighbour,0.803978
GaussianProcess,0.623358
