# **Data Import**

In [1]:
import gzip
import numpy as np
import os
import requests 
from sklearn.svm import SVC, LinearSVC
import pandas as pd
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from scipy import interp
from itertools import cycle
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [14]:
# Decision stump used as weak classifier
class DecisionStump:
    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None
    def predict(self, X):
        n_samples = X.shape[0]
        X_column = X[:, self.feature_idx]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X_column < self.threshold] = -1
        else:
            predictions[X_column > self.threshold] = -1
        return predictions
class AdaBoostClassifier():
    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        self.clfs = []
    def fit(self, X, y):
        n_samples, n_features = X.shape
        # Initialize weights to 1/N
        w = np.full(n_samples, (1 / n_samples))
        self.clfs = []
        # Iterate through classifiers
        for _ in range(self.n_clf):
            clf = DecisionStump()
            min_error = float("inf")
            # greedy search to find best threshold and feature
            for feature_i in range(n_features):
                X_column = X[:, feature_i]
                thresholds = np.unique(X_column)
                for threshold in thresholds:
                    # predict with polarity 1
                    p = 1
                    predictions = np.ones(n_samples)
                    predictions[X_column < threshold] = -1
                    # Error = sum of weights of misclassified samples
                    misclassified = w[y != predictions]
                    error = sum(misclassified)
                    if error > 0.5:
                        error = 1 - error
                        p = -1
                    # store the best configuration
                    if error < min_error:
                        clf.polarity = p
                        clf.threshold = threshold
                        clf.feature_idx = feature_i
                        min_error = error
            # calculate alpha
            EPS = 1e-10
            clf.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))
            # calculate predictions and update weights
            predictions = clf.predict(X)
            w *= np.exp(-clf.alpha * y * predictions)
            # Normalize to one
            w /= np.sum(w)
            # Save classifier
            self.clfs.append(clf)
    def predict(self, X):
        clf_preds = [clf.alpha * clf.predict(X) for clf in self.clfs]
        y_pred = np.sum(clf_preds, axis=0)
        y_pred = np.sign(y_pred)
        return y_pred

In [2]:
df = pd.read_csv('Laterality.csv')
X= df.drop(['Overall_Laterality_NO'], axis=1, inplace=False)
y=df['Overall_Laterality_NO']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
encoder = LabelEncoder()
categorical_features = X_train.columns.tolist()
for each in categorical_features:
    X_train[each] = encoder.fit_transform(X_train[each])
test_cat_features = X_test.columns.tolist()
for col in test_cat_features:
    X_test[col] = encoder.fit_transform(X_test[col])
# feature scaling
scaler = StandardScaler()
train = scaler.fit_transform(X_train)
test = scaler.transform(X_test)
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
for k in [5,7]:# and 10
    print('Cross Validation Score for k={}:'.format(k) , cross_val_score(clf, X_train, y_train, cv=k))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
#confusion matrix
print('confusion ' , confusion_matrix(y_test, y_pred))
#precision
print('Precision:' ,  precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))
#recall
print('Recal :' , recall_score(y_test, y_pred, average='weighted'))
#f1 score
print('f1 score  ' , f1_score(y_test,y_pred, average= 'weighted'))

Cross Validation Score for k=5: [1.  0.8 0.4 1.  1. ]
Cross Validation Score for k=7: [1.         0.75       0.66666667 0.33333333 0.66666667 1.
 1.        ]
Accuracy: 0.5833333333333334
confusion  [[7 0]
 [5 0]]
Precision: 0.5833333333333334
Recal : 0.5833333333333334
f1 score   0.4298245614035088


In [4]:
df = pd.read_csv('COVID_Dataset.csv')
X= df.drop(['Infection'], axis=1, inplace=False)
y=df['Infection']

In [5]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
#confusion matrix
print('confusion ' , confusion_matrix(y_test, y_pred))
#precision
print('Precision:' ,  precision_score(y_test, y_pred, average='weighted'))
#recall
print('Recal :' , recall_score(y_test, y_pred, average='weighted'))
#f1 score
print('f1 score  ' , f1_score(y_test,y_pred, average= 'weighted'))

Accuracy: 0.7677651515151516
confusion  [[15563  2002]
 [ 4129  4706]]
Precision: 0.7606130693064311
Recal : 0.7677651515151516
f1 score   0.7585040640005539


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X, y)
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
#confusion matrix
print('confusion ' , confusion_matrix(y_test, y_pred))
#precision
print('Precision:' ,  precision_score(y_test, y_pred, average='weighted'))
#recall
print('Recal :' , recall_score(y_test, y_pred, average='weighted'))
#f1 score
print('f1 score  ' , f1_score(y_test,y_pred, average= 'weighted'))

Accuracy: 1.0
confusion  [[17565     0]
 [    0  8835]]
Precision: 1.0
Recal : 1.0
f1 score   1.0


## **AdaBoost , k = 5**

In [7]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.33, random_state=42)
xconcat = [xtrain, xtest]
yconcat = [ytrain, ytest]
X = pd.concat(xconcat)
y = pd.concat(yconcat)

In [8]:
k = 5
kf = KFold(n_splits=k, random_state=None)
model = OneVsRestClassifier(AdaBoostClassifier())

In [9]:
accuracyscore = []
for train_index, test_index in kf.split(X):
  xtrain , xtest = X.iloc[train_index,:],X.iloc[test_index,:]
  ytrain , ytest = y.iloc[train_index] , y.iloc[test_index]
  
  model.fit(xtrain, ytrain)
  predvalues = model.predict(xtest)
  acc = metrics.accuracy_score(predvalues, ytest)
  accuracyscore.append(acc)

avg_accuracyscore = sum(accuracyscore)/k

print('accuracy of each fold - {}'.format(accuracyscore))
print('Avg accuracy : {}'.format(avg_accuracyscore))

accuracy of each fold - [0.7541875, 0.760375, 0.75325, 0.7574375, 0.7573125]
Avg accuracy : 0.7565125


## **AdaBoost , k = 7**

In [10]:
k = 7
kf = KFold(n_splits=k, random_state=None)
model = OneVsRestClassifier(AdaBoostClassifier())

In [11]:
accuracyscore = []
for train_index, test_index in kf.split(X):
  xtrain , xtest = X.iloc[train_index,:],X.iloc[test_index,:]
  ytrain , ytest = y.iloc[train_index] , y.iloc[test_index]
  
  model.fit(xtrain, ytrain)
  predvalues = model.predict(xtest)
  acc = metrics.accuracy_score(predvalues, ytest)
  accuracyscore.append(acc)

avg_accuracyscore = sum(accuracyscore)/k

print('accuracy of each fold - {}'.format(accuracyscore))
print('Avg accuracy : {}'.format(avg_accuracyscore))

accuracy of each fold - [0.7571091084084347, 0.7568466182518155, 0.7523842855892904, 0.7541342199667512, 0.7584879243962198, 0.7600630031501575, 0.7502625131256563]
Avg accuracy : 0.7556125246983321


## **AdaBoost , k = 10**

In [12]:
k = 10
kf = KFold(n_splits=k, random_state=None)
model = OneVsRestClassifier(AdaBoostClassifier())

In [13]:
accuracyscore = []
for train_index, test_index in kf.split(X):
  xtrain , xtest = X.iloc[train_index,:],X.iloc[test_index,:]
  ytrain , ytest = y.iloc[train_index] , y.iloc[test_index]
  
  model.fit(xtrain, ytrain)
  predvalues = model.predict(xtest)
  acc = metrics.accuracy_score(predvalues, ytest)
  accuracyscore.append(acc)

avg_accuracyscore = sum(accuracyscore)/k

print('accuracy of each fold - {}'.format(accuracyscore))
print('Avg accuracy : {}'.format(avg_accuracyscore))

accuracy of each fold - [0.75575, 0.75725, 0.759625, 0.759125, 0.749625, 0.755125, 0.755625, 0.76175, 0.758375, 0.746375]
Avg accuracy : 0.7558625000000001
