In [1]:
import sklearn
import numpy as np
from numpy import random
import pandas as pd
import io
import requests
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import os
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import time

In [2]:
# Report the installed scikit-learn version so the run can be reproduced.
print(f'The scikit-learn version is {sklearn.__version__}.')

The scikit-learn version is 0.21.2.


In [3]:
def load_dataset(path, urls):
    """Download every file listed in ``urls`` into the directory ``path``.

    Parameters
    ----------
    path : str
        Target directory. It is created (including missing parents) when it
        does not exist yet.
    urls : iterable of str
        URLs to download. Each file is stored under the last path component
        of its URL, overwriting any existing file of that name.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status for one of the URLs.
    """
    # makedirs handles nested paths and is a no-op when the directory exists.
    os.makedirs(path, exist_ok=True)

    for url in urls:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, timeout=60)
        # Fail loudly instead of saving an HTTP error page as if it were data.
        response.raise_for_status()
        filename = os.path.join(path, os.path.basename(url))
        with open(filename, "wb") as file:
            file.write(response.content)

Adult Dataset Cleaning

In [4]:
# The three Adult files (data, description, test split) are all fetched over HTTPS.
urls = ["https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"]
load_dataset('data', urls)

In [5]:
# The raw files have no header row, so the column names are supplied here.
columns = [
    "age", "workClass", "fnlwgt", "education", "education_num",
    "maritalstatus", "occupation", "relationship", "race", "sex",
    "capitalgain", "capitalloss", "hours_per_week", "native_country", "income",
]
# Both files are parsed with the same options: the separator is a regex that
# swallows the spaces around each comma, and '?' marks a missing value.
adult_read_options = dict(names=columns, sep=' *, *', na_values='?', engine='python')
train_data = pd.read_csv('data/adult.data', **adult_read_options)
test_data = pd.read_csv('data/adult.test', **adult_read_options)

In [6]:
# Show dtypes and non-null counts of the train frame to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workClass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
maritalstatus     32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capitalgain       32561 non-null int64
capitalloss       32561 non-null int64
hours_per_week    32561 non-null int64
native_country    31978 non-null object
income            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
# Show dtypes and non-null counts of the test frame to spot columns with missing values.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16282 entries, 0 to 16281
Data columns (total 15 columns):
age               16282 non-null object
workClass         15318 non-null object
fnlwgt            16281 non-null float64
education         16281 non-null object
education_num     16281 non-null float64
maritalstatus     16281 non-null object
occupation        15315 non-null object
relationship      16281 non-null object
race              16281 non-null object
sex               16281 non-null object
capitalgain       16281 non-null float64
capitalloss       16281 non-null float64
hours_per_week    16281 non-null float64
native_country    16007 non-null object
income            16281 non-null object
dtypes: float64(5), object(10)
memory usage: 1.9+ MB


In [8]:
# Discard every train row that has at least one missing value.
train_data = train_data.dropna()

In [9]:
# Discard every test row that has at least one missing value.
test_data = test_data.dropna()

In [10]:
# Remove the two columns that are not used as features, from both splits.
unused_columns = ['fnlwgt', 'education']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

In [11]:
## one-hot encoding
# Numeric columns are kept as they are; the target column 'income' is removed
# from the string columns before they are expanded into dummy columns.
numerical_subset_train = train_data.select_dtypes('number')
categorical_subset_train = pd.get_dummies(
    train_data.select_dtypes('object').drop(columns=['income'])
)

In [12]:
## After one-hot encoding, train and test have different columns because the country 'Holand-Netherlands' does not occur in the test data.
# For convenience, drop that dummy column from the train set so both sets share the same columns.
categorical_subset_train = categorical_subset_train.drop(['native_country_Holand-Netherlands'],axis=1)
train = pd.concat([numerical_subset_train, categorical_subset_train], axis=1)

In [13]:
# 'age' is read as an object column in the test file, so cast it to float
# before the numeric columns are selected.
test_data['age'] = test_data['age'].astype(float)
numerical_subset_test = test_data.select_dtypes('number')
# Same treatment as for the train split: drop the target, then one-hot encode.
categorical_subset_test = pd.get_dummies(
    test_data.select_dtypes('object').drop(columns=['income'])
)

In [14]:
# Side-by-side frame of numeric and one-hot columns for the test split
# (the counterpart of `train`; the feature matrix below is built from the subsets directly).
test = pd.concat([numerical_subset_test, categorical_subset_test], axis=1)

In [16]:
# Encode the target as 0/1. The labels in the test file carry a trailing period, hence the separate mapping.
income_train = train_data['income'].replace({"<=50K": 0, ">50K": 1})
income_test = test_data['income'].replace({"<=50K.": 0, ">50K.": 1})
# Train labels first, test labels second - the same order in which the feature rows are stacked.
Y_adult = pd.concat([income_train, income_test], axis =0)
# (unused leftover) Y = Y.dropna()

In [17]:
## check data balance
# Y_adult is already a Series, so its class shares can be read off directly.
Y_adult.value_counts(normalize=True)

0    0.752156
1    0.247844
Name: income, dtype: float64

In [18]:
## scale the numerical columns only, using ONE scaler fitted on the train file
train_fit = StandardScaler().fit(numerical_subset_train)
train_sc = train_fit.transform(numerical_subset_train)
## scale test data with the very same mean/std: fitting a second scaler on the
## test file would put identical raw values on two different scales inside the
## stacked matrix. `test_fit` is kept as an alias for code that refers to it.
test_fit = train_fit
test_sc = test_fit.transform(numerical_subset_test[numerical_subset_train.columns])
## the one-hot blocks are stacked positionally, so their columns must line up
assert list(categorical_subset_train.columns) == list(categorical_subset_test.columns), \
    "train/test one-hot columns differ"
## combine, get X
X_adult_train = np.hstack((train_sc, categorical_subset_train))
X_adult_test = np.hstack((test_sc, categorical_subset_test))
X_adult = np.vstack((X_adult_train, X_adult_test))

In [19]:
## NOTE: abandoned draft, kept for reference only - the working implementation is logistic_classifier(X, Y) defined further down.
##def logistic_classifier (train_X, train_Y, test_X, test_Y):
    #C = [10**-8, 10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**1, 10**2, 10**3, 10**4, 0]
    #grid={"C":C, "penalty":["l1","l2"]}
    #scoring = ['accuracy', 'roc_auc', 'f1']
    #classifier = GridSearchCV(LogisticRegression(solver = 'liblinear'), C, cv=5, scoring = scoring, n_jobs=-1)
    #classifier.fit(train_X, train_Y)

In [20]:
# Sanity check: total number of labels after stacking the train and test splits.
Y_adult.shape

(45222,)

Covertype Cleaning

In [21]:
# os.path.join avoids a hard-coded backslash, which is an invalid string escape
# ('\c') and only works as a path separator on Windows.
covtype = pd.read_csv(os.path.join('Downloads', 'covtype.csv'), sep=' *, *', na_values='?', engine='python')

In [22]:
##Turn Y into binary, calculate class distribution
# Cover type 7 is the positive class (1); every other cover type becomes 0.
index7 = covtype['Cover_Type'].eq(7)
Y_cov = index7.values.astype(int)
pd.Series(Y_cov).value_counts(normalize=True)

0    0.9647
1    0.0353
dtype: float64

In [23]:
# Every column except the target is a feature; standardise them all.
covtype_features = covtype.drop(columns=['Cover_Type'])
fit_cov = StandardScaler().fit(covtype_features)
X_cov = fit_cov.transform(covtype_features)

Letter O Positive Cleaning

In [24]:
# os.path.join avoids a hard-coded backslash, which is an invalid string escape
# ('\l') and only works as a path separator on Windows.
letter = pd.read_csv(os.path.join('Downloads', 'letter-recognition.csv'), sep=' *, *', na_values='?', engine='python')

In [25]:
##Turn Y into binary, calculate class distribution
# 1.0 where the sample is the letter 'O', 0.0 for every other letter.
letterO_Y = np.where(letter['letter'] == 'O', 1.0, 0.0)

pd.Series(letterO_Y).value_counts(normalize=True)

0.0    0.96235
1.0    0.03765
dtype: float64

In [26]:
## define Y and X, standardize
# Labels come from the letter-'O' indicator; features are all columns but 'letter'.
Y_letterO = letterO_Y
letter_o_features = letter.drop(columns=['letter'])
fit_lo = StandardScaler().fit(letter_o_features)
X_letterO = fit_lo.transform(letter_o_features)

Letter AM Positive Cleaning

In [27]:
# Positive class: the letters 'A' to 'M'.
AM = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']
# Vectorised membership test giving float labels (1.0 for A-M, 0.0 otherwise).
# A row-by-row loop is not needed, and adding 0 to the whole array for every
# negative row would cost O(n) per row without changing any value.
letterO_AM = letter['letter'].isin(AM).values.astype(float)

pd.Series(letterO_AM).value_counts(normalize=True)

0.0    0.503
1.0    0.497
dtype: float64

In [28]:
# Labels come from the A-M indicator; features are all columns but 'letter'.
Y_letterAM = letterO_AM
letter_am_features = letter.drop(columns=['letter'])
fit_AM = StandardScaler().fit(letter_am_features)
X_letterAM = fit_AM.transform(letter_am_features)

In [29]:
## Logistic Regression
def logistic_classifier(X, Y):
    """Tune and evaluate a class-balanced logistic regression over 5 random trials.

    In every trial 5000 samples are drawn for training (seeded with the trial
    number) and all remaining samples form the test set, so ``len(X)`` must
    exceed 5000. A 5-fold grid search over ``C`` and the penalty is scored on
    accuracy, ROC AUC and F1; for each of the three metrics the top-ranked
    setting is refitted on the whole training sample and scored on both sets.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    Y : array-like, shape (n_samples,)
        Binary labels.

    Returns
    -------
    tuple
        ``(best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc,
        opt_test_acc, opt_test_f1, opt_test_auc)`` where

        * ``best`` - the three parameter dicts (chosen by accuracy, ROC AUC,
          F1, in that order) of the LAST trial,
        * ``mean_test_acc`` - one array per trial with the mean CV accuracy of
          every grid point,
        * ``opt_*`` - flat lists with three entries per trial, ordered like
          ``best``.
    """
    # The search space is identical in every trial, so it is defined once.
    C = [10**-8, 10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**1, 10**2, 10**3, 10**4]
    grid = {"C": C, "penalty": ["l1", "l2"]}
    scoring = ['accuracy', 'roc_auc', 'f1']

    best = []
    mean_test_acc = []
    opt_train_acc, opt_train_f1, opt_train_auc = [], [], []
    opt_test_acc, opt_test_f1, opt_test_auc = [], [], []

    for trial in range(5):
        # pick random samples
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=len(X) - 5000, train_size=5000, random_state=trial, shuffle=True)

        # run the grid search on the training sample
        classifier = GridSearchCV(
            LogisticRegression(solver='liblinear', class_weight='balanced'),
            grid, cv=5, scoring=scoring, refit=False, n_jobs=-1)
        classifier.fit(X_train, Y_train)
        results = classifier.cv_results_

        # best parameters per metric, in the order of `scoring`
        best = [results['params'][np.argmin(results['rank_test_' + metric])] for metric in scoring]
        mean_test_acc.append(results['mean_test_accuracy'])

        # refit one model per selected parameter set on the full training sample
        models = [
            LogisticRegression(solver='liblinear', class_weight='balanced', **params).fit(X_train, Y_train)
            for params in best
        ]

        for features, labels, acc, f1, auc in (
                (X_train, Y_train, opt_train_acc, opt_train_f1, opt_train_auc),
                (X_test, Y_test, opt_test_acc, opt_test_f1, opt_test_auc)):
            for model in models:
                predicted = model.predict(features)
                acc.append(accuracy_score(labels, predicted))
                f1.append(f1_score(labels, predicted))
                # ROC AUC needs a continuous score; hard 0/1 predictions would
                # collapse the curve to a single operating point.
                auc.append(roc_auc_score(labels, model.decision_function(features)))

    return best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc, opt_test_acc, opt_test_f1, opt_test_auc

In [30]:
## SVM
def svm_classifier(X, Y):
    """Tune and evaluate a class-balanced SVM over 5 random trials.

    In every trial 5000 samples are drawn for training (seeded with the trial
    number) and all remaining samples form the test set, so ``len(X)`` must
    exceed 5000. A 5-fold grid search over RBF, linear and polynomial kernels
    is scored on accuracy, ROC AUC and F1; for each of the three metrics the
    top-ranked setting is refitted on the whole training sample and scored on
    both sets.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    Y : array-like, shape (n_samples,)
        Binary labels.

    Returns
    -------
    tuple
        ``(best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc,
        opt_test_acc, opt_test_f1, opt_test_auc)`` where

        * ``best`` - the three parameter dicts (chosen by accuracy, ROC AUC,
          F1, in that order) of the LAST trial,
        * ``mean_test_acc`` - one array per trial with the mean CV accuracy of
          every grid point,
        * ``opt_*`` - flat lists with three entries per trial, ordered like
          ``best``.
    """
    # The search space is identical in every trial, so it is defined once.
    c_values = [10**-7, 10**-6, 10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**1, 10**2, 10**3, 10**4]
    svm_param = [
        {'kernel': ['rbf'], 'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2], 'C': c_values},
        {'kernel': ['linear'], 'C': c_values},
        {'kernel': ['poly'], 'degree': [2, 3], 'C': c_values},
    ]
    scoring = ['accuracy', 'roc_auc', 'f1']

    best = []
    mean_test_acc = []
    opt_train_acc, opt_train_f1, opt_train_auc = [], [], []
    opt_test_acc, opt_test_f1, opt_test_auc = [], [], []

    for trial in range(5):
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=len(X) - 5000, train_size=5000, random_state=trial, shuffle=True)

        # Named `search` so the local does not shadow this function's own name.
        search = GridSearchCV(svm.SVC(class_weight='balanced'), svm_param, cv=5,
                              scoring=scoring, refit=False, n_jobs=-1)
        search.fit(X_train, Y_train)
        results = search.cv_results_

        ## best parameters per metric, in the order of `scoring`
        best = [results['params'][np.argmin(results['rank_test_' + metric])] for metric in scoring]
        mean_test_acc.append(results['mean_test_accuracy'])

        ## refit one model per selected parameter set on the full training sample
        models = [
            svm.SVC(class_weight='balanced', **params).fit(X_train, Y_train)
            for params in best
        ]

        for features, labels, acc, f1, auc in (
                (X_train, Y_train, opt_train_acc, opt_train_f1, opt_train_auc),
                (X_test, Y_test, opt_test_acc, opt_test_f1, opt_test_auc)):
            for model in models:
                predicted = model.predict(features)
                acc.append(accuracy_score(labels, predicted))
                f1.append(f1_score(labels, predicted))
                # ROC AUC needs a continuous score; hard 0/1 predictions would
                # collapse the curve to a single operating point.
                auc.append(roc_auc_score(labels, model.decision_function(features)))

    return best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc, opt_test_acc, opt_test_f1, opt_test_auc

In [47]:
def RF_classifier(X, Y):
    """Tune and evaluate a class-balanced random forest over 5 random trials.

    In every trial 5000 samples are drawn for training (seeded with the trial
    number) and all remaining samples form the test set, so ``len(X)`` must
    exceed 5000. A 5-fold grid search over ``max_features`` (with 1024 trees)
    is scored on accuracy, ROC AUC and F1; for each of the three metrics the
    top-ranked setting is refitted on the whole training sample and scored on
    both sets.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    Y : array-like, shape (n_samples,)
        Binary labels.

    Returns
    -------
    tuple
        ``(best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc,
        opt_test_acc, opt_test_f1, opt_test_auc)`` where

        * ``best`` - the three parameter dicts (chosen by accuracy, ROC AUC,
          F1, in that order) of the LAST trial,
        * ``mean_test_acc`` - one array per trial with the mean CV accuracy of
          every grid point,
        * ``opt_*`` - flat lists with three entries per trial, ordered like
          ``best``.
    """
    # define parameters (identical in every trial). Candidates larger than the
    # number of available features are dropped, since a forest cannot use them.
    n_features = np.shape(X)[1]
    trees = [1024]
    max_features = [m for m in (1, 2, 4, 6, 8, 12, 16, 20) if m <= n_features]
    grid = {'n_estimators': trees, 'max_features': max_features}
    scoring = ['accuracy', 'roc_auc', 'f1']

    best = []
    mean_test_acc = []
    opt_train_acc, opt_train_f1, opt_train_auc = [], [], []
    opt_test_acc, opt_test_f1, opt_test_auc = [], [], []

    for trial in range(5):
        # pick random samples
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=len(X) - 5000, train_size=5000, random_state=trial, shuffle=True)

        # run the grid search; the forest is seeded so a re-run gives the same result
        classifier = GridSearchCV(
            RandomForestClassifier(class_weight='balanced', random_state=trial),
            grid, cv=5, scoring=scoring, refit=False, n_jobs=-1)
        classifier.fit(X_train, Y_train)
        results = classifier.cv_results_

        # best parameters per metric, in the order of `scoring`
        best = [results['params'][np.argmin(results['rank_test_' + metric])] for metric in scoring]
        mean_test_acc.append(results['mean_test_accuracy'])

        # Refit with the COMPLETE selected parameter set: passing only
        # max_features would silently fall back to the estimator's default
        # tree count instead of the 1024 trees that were cross-validated.
        models = [
            RandomForestClassifier(class_weight='balanced', random_state=trial, **params).fit(X_train, Y_train)
            for params in best
        ]

        for features, labels, acc, f1, auc in (
                (X_train, Y_train, opt_train_acc, opt_train_f1, opt_train_auc),
                (X_test, Y_test, opt_test_acc, opt_test_f1, opt_test_auc)):
            for model in models:
                predicted = model.predict(features)
                acc.append(accuracy_score(labels, predicted))
                f1.append(f1_score(labels, predicted))
                # ROC AUC needs a continuous score: use the predicted probability
                # of the positive (last) class rather than hard 0/1 predictions.
                auc.append(roc_auc_score(labels, model.predict_proba(features)[:, 1]))

    return best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc, opt_test_acc, opt_test_f1, opt_test_auc

In [48]:
def RF_classifier_forletter(X, Y):
    """Random-forest tuning/evaluation with a ``max_features`` grid capped at 16.

    In every trial (5 in total) 5000 samples are drawn for training (seeded
    with the trial number) and all remaining samples form the test set, so
    ``len(X)`` must exceed 5000. A 5-fold grid search over ``max_features``
    (with 1024 trees) is scored on accuracy, ROC AUC and F1; for each of the
    three metrics the top-ranked setting is refitted on the whole training
    sample and scored on both sets.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    Y : array-like, shape (n_samples,)
        Binary labels.

    Returns
    -------
    tuple
        ``(best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc,
        opt_test_acc, opt_test_f1, opt_test_auc)`` where

        * ``best`` - the three parameter dicts (chosen by accuracy, ROC AUC,
          F1, in that order) of the LAST trial,
        * ``mean_test_acc`` - one array per trial with the mean CV accuracy of
          every grid point,
        * ``opt_*`` - flat lists with three entries per trial, ordered like
          ``best``.
    """
    # define parameters (identical in every trial). Candidates larger than the
    # number of available features are dropped, since a forest cannot use them.
    n_features = np.shape(X)[1]
    trees = [1024]
    max_features = [m for m in (1, 2, 4, 6, 8, 12, 16) if m <= n_features]
    grid = {'n_estimators': trees, 'max_features': max_features}
    scoring = ['accuracy', 'roc_auc', 'f1']

    best = []
    mean_test_acc = []
    opt_train_acc, opt_train_f1, opt_train_auc = [], [], []
    opt_test_acc, opt_test_f1, opt_test_auc = [], [], []

    for trial in range(5):
        # pick random samples
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=len(X) - 5000, train_size=5000, random_state=trial, shuffle=True)

        # run the grid search; the forest is seeded so a re-run gives the same result
        classifier = GridSearchCV(
            RandomForestClassifier(class_weight='balanced', random_state=trial),
            grid, cv=5, scoring=scoring, refit=False, n_jobs=-1)
        classifier.fit(X_train, Y_train)
        results = classifier.cv_results_

        # best parameters per metric, in the order of `scoring`
        best = [results['params'][np.argmin(results['rank_test_' + metric])] for metric in scoring]
        mean_test_acc.append(results['mean_test_accuracy'])

        # Refit with the COMPLETE selected parameter set: passing only
        # max_features would silently fall back to the estimator's default
        # tree count instead of the 1024 trees that were cross-validated.
        models = [
            RandomForestClassifier(class_weight='balanced', random_state=trial, **params).fit(X_train, Y_train)
            for params in best
        ]

        for features, labels, acc, f1, auc in (
                (X_train, Y_train, opt_train_acc, opt_train_f1, opt_train_auc),
                (X_test, Y_test, opt_test_acc, opt_test_f1, opt_test_auc)):
            for model in models:
                predicted = model.predict(features)
                acc.append(accuracy_score(labels, predicted))
                f1.append(f1_score(labels, predicted))
                # ROC AUC needs a continuous score: use the predicted probability
                # of the positive (last) class rather than hard 0/1 predictions.
                auc.append(roc_auc_score(labels, model.predict_proba(features)[:, 1]))

    return best, mean_test_acc, opt_train_acc, opt_train_f1, opt_train_auc, opt_test_acc, opt_test_f1, opt_test_auc

In [44]:
# Number of feature columns in the letter data.
X_letterO.shape[1]

16

In [32]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
adult_logi = logistic_classifier(X_adult, Y_adult)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")

Runtime of the program is 59.811641693115234


In [33]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
adult_svm = svm_classifier(X_adult, Y_adult)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 20453.943395376205


In [34]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
adult_rf = RF_classifier(X_adult, Y_adult)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 474.0063316822052


In [35]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
cov_logi = logistic_classifier(X_cov, Y_cov)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Runtime of the program is 1500.794606924057


In [36]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
cov_svm = svm_classifier(X_cov, Y_cov)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 9453.8247320652


In [37]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
cov_rf = RF_classifier(X_cov, Y_cov)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 355.7090907096863


In [38]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
letterO_logi = logistic_classifier(X_letterO, Y_letterO)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Runtime of the program is 6.596489191055298


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [39]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
letterO_svm = svm_classifier(X_letterO, Y_letterO)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 21463.724128246307


In [49]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
letterO_rf = RF_classifier_forletter(X_letterO, Y_letterO)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 268.1994228363037


In [50]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
letterAM_logi = logistic_classifier(X_letterAM, Y_letterAM)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")

Runtime of the program is 6.788654565811157


In [52]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
letterAM_svm = svm_classifier(X_letterAM, Y_letterAM)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 26007.7803709507


In [53]:
# perf_counter is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments while the experiment runs.
start = time.perf_counter()
letterAM_rf = RF_classifier_forletter(X_letterAM, Y_letterAM)
end = time.perf_counter()

# total time taken
print(f"Runtime of the program is {end - start}")



Runtime of the program is 457.0757746696472
