In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, score

In [3]:
# Import and preview data set
data = pd.read_csv(targetDataSet)
data.head()

Unnamed: 0,Website,Length,Protocol,Info
0,0,66,1,4
1,0,66,1,1
2,0,54,1,6
3,0,712,0,12
4,0,1490,0,8


In [4]:
# Separate data set using test/train split
XTrain, XTest, yTrain, yTest = dataSetSplit(data.drop(columns = ["Website"]).values, data["Website"].values)

In [4]:
svm = SGDClassifier()
svm.fit(XTrain, yTrain)
score(svm, XTrain, yTrain, XTest, yTest)

Training score: 0.5316748768472906
Testing score: 0.5340360555610285


In [5]:
model = StackingClassifier(estimators = [("1", KNN()), ("2", DTC()), ("3", SGDClassifier())], final_estimator = LR(), n_jobs = -1, cv = 2)

In [6]:
grid = GridSearchCV(estimator=model, n_jobs = -1, param_grid={"1__n_neighbors": np.array(range(1,20)), "2__min_samples_split": np.array(range(2, 23)), "2__min_samples_leaf": np.array(range(1, 21))})

In [7]:
grid.fit(XTrain, yTrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
gridDataframe = pd.DataFrame(grid.cv_results_)
gridDataframe

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_1__n_neighbors,param_2__min_samples_leaf,param_2__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,7.629859,0.462699,0.684692,0.082848,1,1,2,"{'1__n_neighbors': 1, '2__min_samples_leaf': 1...",0.280911,0.539532,0.533005,0.379064,0.434113,0.433325,0.097356,285
1,6.774984,0.803847,0.716265,0.128255,1,1,3,"{'1__n_neighbors': 1, '2__min_samples_leaf': 1...",0.107512,0.428571,0.227094,0.206897,0.427709,0.279557,0.127895,7115
2,7.337408,0.724494,0.722882,0.115215,1,1,4,"{'1__n_neighbors': 1, '2__min_samples_leaf': 1...",0.230665,0.223892,0.233128,0.107266,0.233990,0.205788,0.049388,7959
3,6.617655,0.928886,0.768530,0.165877,1,1,5,"{'1__n_neighbors': 1, '2__min_samples_leaf': 1...",0.449877,0.441872,0.107635,0.239778,0.204433,0.288719,0.135439,6802
4,6.659969,0.754904,0.809512,0.180675,1,1,6,"{'1__n_neighbors': 1, '2__min_samples_leaf': 1...",0.433867,0.301601,0.233251,0.305419,0.150985,0.285025,0.093265,6942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7975,7.173061,0.245519,0.902595,0.110136,19,20,18,"{'1__n_neighbors': 19, '2__min_samples_leaf': ...",0.230911,0.422783,0.532882,0.237438,0.447783,0.374360,0.120158,2108
7976,6.723204,0.694331,0.887182,0.162853,19,20,19,"{'1__n_neighbors': 19, '2__min_samples_leaf': ...",0.242118,0.428325,0.439778,0.529064,0.438793,0.415616,0.094079,594
7977,7.492623,0.183937,0.659426,0.061190,19,20,20,"{'1__n_neighbors': 19, '2__min_samples_leaf': ...",0.250616,0.272291,0.291133,0.549015,0.232266,0.319064,0.116674,5479
7978,7.304334,0.285028,0.574999,0.075223,19,20,21,"{'1__n_neighbors': 19, '2__min_samples_leaf': ...",0.306404,0.432882,0.417980,0.535837,0.436453,0.425911,0.072922,423


In [9]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_1__n_neighbors,param_2__min_samples_leaf,param_2__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1708,7.155454,0.485967,0.830759,0.131974,5,2,9,"{'1__n_neighbors': 5, '2__min_samples_leaf': 2...",0.430911,0.54335,0.533251,0.52697,0.536084,0.514113,0.041932,1


In [33]:
model1 = GaussianNB()
model1.fit(XTrain, yTrain)
model1.score(XTest, yTest)

0.533937543099202

In [35]:
model2 = MLPClassifier()
model2.fit(XTrain, yTrain)
model2.score(XTest, yTest)

0.5755098019899517

In [5]:
model = StackingClassifier(estimators = [("1", KNN()), ("2", DTC()), ("3", MLPClassifier())], final_estimator = LR(), n_jobs = -1)

In [6]:
model.fit(XTrain, yTrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
score(model, XTrain, yTrain, XTest, yTest)

Training score: 0.7370689655172413
Testing score: 0.718746921485568
