In [None]:
#ML Group Ensemble of SVMS
import pandas as pd 
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVC 
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import seaborn as sns 
import matplotlib.pyplot as plt 
from time import time

In [None]:
# INITIALISATION
# Load both encodings of the mushroom data with pandas.
mushroom = pd.read_csv('mushroom_onehot.csv')
mushroomQ = pd.read_csv('mushroom_quantitative.csv')

# Number of folds passed as `cv` to the cross_val_score calls in this notebook.
k = 10

# Quantitative encoding: the target is the "class" column, the features are
# every remaining column.
yQ = mushroomQ["class"]
XQ = mushroomQ.drop(columns=["class"])

# One-hot encoding: the target is the "class b'p'" column.
y = mushroom["class b'p'"]
X = mushroom.drop(columns=["class b'p'"])

# TRAINING
# Hold out 20% of each dataset; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_trainQ, X_testQ, y_trainQ, y_testQ = train_test_split(
    XQ, yQ, test_size=0.2, random_state=42
)

### Varying the C value

In [None]:
timeGraph = []
types = []

# ONEHOT
# (label, SVC settings) for each kernel; C is supplied by the sweep below.
# The RBF entry no longer passes `degree`: SVC only uses that parameter for
# the polynomial kernel, so it had no effect on the RBF model.
kernelSettings = [
    ('Poly', {'kernel': 'poly', 'coef0': 1}),
    ('RBF', {'kernel': 'rbf'}),
    ('Linear', {'kernel': 'linear'}),
]

for CVal in range(1, 9):
    for kernelName, kernelParams in kernelSettings:
        # One timing covers building the bagged SVM plus two k-fold
        # cross-validation passes (F1 first, then accuracy).
        start = time()
        svmBag = SVC(C=CVal, **kernelParams)
        baggingEnsemble = BaggingClassifier(svmBag, n_estimators=5, random_state=42)
        f1Score = cross_val_score(baggingEnsemble, X_train, y_train, cv=k, scoring="f1").mean() * 100
        accuracy = cross_val_score(baggingEnsemble, X_train, y_train, cv=k, scoring="accuracy").mean() * 100
        end = time()
        timeGraph.append(end - start)
        types.append(kernelName)

print(timeGraph)
plt.figure(figsize=(5, 5))
plt.title("Onehot Dataset - Time Taken")
# The bars are grouped by kernel type (not by ensemble size), so the axis is
# labelled accordingly.
plt.xlabel('Kernel')
plt.ylabel('Time Taken')
# Every kernel label occurs once per C value, so each bar summarises the
# eight timings recorded for that kernel.
ax = sns.barplot(x=types, y=timeGraph)
plt.show()

In [None]:
timeGraph = []
types = []

# QUANTITATIVE
# (label, SVC settings) for each kernel; C is supplied by the sweep below.
kernelSettings = [
    ('Linear', {'kernel': 'linear'}),
    ('Poly', {'kernel': 'poly', 'coef0': 1}),
    ('RBF', {'kernel': 'rbf', 'gamma': 'auto'}),
]

for CVal in range(1, 9):
    for kernelName, kernelParams in kernelSettings:
        # One timing covers building the bagged SVM plus two k-fold
        # cross-validation passes (F1 first, then accuracy).
        start = time()
        svmBag = SVC(C=CVal, **kernelParams)
        baggingEnsemble = BaggingClassifier(svmBag, n_estimators=5, random_state=42)
        f1Score = cross_val_score(baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="f1").mean() * 100
        accuracy = cross_val_score(baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="accuracy").mean() * 100
        end = time()
        timeGraph.append(end - start)
        types.append(kernelName)

print(timeGraph)
plt.figure(figsize=(5, 5))
plt.title("Quantitative Dataset - Time Taken")
# The bars are grouped by kernel type (not by ensemble size), so the axis is
# labelled accordingly.
plt.xlabel('Kernel')
plt.ylabel('Time Taken')
# Every kernel label occurs once per C value, so each bar summarises the
# eight timings recorded for that kernel.
ax = sns.barplot(x=types, y=timeGraph)
plt.show()



### Polynomial kernel: varying coef0

In [None]:
timeGraph, types = [], []

# ONEHOT
# Sweep coef0 of a polynomial-kernel SVM (C fixed at 1) and record how long
# the model construction plus both cross-validation passes take per value.
for coefVal in range(1, 9):
    tic = time()
    svmBag = SVC(kernel='poly', C=1, coef0=coefVal)
    baggingEnsemble = BaggingClassifier(
        svmBag,
        n_estimators=5,
        random_state=42,
    )
    f1Score = 100 * cross_val_score(
        baggingEnsemble, X_train, y_train, cv=k, scoring="f1"
    ).mean()
    accuracy = 100 * cross_val_score(
        baggingEnsemble, X_train, y_train, cv=k, scoring="accuracy"
    ).mean()
    timeGraph.append(time() - tic)
    types.append(coefVal)  # x-axis category for this timing

print(timeGraph)
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Onehot Dataset - Time Taken")
ax.set_xlabel('Coef Value')
ax.set_ylabel('Time Taken')
sns.barplot(x=types, y=timeGraph, ax=ax)
plt.show()

In [None]:
timeGraph, types = [], []

# QUANTITATIVE
# Sweep coef0 of a polynomial-kernel SVM (C fixed at 1) and record how long
# the model construction plus both cross-validation passes take per value.
for coefVal in range(1, 9):
    tic = time()
    svmBag = SVC(kernel='poly', C=1, coef0=coefVal)
    baggingEnsemble = BaggingClassifier(
        svmBag,
        n_estimators=5,
        random_state=42,
    )
    f1Score = 100 * cross_val_score(
        baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="f1"
    ).mean()
    accuracy = 100 * cross_val_score(
        baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="accuracy"
    ).mean()
    timeGraph.append(time() - tic)
    types.append(coefVal)  # x-axis category for this timing

print(timeGraph)
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Quantitative Dataset - Time Taken")
ax.set_xlabel('Coef Value')
ax.set_ylabel('Time Taken')
sns.barplot(x=types, y=timeGraph, ax=ax)
plt.show()

### Polynomial kernel: varying degree

In [None]:
timeGraph, types = [], []

# ONEHOT
# Sweep the polynomial degree (C fixed at 1, coef0 fixed at 2) and record how
# long the model construction plus both cross-validation passes take.
for degreeVal in range(1, 9):
    tic = time()
    svmBag = SVC(kernel='poly', C=1, coef0=2, degree=degreeVal)
    baggingEnsemble = BaggingClassifier(
        svmBag,
        n_estimators=5,
        random_state=42,
    )
    f1Score = 100 * cross_val_score(
        baggingEnsemble, X_train, y_train, cv=k, scoring="f1"
    ).mean()
    accuracy = 100 * cross_val_score(
        baggingEnsemble, X_train, y_train, cv=k, scoring="accuracy"
    ).mean()
    timeGraph.append(time() - tic)
    types.append(degreeVal)  # x-axis category for this timing

print(timeGraph)
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Onehot Dataset - Time Taken")
ax.set_xlabel('Degree Value')
ax.set_ylabel('Time Taken')
sns.barplot(x=types, y=timeGraph, ax=ax)
plt.show()

In [None]:
timeGraph, types = [], []

# QUANTITATIVE
# Sweep the polynomial degree (C fixed at 1, coef0 fixed at 2) and record how
# long the model construction plus both cross-validation passes take.
for degreeVal in range(1, 9):
    tic = time()
    svmBag = SVC(kernel='poly', C=1, coef0=2, degree=degreeVal)
    baggingEnsemble = BaggingClassifier(
        svmBag,
        n_estimators=5,
        random_state=42,
    )
    f1Score = 100 * cross_val_score(
        baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="f1"
    ).mean()
    accuracy = 100 * cross_val_score(
        baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="accuracy"
    ).mean()
    timeGraph.append(time() - tic)
    types.append(degreeVal)  # x-axis category for this timing

print(timeGraph)
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Quantitative Dataset - Time Taken")
ax.set_xlabel('Degree Value')
ax.set_ylabel('Time Taken')
sns.barplot(x=types, y=timeGraph, ax=ax)
plt.show()

### Varying the number of estimators

In [None]:
timeGraph = []
types = []

# ONEHOT
# For every ensemble size, time a polynomial-kernel SVM and then an RBF SVM.
estimatorCounts = [5, 10, 15]
svmSettings = [
    ('Poly', {'kernel': 'poly', 'C': 1, 'coef0': 3, 'degree': 4}),
    ('RBF', {'kernel': 'rbf', 'C': 5}),
]

for nEstimator in estimatorCounts:
    for _, svcParams in svmSettings:
        # The timed region spans model construction and both CV passes.
        tic = time()
        svmBag = SVC(**svcParams)
        baggingEnsemble = BaggingClassifier(svmBag, n_estimators=nEstimator, random_state=42)
        f1Score = 100 * cross_val_score(
            baggingEnsemble, X_train, y_train, cv=k, scoring="f1"
        ).mean()
        accuracy = 100 * cross_val_score(
            baggingEnsemble, X_train, y_train, cv=k, scoring="accuracy"
        ).mean()
        timeGraph.append(time() - tic)

# Bar labels in the same order the timings were appended:
# 'Poly:5', 'RBF:5', 'Poly:10', 'RBF:10', 'Poly:15', 'RBF:15'.
estimator = [
    f'{kernelName}:{count}'
    for count in estimatorCounts
    for kernelName, _ in svmSettings
]
print(timeGraph)
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Onehot Dataset - Time Taken")
ax.set_xlabel('Number of Estimator')
ax.set_ylabel('Time Taken')
sns.barplot(x=estimator, y=timeGraph, ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
timeGraph = []
types = []

# QUANTITATIVE
# For every ensemble size, time a polynomial-kernel SVM and then an RBF SVM.
estimatorCounts = [5, 10, 15]
svmSettings = [
    ('Poly', {'kernel': 'poly', 'C': 1, 'coef0': 3, 'degree': 4}),
    ('RBF', {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'}),
]

for nEstimator in estimatorCounts:
    for _, svcParams in svmSettings:
        # The timed region spans model construction and both CV passes.
        tic = time()
        svmBag = SVC(**svcParams)
        baggingEnsemble = BaggingClassifier(svmBag, n_estimators=nEstimator, random_state=42)
        f1Score = 100 * cross_val_score(
            baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="f1"
        ).mean()
        accuracy = 100 * cross_val_score(
            baggingEnsemble, X_trainQ, y_trainQ, cv=k, scoring="accuracy"
        ).mean()
        timeGraph.append(time() - tic)

# Bar labels in the same order the timings were appended:
# 'Poly:5', 'RBF:5', 'Poly:10', 'RBF:10', 'Poly:15', 'RBF:15'.
estimator = [
    f'{kernelName}:{count}'
    for count in estimatorCounts
    for kernelName, _ in svmSettings
]
print(timeGraph)
fig, ax = plt.subplots(figsize=(5, 5))
ax.set_title("Quantitative Dataset - Time Taken")
ax.set_xlabel('Number of Estimator')
ax.set_ylabel('Time Taken')
sns.barplot(x=estimator, y=timeGraph, ax=ax)
plt.xticks(rotation=45)
plt.show()

### Final test

In [None]:
timeGraph = []
# Bar labels are appended together with each timing so the two lists cannot
# drift out of order.
estimator = []

# Train/test split for the 20% hold-out runs.
# Quantitative encoding. This previously re-read 'mushroom_onehot.csv', so
# the runs labelled "Quantitative" were actually measured on one-hot data.
mushroomendq = pd.read_csv('mushroom_quantitative.csv')
Xendq = mushroomendq.drop(columns=["class"])  # features
yendq = mushroomendq["class"]                 # target
X_trainEndQ, X_testEndQ, y_trainEndQ, y_testEndQ = train_test_split(Xendq, yendq, test_size=0.2, random_state=42)

# One-hot encoding.
mushroomend = pd.read_csv('mushroom_onehot.csv')
Xend = mushroomend.drop(columns=["class b'p'"])  # features
yend = mushroomend["class b'p'"]                 # target
X_trainEnd, X_testEnd, y_trainEnd, y_testEnd = train_test_split(Xend, yend, test_size=0.2, random_state=42)

# SVC settings shared by several runs below.
polyParams = {'kernel': 'poly', 'C': 1, 'coef0': 3, 'degree': 4}
rbfParams = {'kernel': 'rbf', 'C': 1, 'gamma': 'auto'}

# Hold-out runs: fit on the 80% split, predict the 20% split.
# The RBF quantitative run now uses the quantitative split; it previously
# fitted and predicted on the one-hot split. The one-hot runs use the split
# made in this cell, which is built from the same file, test_size and
# random_state as X_train/X_test.
holdoutRuns = [
    ('Poly 20% Quantitative', polyParams, X_trainEndQ, y_trainEndQ, X_testEndQ),
    ('RBF 20% Quantitative', rbfParams, X_trainEndQ, y_trainEndQ, X_testEndQ),
    ('Poly 20% Onehot', polyParams, X_trainEnd, y_trainEnd, X_testEnd),
    ('RBF 20% Onehot', rbfParams, X_trainEnd, y_trainEnd, X_testEnd),
]
predictions = {}
for label, svcParams, X_fit, y_fit, X_eval in holdoutRuns:
    # The timed region spans model construction, fitting and prediction.
    start = time()
    svmBag = SVC(**svcParams)
    baggingEnsemble = BaggingClassifier(svmBag, n_estimators=5, random_state=42)
    baggingEnsemble.fit(X_fit, y_fit)
    predictions[label] = baggingEnsemble.predict(X_eval)
    end = time()
    timeGraph.append(end - start)
    estimator.append(label)

# Keep the predictions reachable under the names this cell defined before.
y_predq = predictions['Poly 20% Quantitative']
y_pred = predictions['RBF 20% Onehot']

# Cross-validation runs on the training splits (X_train / X_trainQ).
# The one-hot runs are timed first, so their labels come first; the previous
# hard-coded label list had the one-hot and quantitative labels swapped.
cvRuns = [
    ('Poly Onehot', polyParams, X_train, y_train),
    # NOTE(review): C=5 here but C=1 for the one-hot RBF hold-out run above;
    # confirm which value is intended.
    ('RBF Onehot', {'kernel': 'rbf', 'C': 5, 'gamma': 'auto'}, X_train, y_train),
    ('Poly Quantitative', polyParams, X_trainQ, y_trainQ),
    ('RBF Quantitative', rbfParams, X_trainQ, y_trainQ),
]
for label, svcParams, X_cv, y_cv in cvRuns:
    # The timed region spans model construction and both k-fold passes.
    start = time()
    svmBag = SVC(**svcParams)
    baggingEnsemble = BaggingClassifier(svmBag, n_estimators=5, random_state=42)
    f1Score = cross_val_score(baggingEnsemble, X_cv, y_cv, cv=k, scoring="f1").mean() * 100
    accuracy = cross_val_score(baggingEnsemble, X_cv, y_cv, cv=k, scoring="accuracy").mean() * 100
    end = time()
    timeGraph.append(end - start)
    estimator.append(label)

print(timeGraph)
plt.figure(figsize=(5, 5))
plt.title("Final Test - Time Taken")
plt.xlabel('Dataset and Estimator')
plt.ylabel('Time Taken')
ax = sns.barplot(x=estimator, y=timeGraph)
plt.xticks(rotation=90)
plt.show()