## Importing data and libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
from numpy import arange
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn import svm
from sklearn import metrics
from sklearn import tree
from sklearn import ensemble
from xgboost import XGBClassifier
from sklearn import neighbors
from sklearn import linear_model
from sklearn.decomposition import PCA 
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import math

In [None]:
# Let pandas print up to 214 rows and 214 columns before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 214)

In [None]:
def import_data():
    """Read 'Glass.csv' from the working directory into a DataFrame.

    The frame is returned exactly as parsed by ``pd.read_csv``; no column is
    promoted to the index, so 'ID' (if present) stays a regular column.
    """
    return pd.read_csv('Glass.csv')

In [None]:
# Load the data set; the bare name on the last line makes the notebook display the frame.
df = import_data()
df

## Distribution of instances among classes

In [None]:
def show_counts(data):
    """Draw a bar chart with the number of rows per value of the 'Class' column.

    Parameters
    ----------
    data : DataFrame containing a 'Class' column.
    """
    per_class = data['Class'].value_counts()
    plt.figure(figsize=(10,5))
    sns.barplot(x=per_class.index, y=per_class.values, alpha=0.8)
    plt.title('Glass classes counts')
    plt.xlabel('Class', fontsize=12)
    plt.ylabel('Number of instances', fontsize=12)
    plt.show()

show_counts(df)
# We can see that the first two classes contain most (over 67%) of the observations.

In [None]:
# All column labels, and the subset used as features (everything except the
# identifier and the target).
col_names = df.columns
col_names_glass = [column for column in df.columns if column not in ('ID', 'Class')]

In [None]:
def box_plots(data):
    """Draw one box plot per feature in ``col_names_glass``, grouped by 'Class'.

    The panels are laid out on a fixed 3x3 grid, in the order of
    ``col_names_glass``.
    """
    sns.set(style="whitegrid", font_scale=1.2)
    plt.subplots(figsize = (20,15))
    for panel, feature in enumerate(col_names_glass, start=1):
        plt.subplot(3, 3, panel)
        sns.boxplot(x='Class', y=feature, data=data)
box_plots(df)

In [None]:
# Summary statistics for every column of df except 'ID' and 'Class'.
df.loc[:, df.columns.difference(['ID', 'Class'])].describe()

# Scaling data

In [None]:
def scale_data(data):
    """Return a copy of ``data`` with the feature columns min-max scaled.

    The columns listed in ``col_names_glass`` are rescaled with
    ``MinMaxScaler`` (default settings, i.e. each column mapped to [0, 1]);
    all other columns are carried over unchanged.

    Parameters
    ----------
    data : DataFrame containing every column named in ``col_names_glass``.

    Returns
    -------
    DataFrame with the same index and columns as ``data``.  The input frame
    is left untouched.
    """
    # Work on the argument (not the global ``df``) and on a copy, so calling
    # this function never rewrites the caller's raw data.
    scaled = data.copy()
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(data[col_names_glass].values)
    # Re-wrap with the original index so the assignment aligns row by row.
    scaled[col_names_glass] = pd.DataFrame(
        scaled_values, columns=col_names_glass, index=data.index
    )
    return scaled

## Removing outliers

In [None]:
def remove_outliers(data, threshold=3):
    """Drop every row that has an outlier in at least one column.

    A value counts as an outlier when the absolute z-score within its column
    is ``threshold`` or more.  All columns of ``data`` take part in the check,
    so they must all be numeric.

    Parameters
    ----------
    data : DataFrame of numeric columns.
    threshold : z-score limit; rows are kept only if every column is strictly
        below it (default 3).

    Returns
    -------
    The rows of ``data`` that pass the check, with their original index.
    """
    # Compute the scores from the argument itself (not the global ``df``) so
    # the mask always has one entry per row of ``data``.
    z_scores = np.abs(stats.zscore(np.asarray(data, dtype=float), axis=0))
    keep = (z_scores < threshold).all(axis=1)
    return data[keep]

# For each column, first compute the Z-score of every value relative to the column mean and standard deviation.
# Then take the absolute value of the Z-score, because the direction does not matter, only whether it is below the threshold.
# all(axis=1) ensures that, for each row, every column satisfies the constraint.
# Finally, the result of this condition is used to index the dataframe.

## Final Wrangle

In [None]:
def wrangle(data):
    """Run the full preparation pipeline: scale the features, then drop outlier rows.

    Returns whatever ``remove_outliers`` produces for the output of
    ``scale_data(data)``.
    """
    return remove_outliers(scale_data(data))

In [None]:
# Scaled data with outlier rows removed; the cells below work on new_df.
new_df = wrangle(df)

In [None]:
def show_outliers(old_data, new_data):
    """Return the rows of ``old_data`` whose 'ID' does not occur in ``new_data``.

    Parameters
    ----------
    old_data : DataFrame with an 'ID' column (the data before filtering).
    new_data : DataFrame with an 'ID' column (the data after filtering).

    Returns
    -------
    The rows that were filtered out, in their original order.
    """
    # Compare ID *values*: ``x in series`` would test the index labels, and a
    # fixed range would silently miss IDs outside it.
    was_removed = ~old_data['ID'].isin(new_data['ID'])
    return old_data[was_removed]

# Intended to list the rows of df that are missing from new_df.
show_outliers(df, new_df)

In [None]:
# Summary statistics for every column of new_df except 'ID' and 'Class'.
new_df.loc[:, new_df.columns.difference(['ID', 'Class'])].describe()

## Normality of distribution

In [None]:
#def kdeplots(data):
#    for col in col_names_glass:
#        sns.kdeplot(data[col], shade=True, color="r")
#        plt.show()

In [None]:
def kdeplots(data):
    """Draw a shaded red density estimate for each of the nine features on a 3x3 grid.

    Panels are filled row by row in the order Ri, Na, Mg, Al, Si, K, Ca, Ba, Fe.
    """
    plt.subplots(figsize = (20,15))
    features = ['Ri', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']
    for panel, feature in enumerate(features, start=1):
        plt.subplot(3, 3, panel)
        sns.kdeplot(data[feature], shade=True, color="r")

kdeplots(new_df)

In [None]:
# Annotated correlation matrix of new_df over every column except 'ID'
# ('Class' is included).
plt.subplots(figsize=(15,10))
heatmap_columns = df.columns.difference(['ID'])
correlations = new_df.loc[:, heatmap_columns].corr()
sns.heatmap(correlations, annot=True, cmap='YlGn')

In [None]:
# Same per-class box plots as before, now on the wrangled data.
box_plots(new_df)

In [None]:
# Histograms of every feature after scaling

def histograms(data):
    """Plot a 50-bin histogram for each column listed in ``col_names_glass``."""
    feature_frame = data[col_names_glass]
    feature_frame.hist(bins=50, figsize=(25, 25), xlabelsize=1, ylabelsize=1)
    plt.show()

#histograms(df)
histograms(new_df)

In [None]:
def density_plots(data):
    """Draw per-class density estimates of every feature on a 3x3 grid.

    Panels 1-6 (Ri, Na, Mg, Al, Si, Ca) request a curve for each class id
    1..7 but label only classes 1, 2, 3, 5, 6 and 7 in the legend (class 4 has
    no legend entry).  Panels 7-9 (K, Ba, Fe) draw and label a reduced set of
    classes.
    """
    def draw_panel(position, feature, plotted_classes, legend_classes):
        # One subplot: a shaded density curve per class, then title and legend.
        plt.subplot(3, 3, position)
        for class_id in plotted_classes:
            sns.kdeplot(data[feature][data.Class == class_id], shade = True)
        plt.title(f'{feature} distribution among classes')
        plt.legend([f'Class {class_id}' for class_id in legend_classes])

    sns.set(style="whitegrid", font_scale=1.3)
    plt.subplots(figsize = (35,30))

    for position, feature in enumerate(['Ri', 'Na', 'Mg', 'Al', 'Si', 'Ca'], start=1):
        draw_panel(position, feature, range(1, 8), [1, 2, 3, 5, 6, 7])

    draw_panel(7, 'K', [1, 2, 3, 5, 7], [1, 2, 3, 5, 7])
    draw_panel(8, 'Ba', [1, 2, 3, 7], [1, 2, 3, 7])
    draw_panel(9, 'Fe', [1, 2, 3, 5, 7], [1, 2, 3, 5, 7])

density_plots(new_df)

## Splitting data

In [None]:
# Hold out 25% of the wrangled rows for testing; the fixed seed keeps the
# split reproducible.
seed = 7
test_size = 0.25
X = new_df[col_names_glass]
y = new_df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed
)

In [None]:
# Fit a PCA on the training features and plot the explained-variance ratio of
# each component (bars) together with the running total (step line).
pca = PCA(random_state = seed)
pca.fit(X_train)
v = pca.explained_variance_ratio_
c = np.cumsum(v)
component_ids = range(1, len(c) + 1)  # 1-based component numbers for the x axis

plt.figure(figsize=(10,10))
plt.ylabel('Variance ratio')
plt.xlabel('Principal components')
plt.xticks(np.arange(1,len(v)+1,1))
plt.bar(component_ids, v, align='center', label='individual variance', alpha=0.5)
plt.step(component_ids, c, where='mid', label='cumulative variance', color='orange')
plt.legend(loc='center')
plt.show()

## Algorithms

In [None]:
# SVM
# Import the class directly: the name `svm` is rebound to the fitted model
# below, so `svm.SVC()` would fail when this cell is run a second time.
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); it averages recall over
# the true classes, so swapping the arguments changes the result.
acc1 = metrics.balanced_accuracy_score(y_test, y_pred)
# NOTE(review): MSE/RMSE treat the class ids as numbers — confirm that is intended.
mse1 = metrics.mean_squared_error(y_test, y_pred)
rmse1 = np.sqrt(mse1)

# Tick labels shared by all confusion-matrix heatmaps in this notebook.
x_axis_labels = [1,2,3,5,6,7]
y_axis_labels = [1,2,3,5,6,7]

# Rows are predictions and columns are true labels, matching the axis titles.
# `labels` pins the matrix to the tick labels even if a class is absent from
# this particular split.
cm1 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm1, annot=True, cmap="Greens", cbar=False, xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
# Decision Tree
# Seeded like the other models so repeated runs give the same tree.
trees = tree.DecisionTreeClassifier(random_state=seed)
trees.fit(X_train, y_train)
y_pred = trees.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); the order matters.
acc2 = metrics.balanced_accuracy_score(y_test, y_pred)
mse2 = metrics.mean_squared_error(y_test, y_pred)
rmse2 = np.sqrt(mse2)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm2 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm2, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
# Random Forest
forest = ensemble.RandomForestClassifier(max_depth = 3, min_samples_split=2, n_estimators = 50, random_state = seed)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); the order matters.
acc3 = metrics.balanced_accuracy_score(y_test, y_pred)
mse3 = metrics.mean_squared_error(y_test, y_pred)
rmse3 = np.sqrt(mse3)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm3 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm3, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
# XGBoost
# NOTE(review): recent xgboost releases reportedly require class labels
# encoded as 0..n_classes-1; y_train holds the raw class ids — confirm against
# the installed version.
xgbMod = XGBClassifier(max_depth = 3, n_estimators = 100, learning_rate = 0.05, random_state = seed)
xgbMod.fit(X_train,y_train)
y_pred = xgbMod.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); the order matters.
acc4 = metrics.balanced_accuracy_score(y_test, y_pred)
mse4 = metrics.mean_squared_error(y_test, y_pred)
rmse4 = np.sqrt(mse4)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm4 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm4, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
# KNN
knn = neighbors.KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); the order matters.
acc5 = metrics.balanced_accuracy_score(y_test, y_pred)
mse5 = metrics.mean_squared_error(y_test, y_pred)
rmse5 = np.sqrt(mse5)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm5 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm5, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
# Logistic Regression
# Class-weighted multinomial model, fitted once.
lgstc = linear_model.LogisticRegression(random_state=seed, solver="lbfgs", multi_class="multinomial",class_weight='balanced')
lgstc.fit(X_train, y_train)
y_pred = lgstc.predict(X_test)
acc6 = metrics.balanced_accuracy_score(y_test,y_pred)
mse6 = metrics.mean_squared_error(y_test, y_pred)
rmse6 = np.sqrt(mse6)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm6 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm6, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

## Results

In [None]:
# Collect the baseline scores into one table, best first.  The 'Accuracy'
# column holds the balanced accuracy computed in the model cells.
results = pd.DataFrame(columns=['Algorithm','Accuracy','MSE'])
names = ['SVM','Decision Tree','Random Forest','XGBoost','KNN','Logistic Regression']
baseline_scores = [(acc1, mse1), (acc2, mse2), (acc3, mse3),
                   (acc4, mse4), (acc5, mse5), (acc6, mse6)]
for x, (algorithm, (accuracy, error)) in enumerate(zip(names, baseline_scores)):
    results.loc[x] = [algorithm, accuracy, error]
results.sort_values(by='Accuracy', ascending=False, inplace=True)
results.set_index('Algorithm', inplace=True)

In [None]:
# Display the baseline comparison table.
results

## Ensembling&Tuning

# KNN

In [None]:
# 4-fold cross-validated search over the neighbourhood size k = 1..29.
parameters_knn = {'n_neighbors': np.arange(1, 30)}
knn2 = GridSearchCV(estimator=knn, param_grid=parameters_knn, cv=4)
knn2.fit(X_train, y_train)
print(knn2.best_params_)
knn_best = knn2.best_estimator_

In [None]:
# Refit KNN with the neighbourhood size selected by the grid search (knn2),
# so this cell stays in sync with the search instead of a hard-coded value.
knn = neighbors.KNeighborsClassifier(**knn2.best_params_)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc23 = metrics.balanced_accuracy_score(y_test,y_pred)
mse23 = metrics.mean_squared_error(y_test, y_pred)
rmse23 = np.sqrt(mse23)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm23 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm23, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

# Random Forest

In [None]:
# Grid search for the random forest.
# min_samples_split starts at 2: an integer value of 1 is not a valid setting
# for RandomForestClassifier, so those candidates could never be fitted.
parameters = {'n_estimators' : [50, 100, 150, 200, 250, 275, 300, 350, 400],
              'min_samples_split': np.arange(2, 5),
              'max_depth': np.arange(1, 10)}

# Score by plain accuracy and refit the best candidate on the full training
# set.  (The former refit="accuracy_score" named no configured scorer and was
# only treated as a truthy flag.)
forest_t = GridSearchCV(forest, parameters, scoring='accuracy', n_jobs=-1, verbose=2, refit=True)
forest_t.fit(X_train, y_train)

best_pars = forest_t.best_params_
best_pars

In [None]:
# Unpack the winning random-forest settings: a = n_estimators, b = max_depth,
# c = min_samples_split.
a, b, c = (best_pars[key] for key in ('n_estimators', 'max_depth', 'min_samples_split'))

In [None]:
# Random forest refit with the grid-search winners (a, b, c).
forest = ensemble.RandomForestClassifier(max_depth = b, min_samples_split=c, n_estimators = a, random_state = seed)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); the order matters.
acc24 = metrics.balanced_accuracy_score(y_test, y_pred)
mse24 = metrics.mean_squared_error(y_test, y_pred)
rmse24 = np.sqrt(mse24)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm24 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm24, annot=True, cmap="Greens", cbar=False, xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

# SVM

In [None]:
# Grid search over C, gamma and kernel for the support vector classifier.
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid'],
}
svc = SVC()
grid = GridSearchCV(estimator=svc, param_grid=param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)
print(grid.best_estimator_)

In [None]:
from sklearn.svm import SVC
# Refit the SVM with the settings selected by the grid search (grid), so this
# cell stays in sync with the search instead of hard-coded values.
svm = SVC(**grid.best_params_)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); the order matters.
acc25 = metrics.balanced_accuracy_score(y_test, y_pred)
mse25 = metrics.mean_squared_error(y_test, y_pred)
rmse25 = np.sqrt(mse25)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm25 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm25, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

# XGBoost

In [None]:
# Grid search for XGBoost.  The estimator is seeded like the refit cell so the
# search and the final model are comparable.
xgb_model = XGBClassifier(random_state=seed)

parameters = {'n_estimators' : [120, 150, 180],
              'learning_rate': [0.05, 0.01, 0.1],
              'max_depth': [5,6,7]}

# Score by plain accuracy and refit the best candidate.  (The former
# refit="accuracy_score" named no configured scorer and was only treated as a
# truthy flag.)
clf = GridSearchCV(xgb_model,
                   parameters,
                   scoring='accuracy',
                   n_jobs=-1,
                   verbose=2,
                   refit=True)
clf.fit(X_train, y_train)

# Keep the winning settings for the refit cell.
best_pars = clf.best_params_
l_rate = best_pars['learning_rate']
m_depth = best_pars['max_depth']
n_estim = best_pars['n_estimators']

In [None]:
# XGBoost refit with the grid-search winners (m_depth, n_estim, l_rate).
xgbMod = XGBClassifier(max_depth = m_depth, n_estimators = n_estim, learning_rate = l_rate, random_state = seed)
xgbMod.fit(X_train,y_train)
y_pred = xgbMod.predict(X_test)
# balanced_accuracy_score expects (y_true, y_pred); the order matters.
acc26 = metrics.balanced_accuracy_score(y_test, y_pred)
mse26 = metrics.mean_squared_error(y_test, y_pred)
rmse26 = np.sqrt(mse26)

# Rows are predictions, columns are true labels; `labels` keeps the matrix
# aligned with the tick labels even if a class is missing from the split.
cm26 = confusion_matrix(y_pred, y_test, labels=x_axis_labels)
sns.heatmap(cm26, annot=True, cmap="Greens", cbar=False,xticklabels=x_axis_labels, yticklabels=y_axis_labels)
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
# Collect the scores of the tuned models into one table, best first.  The
# 'Accuracy' column holds the balanced accuracy computed in the model cells.
results2 = pd.DataFrame(columns=['Algorithm','Accuracy','MSE'])
names = ['KNN','Random Forest','SVM','XGBoost']
tuned_scores = [(acc23, mse23), (acc24, mse24), (acc25, mse25), (acc26, mse26)]
for x, (algorithm, (accuracy, error)) in enumerate(zip(names, tuned_scores)):
    results2.loc[x] = [algorithm, accuracy, error]
results2.sort_values(by='Accuracy', ascending=False, inplace=True)
results2.set_index('Algorithm', inplace=True)

In [None]:
# Baseline scores, shown again for comparison with the tuned models.
results

In [None]:
# Scores of the tuned models.
results2