In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


In [0]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [19]:
# Creating the dictionary to determine the type of glass
type_of_glass = {
                 1: "building_windows_float_processed",
                 2: "building_windows_non_float_processed",
                 3: "vehicle_windows_float_processed",
                 4: "vehicle_windows_non_float_processed",
                 5: "containers",
                 6: "tableware",
                 7: "headlamps"
                }

# reading the glass dataset
dataset = pd.read_csv('glass.csv')
dataset.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [22]:
# determining the correlation between the variables
dataset.corr()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
RI,1.0,-0.191885,-0.122274,-0.407326,-0.542052,-0.289833,0.810403,-0.000386,0.14301,-0.164237
Na,-0.191885,1.0,-0.273732,0.156794,-0.069809,-0.266087,-0.275442,0.326603,-0.241346,0.502898
Mg,-0.122274,-0.273732,1.0,-0.481799,-0.165927,0.005396,-0.44375,-0.492262,0.08306,-0.744993
Al,-0.407326,0.156794,-0.481799,1.0,-0.005524,0.325958,-0.259592,0.479404,-0.074402,0.598829
Si,-0.542052,-0.069809,-0.165927,-0.005524,1.0,-0.193331,-0.208732,-0.102151,-0.094201,0.151565
K,-0.289833,-0.266087,0.005396,0.325958,-0.193331,1.0,-0.317836,-0.042618,-0.007719,-0.010054
Ca,0.810403,-0.275442,-0.44375,-0.259592,-0.208732,-0.317836,1.0,-0.112841,0.124968,0.000952
Ba,-0.000386,0.326603,-0.492262,0.479404,-0.102151,-0.042618,-0.112841,1.0,-0.058692,0.575161
Fe,0.14301,-0.241346,0.08306,-0.074402,-0.094201,-0.007719,0.124968,-0.058692,1.0,-0.188278
Type,-0.164237,0.502898,-0.744993,0.598829,0.151565,-0.010054,0.000952,0.575161,-0.188278,1.0


In [36]:
X = dataset.iloc[:,0:9].values
y = dataset.iloc[:,9:10].values.ravel()
X

array([[ 1.52101, 13.64   ,  4.49   , ...,  8.75   ,  0.     ,  0.     ],
       [ 1.51761, 13.89   ,  3.6    , ...,  7.83   ,  0.     ,  0.     ],
       [ 1.51618, 13.53   ,  3.55   , ...,  7.78   ,  0.     ,  0.     ],
       ...,
       [ 1.52065, 14.36   ,  0.     , ...,  8.44   ,  1.64   ,  0.     ],
       [ 1.51651, 14.38   ,  0.     , ...,  8.48   ,  1.57   ,  0.     ],
       [ 1.51711, 14.23   ,  0.     , ...,  8.62   ,  1.67   ,  0.     ]])

In [37]:
# standardizing the values of X array with standard scaler
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)
X

array([[ 0.87286765,  0.28495326,  1.25463857, ..., -0.14576634,
        -0.35287683, -0.5864509 ],
       [-0.24933347,  0.59181718,  0.63616803, ..., -0.79373376,
        -0.35287683, -0.5864509 ],
       [-0.72131806,  0.14993314,  0.60142249, ..., -0.82894938,
        -0.35287683, -0.5864509 ],
       ...,
       [ 0.75404635,  1.16872135, -1.86551055, ..., -0.36410319,
         2.95320036, -0.5864509 ],
       [-0.61239854,  1.19327046, -1.86551055, ..., -0.33593069,
         2.81208731, -0.5864509 ],
       [-0.41436305,  1.00915211, -1.86551055, ..., -0.23732695,
         3.01367739, -0.5864509 ]])

In [0]:
# Splitting the data into training and testing set using sklearn train test split method
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size =0.25, random_state = 0)

In [0]:
def fit_model(model, X_train = X_train, y_train = y_train, X_test = X_test) :
  """
   This method takes the input parameters for model, training and testing data 
   and fit the model with training data and return the prediction values
  """
  model.fit(X_train,y_train)
  pred = model.predict(X_test)
  return pred

In [0]:
def calculate_cross_val_score(estimator, X_train = X_train, y_train = y_train):
  """
  This method takes model as estimater along with training dat nd calculates and
  return the cross validation scores 
  """
  scores = cross_val_score(estimator, X_train,y_train,cv=6)
  print(scores.mean())
  return scores

In [61]:
# Training the data using Logistic regression
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression(random_state=0)
log_pred = fit_model(log_clf)
log_pred

array([7, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 7, 1, 1, 1, 2, 5, 1,
       7, 7, 1, 1, 7, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 7, 7, 6, 2, 1,
       1, 2, 1, 2, 2, 2, 1, 6, 7, 1])

In [62]:
# calculating cross validation score for logistic regression 
calculate_cross_val_score(log_clf)

0.6198988931460195


array([0.62068966, 0.5       , 0.74074074, 0.62962963, 0.52      ,
       0.70833333])

In [63]:
# Training hte data using Support vector machine classifier
from sklearn.svm import SVC
svc_clf = SVC()
svc_pred = fit_model(svc_clf)
svc_pred

array([7, 1, 2, 6, 5, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 7, 1, 2, 1, 2, 5, 1,
       7, 7, 1, 1, 7, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 7, 2, 2, 2, 1,
       2, 2, 1, 2, 1, 2, 1, 2, 7, 1])

In [64]:
calculate_cross_val_score(svc_clf)

0.6970280666545033


array([0.68965517, 0.60714286, 0.7037037 , 0.66666667, 0.64      ,
       0.875     ])

In [65]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_pred = fit_model(dt_clf)
dt_pred

array([7, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 2, 2, 2, 7, 3, 2, 1, 2, 5, 1,
       7, 7, 1, 1, 7, 1, 2, 2, 1, 1, 2, 3, 2, 1, 1, 3, 1, 7, 7, 6, 2, 1,
       2, 2, 1, 2, 1, 2, 1, 6, 7, 3])

In [66]:
calculate_cross_val_score(dt_clf)

0.6797097549109044


array([0.55172414, 0.67857143, 0.66666667, 0.62962963, 0.76      ,
       0.79166667])

In [79]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=40)
rf_pred = fit_model(rf_clf)
rf_pred

array([7, 1, 2, 6, 5, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 7, 3, 2, 3, 2, 5, 1,
       7, 7, 1, 1, 7, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 7, 7, 6, 2, 1,
       2, 2, 1, 2, 1, 2, 1, 6, 7, 1])

In [80]:
calculate_cross_val_score(rf_clf)

0.7332048592106064


array([0.75862069, 0.67857143, 0.7037037 , 0.66666667, 0.8       ,
       0.79166667])

In [76]:
parameters = [{'n_estimators':[30,32,34,36,38,40,42,44,46,48,50]}]
grid_search = GridSearchCV(estimator=rf_clf,param_grid=parameters,scoring='accuracy',cv=6,n_jobs=-1)
grid_search = grid_search.fit(X_train,y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print(best_accuracy)
print(best_parameters)

0.775
{'n_estimators': 34}




# We can see that random forest is the best classifier with 0.775 accuracy the best parameters are the decision tree with 34 as the estimator value