# **Team Loan Canoe:** *Advanced Topics I Deliverable Workshop*

### **UCI Dataset for Deliverable:** *Glass Identification Data Set*

---

Vina conducted a comparison test of her rule-based system, BEAGLE, the nearest-neighbor algorithm, and discriminant analysis. BEAGLE is a product available through VRS Consulting, Inc.; 4676 Admiralty Way, Suite 206; Marina Del Ray, CA 90292 (213) 827-7890 and FAX: -3189. In determining whether the glass was a type of "float" glass or not, the following results were obtained (# incorrect answers):

Type of Sample -- Beagle -- NN -- DA
Windows that were float processed (87) -- 10 -- 12 -- 21
Windows that were not: (76) -- 19 -- 16 -- 22

The study of classification of types of glass was motivated by criminological investigation. At the scene of the crime, the glass left can be used as evidence...if it is correctly identified!

--  

***Attribute Information:***

1. Id number: 1 to 214
2. RI: refractive index
3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10)
4. Mg: Magnesium
5. Al: Aluminum
6. Si: Silicon
7. K: Potassium
8. Ca: Calcium
9. Ba: Barium
10. Fe: Iron
11. Type of glass: (class attribute)
-- 1 building_windows_float_processed
-- 2 building_windows_non_float_processed
-- 3 vehicle_windows_float_processed
-- 4 vehicle_windows_non_float_processed (none in this database)
-- 5 containers
-- 6 tableware
-- 7 headlamps

---

In [42]:
# Load libraries
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import numpy as np
import os.path

%matplotlib inline

import os
import json
import time
import pickle
import requests


import numpy as np
import pandas as pd
import pandas
import matplotlib.pyplot as plt

from sklearn.datasets.base import Bunch

In [43]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data"

def fetch_data(fname='glass.csv'):
    """
    Helper method to retrieve the ML Repository dataset.

    Downloads the file at the module-level ``URL`` and writes the raw
    response bytes to ``fname`` (resolved against the current working
    directory), overwriting any existing file.

    Parameters
    ----------
    fname : str
        File name (or path) to write the downloaded data to.

    Returns
    -------
    str
        Absolute path of the file that was written.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    # A timeout keeps a stalled server from hanging the notebook forever.
    response = requests.get(URL, timeout=30)
    # Fail loudly rather than saving an HTML error page as "glass.csv".
    response.raise_for_status()

    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)

    return outpath

# Fetch the data (this re-downloads and overwrites glass.csv on every run)
DATA = fetch_data()

In [44]:
# Column names for the raw file, in file order (the file carries no header row)
FEATURES = [
    'id_number',
    'ri', 'na', 'mg', 'al', 'si', 'k', 'ca', 'ba', 'fe',
    'class',
]

# Numeric class code (1-7) -> descriptive glass type
CLASS_LABEL_MAP = dict(enumerate(
    [
        'building_windows_float_processed',
        'building_windows_non_float_processed',
        'vehicle_windows_float_processed',
        'vehicle_windows_non_float_processed',
        'containers',
        'tableware',
        'headlamps',
    ],
    start=1,
))

# Load the downloaded file into a DataFrame using our own column names
df = pd.read_csv(DATA, names=FEATURES, header=None)

# Swap the numeric class codes for their text labels
df['class'] = df['class'].map(CLASS_LABEL_MAP)

# Summary statistics for every column, including the text class column
print(df.describe(include='all'))

         id_number          ri          na          mg          al  \
count   214.000000  214.000000  214.000000  214.000000  214.000000   
unique         NaN         NaN         NaN         NaN         NaN   
top            NaN         NaN         NaN         NaN         NaN   
freq           NaN         NaN         NaN         NaN         NaN   
mean    107.500000    1.518365   13.407850    2.684533    1.444907   
std      61.920648    0.003037    0.816604    1.442408    0.499270   
min       1.000000    1.511150   10.730000    0.000000    0.290000   
25%      54.250000    1.516523   12.907500    2.115000    1.190000   
50%     107.500000    1.517680   13.300000    3.480000    1.360000   
75%     160.750000    1.519157   13.825000    3.600000    1.630000   
max     214.000000    1.533930   17.380000    4.490000    3.500000   

                si           k          ca          ba          fe  \
count   214.000000  214.000000  214.000000  214.000000  214.000000   
unique         NaN 

In [45]:
# Report how many rows and columns the DataFrame holds
print("{} instances with {} features\n".format(df.shape[0], df.shape[1]))

# Tabulate how many rows fall into each class
print(pd.crosstab(df['class'], "count"))

214 instances with 11 features

col_0                                 count
class                                      
building_windows_float_processed         70
building_windows_non_float_processed     76
containers                               13
headlamps                                29
tableware                                 9
vehicle_windows_float_processed          17


--

In [46]:
# List the non-hidden entries of the working directory

DATA_DIR = os.getcwd()

# Entries whose name starts with a dot are treated as hidden and skipped
visible_entries = (
    entry for entry in os.listdir(DATA_DIR) if not entry.startswith(".")
)
for entry in visible_entries:
    print("- {}".format(entry))

- glass.csv
- data.pkl
- 02_uci_glass__gpc_svm__bbz.ipynb


--

In [47]:
# Function for loading dataset 
def open_dataset():
    """
    Download the UCI glass data and return it as a pandas DataFrame.

    The file has no header row, so descriptive column names are supplied
    here. The data is fetched over HTTPS, matching the endpoint used
    elsewhere in this file, so the download cannot be tampered with in
    transit.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns listed in ``names`` below.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data"
    names = [
        'id',
        'refractive-index',
        'Sodium',
        'Magnesium',
        'Aluminum',
        'Silicon',
        'Potassium',
        'Calcium',
        'Barium',
        'Iron',
        'class'
    ]
    # Passing `names` makes pandas treat the first line as data, not a header
    return pandas.read_csv(url, names=names)

In [48]:
# Separating the elements and type of glass

def split_data(dataset):
    """
    Split the glass table into a feature matrix and a label vector.

    Column 0 is left out; columns 1 through 9 are returned as the
    features and column 10 as the labels. Both come from the same
    ``dataset.values`` array, so they share its dtype.
    """
    values = dataset.values
    features, labels = values[:, 1:10], values[:, 10]
    return features, labels

In [49]:
# Preparing a training data set from our data

def prepare_training(models):
    """
    Load the glass data, split it 50/50, and train the given models.

    Parameters
    ----------
    models : list of (str, estimator) tuples
        Named scikit-learn style estimators to train.

    Returns
    -------
    list of (str, estimator) tuples
        Whatever ``train_models`` returns for the given models. Returning
        it matters: callers write ``models = prepare_training(models)``,
        which would otherwise leave them holding ``None``.
    """
    dataset = open_dataset()
    validation_size = 0.50
    seed = 7  # fixed seed so the train/test split is reproducible
    X, Y = split_data(dataset)
    # splits the dataset into a training set and a test set
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X,
        Y,
        test_size=validation_size,
        random_state=seed
    )
    return train_models(models, X_train, Y_train, X_test, Y_test)

In [50]:
# Using kfold to iterate through some models

def train_models(models, X_train, Y_train, X_test, Y_test):
    """
    Cross-validate, fit, evaluate and pickle each model.

    For every ``(name, model)`` pair:

    1. run 4-fold cross-validation on the training split and print the
       mean fold accuracy,
    2. refit the model on the *whole* training split,
    3. print its accuracy on the held-out test split,
    4. pickle it to ``pickle/<name>_classifier.pickle``.

    Parameters
    ----------
    models : iterable of (str, estimator) tuples
        Estimators exposing ``fit`` and ``predict``. They are fitted in
        place.
    X_train, Y_train : array-like
        Training features and labels (indexable with integer arrays).
    X_test, Y_test : array-like
        Held-out features and labels used for the final accuracy.

    Returns
    -------
    list of (str, estimator) tuples
        The fitted models, in input order.
    """
    # open() does not create missing directories, so make sure it exists.
    os.makedirs('pickle', exist_ok=True)

    classifiers = []
    # iterates through the models
    for name, model in models:
        # chooses the indices for the test and training part of each fold
        kfold = model_selection.KFold(n_splits=4)
        fold_scores = []
        for traincv, testcv in kfold.split(X_train):
            # trains on this fold's training part
            model.fit(X_train[traincv], Y_train[traincv])
            # scores on this fold's held-out part
            fold_scores.append(
                accuracy_score(Y_train[testcv], model.predict(X_train[testcv]))
            )
        print(name, 'cv', sum(fold_scores) / len(fold_scores) * 100)

        # Refit on the full training split: otherwise the model that gets
        # evaluated and pickled is just the last fold's, trained on 3/4 of
        # the available training data.
        model.fit(X_train, Y_train)

        # final test on the original test set (y_true first, then y_pred)
        prediction = model.predict(X_test)
        print(name, accuracy_score(Y_test, prediction) * 100)
        with open(
            'pickle/' + name + '_classifier.pickle',
            'wb'
        ) as ph:
            pickle.dump(model, ph)
        classifiers.append((name, model))
    return classifiers

In [51]:
# Dictionary for Glass Types
#
# Maps a class code to a display name. The keys are strings such as '1.0'
# (not ints) because the lookup sites index this dict with
# str(prediction[0]).
# NOTE(review): presumably predictions are floats because the labels are
# taken from dataset.values in split_data -- confirm.

glass_types = {
    '1.0': 'Building Windows Float Processed',
    '2.0': 'Building Windows Non Float Processed',
    '3.0': 'Vehicle Windows Float Processed',
    '4.0': 'Vehicle Windows Non Float Processed',
    '5.0': 'Containers',
    '6.0': 'Tableware',
    '7.0': 'Headlamps'
}

In [52]:
# Generating a short catalogue of classifiers
#
# These names label the (name, model) pairs built later in the notebook and
# match the names train_models uses when writing
# 'pickle/<name>_classifier.pickle'.

classifiers = [
    'DecisionTreeClassifier',
    'SVM'
]

**Documentation** -- Python's Pickle module:  
*http://ataspinar.com/2017/05/26/classification-with-scikit-learn/*  
*https://stackoverflow.com/questions/48477949/not-able-to-pip-install-pickle-in-python-3-6/48477988*

In [54]:
# Import and test pickle (standard python module) to make sure troubleshooting solved

import pickle

# The integers 1..99, used only as a throwaway payload for the smoke test
intArray = list(range(1, 100))

# `with` closes the file even if pickle.dump raises
with open('data.pkl', 'wb') as output:
    pickle.dump(intArray, output)

--

In [56]:
# Checking for classifiers -- if any pickle is missing, train and pickle them all

models = []

# One pickle file per classifier name, matching what train_models writes
pickle_paths = {
    name: 'pickle/' + name + '_classifier.pickle' for name in classifiers
}

if all(os.path.isfile(path) for path in pickle_paths.values()):
    for name in classifiers:
        # Load each classifier from its OWN file (previously every name
        # loaded the SVM pickle).
        # NOTE: pickle.load can execute arbitrary code; this is only
        # acceptable because these files are produced locally by
        # train_models. Never point this at untrusted files.
        with open(pickle_paths[name], 'rb') as ph:
            models.append((name, pickle.load(ph)))
else:
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('SVM', SVC()))
    # The estimators are fitted in place, so `models` already holds the
    # trained objects afterwards; the return value is deliberately not
    # assigned so `models` can never be replaced by None.
    prepare_training(models)

# inputs new data to test
ri = float(input("Enter Refractive Index: "))
na = float(input("Enter Sodium: "))
mg = float(input("Enter Magnesium: "))
al = float(input("Enter Aluminum: "))
si = float(input("Enter Silicon: "))
k = float(input("Enter Potassium: "))
ca = float(input("Enter Calcium: "))
ba = float(input("Enter Barium: "))
fe = float(input("Enter Iron: "))

# tests new data using the SVM classifier
new_data = np.array([ri, na, mg, al, si, k, ca, ba, fe])
# Select the SVM by name; the list has only two entries, so the old
# positional lookup models[3] raised IndexError.
svm_model = dict(models)['SVM']
prediction = svm_model.predict(new_data.reshape(1, -1))

# outputs the type of glass
# float() normalises the label so both 1 and 1.0 map to the '1.0' key
print('The type of glass is', glass_types[str(float(prediction[0]))])

In [None]:
# Ask the user for the nine measurements of a new sample

# Prompts in the same order as the feature columns used for training
_prompts = (
    "Enter Refractive Index: ",
    "Enter Sodium: ",
    "Enter Magnesium: ",
    "Enter Aluminum: ",
    "Enter Silicon: ",
    "Enter Potassium: ",
    "Enter Calcium: ",
    "Enter Barium: ",
    "Enter Iron: ",
)

# Each answer is read in turn and converted to float
ri, na, mg, al, si, k, ca, ba, fe = [float(input(p)) for p in _prompts]


In [22]:

# Testing new data using the SVM classifier
new_data = np.array([ri, na, mg, al, si, k, ca, ba, fe])
# `models` is a list of (name, model) pairs with only two entries, so the
# old positional lookup models[3] raised IndexError; select the SVM by name.
svm_model = dict(models)['SVM']
# reshape(1, -1) turns the nine values into a single-row 2-D sample
prediction = svm_model.predict(new_data.reshape(1, -1))

# RESULT: Outputting the type of glass!
# float() normalises the label so both 1 and 1.0 map to the '1.0' key
print('The type of glass is', glass_types[str(float(prediction[0]))])

NameError: name 'models' is not defined