## CA2

sba23031

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical
import random

from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import GridSearchCV


2023-12-21 19:56:21.368726: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Load the glass dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("glass_data.csv")

In [None]:
# (rows, columns) of the loaded frame
df.shape

Our dataset has 214 rows and 11 columns.

In [None]:
# Preview the first 20 rows
df.head(20)

## Exploratory Data Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column
df.describe()

After reading the data and checking it loaded successfully the first thing to do is to call df.describe()

This function from the pandas library will give a comprehensive summary of the basic statistics of each column.

The values for ID don't really matter as it is just an identification column.

The "type" column is also easy to understand: it is the class label, ranging from 1 to 7 with type 4 missing.

The rest of the columns give more valuable information. The first thing that sticks out is that the columns "ba" and "fe" have a very low mean and a lot of 0.0 values. This could affect the data preparation and how the model is built. Those fields may not contribute much to the classification.

I don't have domain knowledge so maybe this is still valuable information.

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='plasma', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


A correlation matrix gives an idea of how strongly each feature is related to the others.

From this there doesn't seem to be any great correlation to the glass type.

In [None]:
# Measured features to plot. The 'id' column (still present at this point)
# and the 'type' target are deliberately left out.
selected_columns = ['ri', 'na', 'mg', 'al', 'si', 'k', 'ca', 'ba', 'fe']

# One box plot per feature, grouped by glass type.
for feature in selected_columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x='type', y=feature, data=df, palette='viridis')
    plt.title(f"Box Plot of {feature} by Glass Type")
    plt.show()

The box plots above show the distribution of each chemical with regard to the glass type.

Again it shows some useful information regarding "ba" and "fe". Iron seems to be present mainly in glass types 1, 2, and 3, while barium is mainly in type 7 but has some outliers in types 1 and 2.

In [None]:
# Class balance of the target: number of samples per glass type.
plt.figure(figsize=(12, 6))
sns.countplot(x='type', data=df)
plt.title("Number of Samples per Glass Type")
# Explicit show() so the cell renders the figure instead of the Axes repr.
plt.show()

The next thing is to get an idea of the distribution of the target variable. The absence of glass type 4 made me suspicious that the data could be unbalanced.

From the bar chart above we can see that the target variable is skewed towards types 1 and 2. This imbalance needs to be addressed when we split the data for training the model.

## Data Cleaning

After exploring the data we now need to clean it before applying any model. 

The main thing is to check if there are any null or missing values.

In [None]:
# Count of missing values in each column
df.isna().sum()

Straight away we can see this dataset has no null values. This would not usually be the case, but as this is a small dataset containing mostly numeric values it is not that out of the ordinary.

But since the data is numerical we can plot how many values in each column are 0.00. As mentioned above, we don't have domain knowledge of this area, so the zero values can't be treated as invalid. They need to be taken into consideration for now.

In [None]:
# How many entries in each column are exactly zero.
zero_counts = df.eq(0).sum()

plt.figure(figsize=(10, 6))
zero_counts.plot.bar()
plt.ylabel('Number of Zero Values')
plt.title('Number of Zero Values per Column')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of distinct values in each column
df.nunique()

The ID column is just a unique identifier for each row and is not necessary for this project. The values are all unique and don't provide anything for the prediction.

In [None]:
# Reassign rather than mutate in place, and tolerate the column already being
# gone, so this cell can be re-run without raising a KeyError.
df = df.drop(columns="id", errors="ignore")

In [None]:
# (rows, columns) of the frame at this point in the notebook
df.shape

In [None]:
# Preview the first five rows
df.head()

### Model

In [None]:
# Features: every column except the target.
X = df.drop('type', axis=1)
# Target: glass type shifted down by one so the labels start at 0.
y = df['type'].sub(1)

In [None]:
# Hold out 10% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.1,
)

# Oversample the training portion only; the test set is left as split.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler


# Fit the scaler on the training data only, then apply the same transform to
# the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Labels are `type - 1`; with glass types running up to 7 they reach 6, so
# 7 one-hot columns are needed. This also matches the 7-unit softmax output
# of the model trained on these targets.
y_train_one_hot = to_categorical(y_train, num_classes=7)
y_test_one_hot = to_categorical(y_test, num_classes=7)

# Number of input features fed to the network
print(X_train_scaled.shape[1])

In [None]:
# Set random seed for Python
# Seed Python's built-in RNG
random.seed(42)

# Seed NumPy's global RNG
np.random.seed(42)

# Seed TensorFlow's global RNG
tf.random.set_seed(42)

In [None]:

from keras.callbacks import EarlyStopping
from keras import layers


# Fully connected classifier: an input-facing layer, four hidden layers of
# decreasing width, and a 7-way softmax output.
model = Sequential()
model.add(Dense(9, activation='relu', input_shape=(X_train_scaled.shape[1],)))

# Hidden layers
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))

# No Flatten layer: the Dense stack already yields one flat vector per sample.

# Output layer: one unit per one-hot column of the targets
model.add(Dense(7, activation='softmax'))

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit a stray "None").
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train_scaled, y_train_one_hot, epochs=50, verbose=0, batch_size=20)

# evaluate() returns [loss, accuracy] for the metrics compiled above;
# display the test accuracy as the cell output.
model.evaluate(X_test_scaled, y_test_one_hot)[1]


In [None]:
# Evaluate the model on the test set
# Turn the per-class probabilities into hard class predictions and score
# them against the held-out labels.
class_probabilities = model.predict(X_test_scaled)
y_pred = class_probabilities.argmax(axis=-1)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy}")

In [None]:
# Per-class precision / recall / F1 on the test set
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Use the labels that actually occur (sorted) so the tick labels show the
# encoded class values rather than positional indices 0..n-1.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Rows are the true classes, columns the predicted classes.
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### Hyperparameter tuning

In [None]:
# Search space for the grid search: two hidden-layer widths plus the
# training batch size and epoch count.
param_grid = dict(
    hidden_layer_1_units=[32, 64, 128],
    hidden_layer_2_units=[16, 32, 64],
    batch_size=[10, 20, 32],
    epochs=[50, 100, 150],
)

In [None]:
def build_model(hidden_layer_1_units, hidden_layer_2_units, batch_size, epochs):
    """Build and compile a two-hidden-layer softmax classifier.

    Args:
        hidden_layer_1_units: Width of the first hidden layer.
        hidden_layer_2_units: Width of the second hidden layer.
        batch_size: Accepted but not used while building the network.
        epochs: Accepted but not used while building the network.

    Returns:
        A compiled Sequential model with a 6-unit softmax output, set up for
        integer class labels (sparse categorical cross-entropy).
    """
    # NOTE(review): 6 output units here vs. 7 in the hand-built model above.
    # Presumably intentional because the scikit-learn wrapper re-indexes the
    # six labels that are present to 0..5 -- confirm before changing.
    network = Sequential([
        Dense(hidden_layer_1_units, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dense(hidden_layer_2_units, activation='relu'),
        Dense(6, activation='softmax'),
    ])

    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return network

In [None]:
# Run the grid search so that `grid_result` exists for the cells below.
# This is the slow step of the notebook: param_grid has 3*3*3*3 = 81
# combinations, each fitted on 3 folds.
keras_classifier = KerasClassifier(build_fn=build_model, verbose=0)
grid_search = GridSearchCV(estimator=keras_classifier, param_grid=param_grid, cv=3)
grid_result = grid_search.fit(X_train_scaled, y_train)


In [None]:
# Best hyperparameter combination found by the grid search
best_params = grid_result.best_params_
# `.model` on the best estimator -- presumably the underlying fitted Keras model; verify against the wrapper in use
best_model = grid_result.best_estimator_.model

In [None]:
# Report the winning configuration and the corresponding model object.
print(
    f"Best parameters: {best_params}",
    f"Best model: {best_model}",
    sep="\n",
)
