#### Importing required libraries

In [11]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

#### Reading the datasets (Train, Test and Meta)

In [2]:
## Root folder of the project; the dataset is expected under <base_path>/Data.
## Set the GTSRB_BASE_PATH environment variable to run on another machine without editing this cell;
## the original location is kept as the fallback.
base_path = os.environ.get(
    'GTSRB_BASE_PATH',
    'C:/Users/gmadh/Desktop/GaTech/SUMMER 2021/ISYE 6740 Machine Learning/Project',
)

## Reading the meta dataset. The path is resolved against base_path (like the Train/Test inputs)
## so the cell no longer depends on the current working directory.
meta_data = pd.read_csv(f'{base_path}/Data/Meta.csv')
## Getting the number of classes from the Meta dataset (its row count)
classes = meta_data.shape[0]

## Loading the sample image for each class from Meta dataset.
## Note: these images are not resized as they are used for EDA purposes only.
meta_img = []    ## images converted to RGB
meta_class = []  ## text before the first '.' of each file name, parallel to meta_img
meta_path = f'{base_path}/Data/Meta/'
meta_files = os.listdir(meta_path)
for file in meta_files:
    image = cv2.imread(meta_path + file)
    if image is None:
        ## cv2.imread returns None instead of raising when a file cannot be decoded
        ## (e.g. a non-image file in the folder); skip it rather than crash in cvtColor.
        print(f'Skipping unreadable file: {meta_path + file}')
        continue
    ## OpenCV loads images in BGR channel order; convert to RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    meta_img.append(image)
    meta_class.append(file.split('.')[0])
    
## Reading the Train data set (consists of images in multiple folders, one folder per class id).
## This piece of code iteratively reads the images from every folder.
##
## Three parallel lists are populated. The raw training images are of different resolutions and are
## kept as such in train_data_raw. For the purpose of modeling, all images are resized to a common
## resolution (res x res) and stored in train_data; train_labels holds the class id (folder name).

train_data_raw = []
train_data = []
train_labels = []

res = 30  ## common side length (pixels) used for modeling

for c in range(classes):
    path = f'{base_path}/Data/Train/{c}/'
    files = os.listdir(path)
    for file in files:
        train_image = cv2.imread(path + file)
        if train_image is None:
            ## cv2.imread returns None for files it cannot decode. Skipping before any append
            ## keeps train_data, train_data_raw and train_labels aligned.
            print(f'Skipping unreadable file: {path + file}')
            continue
        ## OpenCV loads images in BGR channel order; convert to RGB
        train_image = cv2.cvtColor(train_image, cv2.COLOR_BGR2RGB)
        ## cv2.resize already returns a new ndarray, so no extra np.array copy is needed
        image_resized = cv2.resize(train_image, (res, res), interpolation=cv2.INTER_AREA)
        train_data.append(image_resized)
        train_data_raw.append(train_image)
        train_labels.append(c)
        
## Reading the Test data images; Test.csv lists each image path (relative to the Data folder)
## together with its class id.
test_csv = pd.read_csv(f'{base_path}/Data/Test.csv')
test_img_path = test_csv['Path']

## Array containing class labels for test data
test_labels = test_csv['ClassId'].values
test_data = []  ## List to hold resized test images
test_data_raw = []  ## List to hold test images in raw format

for f in test_img_path:
    test_image = cv2.imread(f'{base_path}/Data/' + f)
    if test_image is None:
        ## cv2.imread returns None for a missing/undecodable file. The test labels come from the
        ## CSV, so an image cannot be skipped without misaligning test_data and test_labels;
        ## fail with a message that names the offending file instead.
        raise FileNotFoundError(f'Could not read test image: {base_path}/Data/{f}')
    ## OpenCV loads images in BGR channel order; convert to RGB
    test_image = cv2.cvtColor(test_image, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(test_image, (res, res), interpolation=cv2.INTER_AREA)
    test_data.append(image_resized)
    test_data_raw.append(test_image)

#### Grouping the classes into 6 categories from 43 categories

In [3]:
## Mapping from group name to the original class ids that belong to it.
groups = {
    'speed': [0,1,2,3,4,5,7,8],
    'prohibitory': [9,10,15,16],
    'derestriction': [6,32,41,42],
    'mandatory': [33,34,35,36,37,38,39,40],
    'danger': [11,18,19,20,21,22,23,24,25,26,27,28,29,30,31],
    'other': [12,14,13,17],
}
## New integer label assigned to each group.
group_labels = {'speed':1, 'prohibitory':2, 'derestriction':3, 'mandatory':4, 'danger':5, 'other':6}

## Positions (row indices) of each group's members in the train and test label vectors.
## They are computed from the ORIGINAL class ids before any relabeling takes place.
group_indices_train = {
    name: np.flatnonzero(np.isin(np.array(train_labels), members))
    for name, members in groups.items()
}
group_indices_test = {
    name: np.flatnonzero(np.isin(np.array(test_labels), members))
    for name, members in groups.items()
}

## Fresh copies of the label vectors that will carry the group labels
train_lbl_grp = np.array(train_labels)
test_lbl_grp = np.array(test_labels)

## Overwrite the copied labels with the group label, one group at a time
for k, v in group_labels.items():
    train_lbl_grp[group_indices_train[k]] = v
    test_lbl_grp[group_indices_test[k]] = v

### Data Pre-processing

#### Applying below pre-processing techniques

1. Resizing the data to 30 * 30 * 3 dimensions to have a common resolution for modeling (Already done and loaded in train_data and test_data)
2. Scaling the data by dividing by 255.
3. Resampling the data to level the class imbalance.

### Modeling using all 2700 features (30 x 30 pixels x 3 RGB channels at the common resolution) (6 classes) - Down Sampling

In [29]:
## Scaling the train data: stack the resized images, flatten each one into a single feature row
## and divide by 255. Using -1 lets numpy infer the row length, so the cell stays correct if the
## common resolution is changed (the width was previously hard-coded as 30*30*3).
train_arr = np.array(train_data)
train_arr = train_arr.reshape((train_arr.shape[0], -1))
train_data_scaled = train_arr.astype(float)/255

## Scaling the test data the same way
test_arr = np.array(test_data)
test_arr = test_arr.reshape((test_arr.shape[0], -1))
test_data_scaled = test_arr.astype(float)/255

## Labels : train_lbl_grp, test_lbl_grp

#### Down Sampling the data to balance classes

In [15]:
## Randomly under-sample the training data (fixed seed for reproducibility)
rus = RandomUnderSampler(random_state=0)
X_Sampled, y_Sampled = rus.fit_resample(
    train_data_scaled,
    train_lbl_grp,
)

## Hold out 20% of the resampled data for validation; X_test / y_test here are that validation
## split, not the official test images.
X_train, X_test, y_train, y_test = train_test_split(
    X_Sampled,
    y_Sampled,
    test_size=0.2,
    random_state=24,
)

In [8]:
## Cross Validation to Tune / Identify best hyperparameters for LDA model
model = LinearDiscriminantAnalysis()

## LDA accepts at most min(n_classes - 1, n_features) components; scikit-learn raises a ValueError
## during fit for anything larger, so such grid points only produce failed fits (NaN scores) and
## wasted compute. The grid is therefore capped at the largest valid value.
## Note: n_components only affects transform(), not predict(), so it does not change the accuracy.
max_components = min(len(np.unique(y_train)) - 1, X_train.shape[1])

grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']
grid['n_components'] = np.arange(min(5, max_components), max_components + 1)
search = GridSearchCV(model, grid, scoring='accuracy', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)

print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Mean Accuracy: 0.717
Config: {'n_components': 5, 'solver': 'svd'}


In [16]:
## Refit LDA on the training split with the configuration reported by the cross-validation above
model = LinearDiscriminantAnalysis(
    solver='svd',
    n_components=5,
)
model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=5)

#### Validation Data accuracy

In [17]:
## Predict on the held-out validation split (X_test / y_test come from train_test_split)
val_pred = model.predict(X_test)
print(
    "Vaidation accuracy for LDA model is : ",
    accuracy_score(y_test, val_pred),
)

Vaidation accuracy for LDA model is :  0.7894736842105263


#### Test Data Accuracy

In [20]:
## Predict the group label of every test image and report the metrics against test_lbl_grp
pred = model.predict(test_data_scaled)

print(
    "Test accuracy for LDA model is : ",
    accuracy_score(test_lbl_grp, pred),
)
## Rows of the confusion matrix are true labels, columns are predicted labels
print(confusion_matrix(test_lbl_grp, pred))
print(classification_report(test_lbl_grp, pred))

Test accuracy for LDA model is :  0.7650039588281868
[[3388  308  184  109   91   90]
 [ 122 1219   47   41   32   39]
 [  15   15  321    8    0    1]
 [ 127  119   84 1296   64   80]
 [ 217  180  145   97 2048  103]
 [ 140  162  140  120   88 1390]]
              precision    recall  f1-score   support

           1       0.85      0.81      0.83      4170
           2       0.61      0.81      0.70      1500
           3       0.35      0.89      0.50       360
           4       0.78      0.73      0.75      1770
           5       0.88      0.73      0.80      2790
           6       0.82      0.68      0.74      2040

    accuracy                           0.77     12630
   macro avg       0.71      0.78      0.72     12630
weighted avg       0.80      0.77      0.77     12630



### Modeling using all 2700 features (30 x 30 pixels x 3 RGB channels at the common resolution) (6 classes) - No Sampling

In [44]:
## Scaling the train data: stack the resized images, flatten each one into a single feature row
## and divide by 255. Using -1 lets numpy infer the row length, so the cell stays correct if the
## common resolution is changed (the width was previously hard-coded as 30*30*3).
train_arr = np.array(train_data)
train_arr = train_arr.reshape((train_arr.shape[0], -1))
train_data_scaled = train_arr.astype(float)/255

## Scaling the test data the same way
test_arr = np.array(test_data)
test_arr = test_arr.reshape((test_arr.shape[0], -1))
test_data_scaled = test_arr.astype(float)/255

## Labels : train_lbl_grp, test_lbl_grp

In [45]:
## Hold out 20% of the full (un-resampled) training data for validation; X_test / y_test here are
## that validation split, not the official test images.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_scaled,
    train_lbl_grp,
    test_size=0.2,
    random_state=24,
)

In [34]:
## Cross Validation to Tune / Identify best hyperparameters for LDA model
model = LinearDiscriminantAnalysis()

## LDA accepts at most min(n_classes - 1, n_features) components; scikit-learn raises a ValueError
## during fit for anything larger, so such grid points only produce failed fits (NaN scores) and
## wasted compute. The grid is therefore capped at the largest valid value.
## Note: n_components only affects transform(), not predict(), so it does not change the accuracy.
max_components = min(len(np.unique(y_train)) - 1, X_train.shape[1])

grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']
grid['n_components'] = np.arange(min(5, max_components), max_components + 1)
search = GridSearchCV(model, grid, scoring='accuracy', cv=5, n_jobs=-1)
results = search.fit(X_train, y_train)

print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Mean Accuracy: 0.895
Config: {'n_components': 5, 'solver': 'svd'}


In [46]:
## Refit LDA on the training split with the configuration reported by the cross-validation above
model = LinearDiscriminantAnalysis(
    solver='svd',
    n_components=5,
)
model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=5)

In [47]:
## Predict on the held-out validation split (X_test / y_test come from train_test_split)
val_pred = model.predict(X_test)
print(
    "Vaidation accuracy for LDA model is : ",
    accuracy_score(y_test, val_pred),
)

Vaidation accuracy for LDA model is :  0.8908441724049987


In [48]:
## Predict the group label of every test image and report the metrics against test_lbl_grp
pred = model.predict(test_data_scaled)

print(
    "Test accuracy for LDA model is : ",
    accuracy_score(test_lbl_grp, pred),
)
## Rows of the confusion matrix are true labels, columns are predicted labels
print(confusion_matrix(test_lbl_grp, pred))
print(classification_report(test_lbl_grp, pred))

Test accuracy for LDA model is :  0.866825019794141
[[4083   50   15    5   11    6]
 [ 317 1164    0   12    7    0]
 [ 125   11  223    1    0    0]
 [ 284   10    6 1457    7    6]
 [ 335   32   16    5 2363   39]
 [ 268   34   30   31   19 1658]]
              precision    recall  f1-score   support

           1       0.75      0.98      0.85      4170
           2       0.89      0.78      0.83      1500
           3       0.77      0.62      0.69       360
           4       0.96      0.82      0.89      1770
           5       0.98      0.85      0.91      2790
           6       0.97      0.81      0.88      2040

    accuracy                           0.87     12630
   macro avg       0.89      0.81      0.84     12630
weighted avg       0.89      0.87      0.87     12630



In [50]:
## Number of training samples (rows of the scaled feature matrix)
train_data_scaled.shape[0]

39209

### Model performance using grayscale images at the common resolution (6 classes) - Down Sampling

In [31]:
## Converting the train data to grayscale and scaling it: average over the last (channel) axis,
## flatten each image into a single feature row and divide by 255. Using -1 lets numpy infer the
## row length, so the cell stays correct if the common resolution is changed (previously 30*30).
train_arr = np.array(train_data)
train_arr = np.mean(train_arr, -1)
train_arr = train_arr.reshape((train_arr.shape[0], -1))
train_data_scaled = train_arr.astype(float)/255

## Same conversion and scaling for the test data
test_arr = np.array(test_data)
test_arr = np.mean(test_arr, -1)
test_arr = test_arr.reshape((test_arr.shape[0], -1))
test_data_scaled = test_arr.astype(float)/255

#### Down Sampling data and checking efficiency

In [None]:
## Labels : train_lbl_grp, test_lbl_grp
## RandomUnderSampler is already imported in the imports cell at the top of the notebook, so the
## duplicate mid-notebook import was removed.
rus = RandomUnderSampler(random_state=0)
X_Sampled, y_Sampled = rus.fit_resample(train_data_scaled, train_lbl_grp)

## Splitting into train/validation (80/20); X_test / y_test are the validation split
X_train, X_test, y_train, y_test = train_test_split(X_Sampled,y_Sampled, test_size=0.2, random_state = 24)

In [22]:
## Baseline: LDA with default parameters on the flattened grayscale features (no PCA is applied
## in this cell), scored with repeated stratified 10-fold cross-validation (3 repeats).
model = LinearDiscriminantAnalysis()
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)
## One accuracy value per fold/repeat
scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

In [23]:
## Mean and standard deviation of the cross-validated accuracies
print('Mean Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

Mean Accuracy: 0.811 (0.017)


In [25]:
#### Cross validation to get best hyper parameters
model = LinearDiscriminantAnalysis()

## LDA accepts at most min(n_classes - 1, n_features) components; scikit-learn raises a ValueError
## during fit for anything larger, so such grid points only produce failed fits (NaN scores) and
## wasted compute. The grid is therefore capped at the largest valid value.
## Note: n_components only affects transform(), not predict(), so it does not change the accuracy.
max_components = min(len(np.unique(y_train)) - 1, X_train.shape[1])

# define grid
grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']
grid['n_components'] = np.arange(min(5, max_components), max_components + 1)

# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=5, n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Mean Accuracy: 0.797
Config: {'n_components': 5, 'solver': 'svd'}


In [26]:
## Refit LDA on the training split with the best configuration reported by the cross-validation
model = LinearDiscriminantAnalysis(
    solver='svd',
    n_components=5,
)
model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=5)

#### Validation Accuracy

In [27]:
## Predict on the held-out validation split (X_test / y_test come from train_test_split)
val_pred = model.predict(X_test)
print(
    "Vaidation accuracy for LDA model is : ",
    accuracy_score(y_test, val_pred),
)

Vaidation accuracy for LDA model is :  0.8121345029239766


#### Test Accuracy

In [28]:
## Predict the group label of every test image and report the metrics against test_lbl_grp
pred = model.predict(test_data_scaled)

print(
    "Test accuracy for LDA model is : ",
    accuracy_score(test_lbl_grp, pred),
)
## Rows of the confusion matrix are true labels, columns are predicted labels
print(confusion_matrix(test_lbl_grp, pred))
print(classification_report(test_lbl_grp, pred))

Test accuracy for LDA model is :  0.7673792557403009
[[3479  224  133  271   10   53]
 [  29 1302   13  128    7   21]
 [  29   25  297    0    0    9]
 [  83  230   28 1177   33  219]
 [ 153  185   43  266 2062   81]
 [  49  155   44  392   25 1375]]
              precision    recall  f1-score   support

           1       0.91      0.83      0.87      4170
           2       0.61      0.87      0.72      1500
           3       0.53      0.82      0.65       360
           4       0.53      0.66      0.59      1770
           5       0.96      0.74      0.84      2790
           6       0.78      0.67      0.72      2040

    accuracy                           0.77     12630
   macro avg       0.72      0.77      0.73     12630
weighted avg       0.80      0.77      0.78     12630



### Model performance using grayscale images at the common resolution (6 classes) - No Sampling

In [38]:
## Converting the train data to grayscale and scaling it: average over the last (channel) axis,
## flatten each image into a single feature row and divide by 255. Using -1 lets numpy infer the
## row length, so the cell stays correct if the common resolution is changed (previously 30*30).
train_arr = np.array(train_data)
train_arr = np.mean(train_arr, -1)
train_arr = train_arr.reshape((train_arr.shape[0], -1))
train_data_scaled = train_arr.astype(float)/255

## Same conversion and scaling for the test data
test_arr = np.array(test_data)
test_arr = np.mean(test_arr, -1)
test_arr = test_arr.reshape((test_arr.shape[0], -1))
test_data_scaled = test_arr.astype(float)/255

In [39]:
## Hold out 20% of the full (un-resampled) grayscale training data for validation; X_test / y_test
## here are that validation split, not the official test images.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_scaled,
    train_lbl_grp,
    test_size=0.2,
    random_state=24,
)

In [40]:
#### Cross validation to get best hyper parameters
model = LinearDiscriminantAnalysis()

## LDA accepts at most min(n_classes - 1, n_features) components; scikit-learn raises a ValueError
## during fit for anything larger, so such grid points only produce failed fits (NaN scores) and
## wasted compute. The grid is therefore capped at the largest valid value.
## Note: n_components only affects transform(), not predict(), so it does not change the accuracy.
max_components = min(len(np.unique(y_train)) - 1, X_train.shape[1])

# define grid
grid = dict()
grid['solver'] = ['svd', 'lsqr', 'eigen']
grid['n_components'] = np.arange(min(5, max_components), max_components + 1)

# define search
search = GridSearchCV(model, grid, scoring='accuracy', cv=5, n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)
# summarize
print('Mean Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Mean Accuracy: 0.829
Config: {'n_components': 5, 'solver': 'svd'}


In [41]:
## Refit LDA on the training split with the best configuration reported by the cross-validation
model = LinearDiscriminantAnalysis(
    solver='svd',
    n_components=5,
)
model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=5)

In [42]:
## Predict on the held-out validation split (X_test / y_test come from train_test_split)
val_pred = model.predict(X_test)
print(
    "Vaidation accuracy for LDA model is : ",
    accuracy_score(y_test, val_pred),
)

Vaidation accuracy for LDA model is :  0.8291252231573578


In [43]:
## Predict the group label of every test image and report the metrics against test_lbl_grp
pred = model.predict(test_data_scaled)

print(
    "Test accuracy for LDA model is : ",
    accuracy_score(test_lbl_grp, pred),
)
## Rows of the confusion matrix are true labels, columns are predicted labels
print(confusion_matrix(test_lbl_grp, pred))
print(classification_report(test_lbl_grp, pred))

Test accuracy for LDA model is :  0.7996041171813143
[[3964   39   44  101    8   14]
 [ 288 1089    0   93   10   20]
 [ 139   18  189    0    0   14]
 [ 314   65   13 1218   33  127]
 [ 388   37   13   89 2186   77]
 [ 196   54   20  292   25 1453]]
              precision    recall  f1-score   support

           1       0.75      0.95      0.84      4170
           2       0.84      0.73      0.78      1500
           3       0.68      0.53      0.59       360
           4       0.68      0.69      0.68      1770
           5       0.97      0.78      0.87      2790
           6       0.85      0.71      0.78      2040

    accuracy                           0.80     12630
   macro avg       0.79      0.73      0.76     12630
weighted avg       0.81      0.80      0.80     12630

