In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score,recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Loading data

In [None]:
# Read the feature table and the label table; neither CSV has a header row.
X, y = (pd.read_csv(csv_path, header=None) for csv_path in (r'X.csv', r'y.csv'))

In [None]:
# Name the single label column, then put features and label side by side.
y.columns = ['Label']
data = pd.concat([X, y], axis=1)

# Swap the numeric class codes for readable class names.
label_names = {0.0: 'book', 1.0: 'plastic case'}
data['Label'] = data['Label'].map(label_names)

In [None]:
# Show the first rows of the combined feature/label frame.
data.head()

# Cleaning data

In [None]:
# Print the frame summary (row count, column dtypes, memory usage).
data.info()

In [None]:
# True if at least one cell of the combined frame is missing.
data.isna().to_numpy().any()

# Splitting data into training and hold-out set

In [None]:
# Keep 75% of the rows for training and hold out 25% for the final test.
# Stratifying on the label keeps the class proportions equal in both parts.
data_training, data_final_test = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
    stratify=data['Label'],
)
for caption, subset in (("Length of training data ", data_training),
                        ("Length of testing data ", data_final_test)):
    print(caption + str(len(subset)))

# Plotting correlations

In [None]:
# Numerically encoded copy of the training data so the label can be correlated
# with the features. The label is stored under column 768, which is the name
# the following Spearman cell reads.
data_temp = data_training.assign(
    Label=data_training['Label'].map({'book': 0, 'plastic case': 1})
).rename(columns={'Label': 768})

# Correlate each feature with the label directly. Building the whole
# feature-by-feature matrix just to read one column is wasted work, and the
# label's own entry (always 1.0) would stretch the y-axis of the plot.
corr_y = (
    data_temp.drop(columns=768)
    .corrwith(data_temp[768], method='pearson')
    .sort_values(ascending=False)
)

plt.scatter(corr_y.index.values, corr_y.values)
plt.xlabel('Component')
plt.ylabel('Correlation')
plt.title('Pearson correlation of each component with the label')
plt.show()

In [None]:
# Rank (Spearman) correlation of each feature with the label, which is stored
# under column 768 of data_temp. Only the feature-vs-label values are computed;
# the label's own entry (always 1.0) is left out so it cannot distort the plot.
corr_y = (
    data_temp.drop(columns=768)
    .corrwith(data_temp[768], method='spearman')
    .sort_values(ascending=False)
)

plt.scatter(corr_y.index.values, corr_y.values)
plt.xlabel('Component')
plt.ylabel('Correlation')
plt.title('Spearman correlation of each component with the label')
plt.show()

# Part 1

# Model 1 : Logistic regression

In [None]:
# Separate the target column from the predictors for model fitting.
y_train = data_training['Label']
X_train = data_training.drop(columns=['Label'])

In [None]:
# Standardise the features, then fit an L2-penalised logistic regression.
# Scaling lives inside the pipeline so it is re-fitted on each CV training fold.
binary_logreg = LogisticRegression(solver='liblinear', penalty='l2', random_state=55)
pipe = Pipeline([('scale', StandardScaler()), ('clf', binary_logreg)])

# Candidate values of C: ten powers of ten from 1e-5 to 1e4.
param_grid = {'clf__C': list(np.power(10.0, np.arange(-5, 5)))}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=1, verbose=1)
grid.fit(X_train, y_train)

In [None]:
# Parameter setting selected by the grid search.
grid.best_params_

In [None]:
# Full cross-validation results of the grid search, shown as the cell output.
grid.cv_results_

In [None]:
# Take the refitted best pipeline apart so its scaler and classifier can be
# used individually in the hold-out cells below.
best_estimator = grid.best_estimator_
fitted_steps = best_estimator.named_steps
best_estimator_scalar = fitted_steps['scale']
best_estimator_clf = fitted_steps['clf']

# Show the fitted coefficients of the first 20 features.
best_estimator_clf.coef_[:, :20]

# Testing on hold out set

In [None]:
# Hold-out target and predictors; the predictors go through the scaler that
# was fitted during the grid search before they reach the classifier.
y_f = data_final_test['Label']
X_f = best_estimator_scalar.transform(data_final_test.drop(columns=['Label']))
y_log_pred = best_estimator_clf.predict(X_f)

In [None]:
# Labels predicted for the hold-out rows by the logistic-regression model.
y_log_pred

In [None]:
# True hold-out labels, for comparison with the predictions.
y_f.values

In [None]:
# Classification report for the logistic-regression hold-out predictions.
report = classification_report(y_f, y_log_pred)
print(report)

# Model 2 : Decision Tree

In [None]:
# Fit a decision tree on the raw (unscaled) training features.
# The seed is fixed because the tree permutes the candidate features at every
# split, so an unseeded fit can pick a different split between runs. The cells
# that follow inspect one specific split read from the exported tree.
tre = DecisionTreeClassifier(random_state=42)
tre.fit(X_train, y_train)

# Write the fitted tree to a Graphviz .dot file for visual inspection.
export_graphviz(
    tre,
    out_file='test.dot',
    class_names=tre.classes_,
    rounded=True,
    filled=True,
)

In [None]:
# Rows whose feature 702 is at most 0.134.
# NOTE(review): 702 / 0.134 presumably come from a split in the exported tree — confirm against test.dot.
data.loc[data[702] <= 0.134]

In [None]:
# Numeric version of the training labels so they can be put on a plot axis.
label_as_number = {'book': 0.0, 'plastic case': 1.0}
data_training_plot = data_training['Label'].map(label_as_number)

In [None]:
# Feature 702 against the numeric label: one red dot per training row.
fig, ax = plt.subplots()
ax.plot(data_training[702], data_training_plot, 'ro')
ax.set_xlabel('Predictor 702 value')
ax.set_ylabel('Label')
ax.set_title('Decision Tree boundary')
plt.show()

# Predicting unseen points

In [None]:
# Refit the decision tree on the complete labelled binary data set and use it
# to label the unseen samples.
X = pd.read_csv(r'X.csv', header=None)
y = pd.read_csv(r'y.csv', header=None)
y[0] = y[0].map({0.0: 'book', 1.0: 'plastic case'})
X_Classify = pd.read_csv(r'XToClassify.csv', header=None)

# Seeded so the exported predictions are the same on every run. The labels are
# passed as a 1-D Series, matching how the final multiclass cell calls fit().
tre = DecisionTreeClassifier(random_state=42)
tre.fit(X, y[0])

y_pred = pd.DataFrame({'y': tre.predict(X_Classify)})

# Store each unseen sample next to its predicted label.
data = pd.concat([X_Classify, y_pred], axis=1)

data.to_csv(r'Binary_Tree.csv')

# Part 2 Multiclass classification

In [None]:
# Read the multiclass features and labels; neither CSV has a header row.
X, y = (pd.read_csv(csv_path, header=None) for csv_path in (r'X_multi.csv', r'y_multi.csv'))

# Name the label column, join it to the features and replace the numeric
# class codes with readable class names.
y.columns = ['Label']
data = pd.concat([X, y], axis=1)
class_names = {0.0: 'air', 1.0: 'book', 2.0: 'hand', 3.0: 'knife', 4.0: 'plastic case'}
data['Label'] = data['Label'].map(class_names)

In [None]:
# Hold out 20% of the rows for the final test, stratified on the class codes in y.
data_training, data_final_test = train_test_split(
    data,
    test_size=0.20,
    random_state=42,
    stratify=y,
)
for subset in (data_training, data_final_test):
    print(len(subset))

In [None]:
# Per-class non-null counts in the hold-out set, one value per feature column.
data_final_test.groupby('Label').count()

In [None]:
# Per-class non-null counts in the training set, one value per feature column.
data_training.groupby('Label').count()

# Plotting correlations

In [None]:
# Numerically encoded copy of the training data; the label is stored under
# column 768.
# NOTE(review): the 0-4 codes are an arbitrary ordering of nominal classes, so
# these correlations are only a rough screening aid — confirm this is intended.
data_temp = data_training.assign(
    Label=data_training['Label'].map({'air': 0, 'book': 1, 'hand': 2,
                                      'knife': 3, 'plastic case': 4})
).rename(columns={'Label': 768})

# Correlate each feature with the label directly, rather than building the
# whole feature-by-feature matrix. The label's own entry (always 1.0) is left
# out so it cannot stretch the y-axis of the plot.
corr_y = (
    data_temp.drop(columns=768)
    .corrwith(data_temp[768], method='pearson')
    .sort_values(ascending=False)
)

plt.scatter(corr_y.index.values, corr_y.values)
plt.xlabel('Component')
plt.ylabel('Correlation')
plt.title('Pearson correlation of each component with the label')
plt.show()

In [None]:
# Numerically encoded copy of the training data; the label is stored under
# column 768.
data_temp = data_training.assign(
    Label=data_training['Label'].map({'air': 0, 'book': 1, 'hand': 2,
                                      'knife': 3, 'plastic case': 4})
).rename(columns={'Label': 768})

# Rank (Spearman) correlation of each feature with the label. Only the
# feature-vs-label values are computed; the label's own entry (always 1.0) is
# left out so it cannot stretch the y-axis of the plot.
corr_y = (
    data_temp.drop(columns=768)
    .corrwith(data_temp[768], method='spearman')
    .sort_values(ascending=False)
)

plt.scatter(corr_y.index.values, corr_y.values)
plt.xlabel('Component')
plt.ylabel('Correlation')
plt.title('Spearman correlation of each component with the label')
plt.show()

# Training model

# Model 1 Logistic regression

In [None]:
# Separate the multiclass target column from the predictors.
y_train = data_training['Label']
X_train = data_training.drop(columns=['Label'])

In [None]:
# Standardise the features, then fit an L2-penalised multinomial logistic
# regression. Scaling lives inside the pipeline so it is re-fitted on each CV
# training fold.
multinomial_logreg = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    multi_class='multinomial',
    random_state=15,
)
pipe = Pipeline([('scale', StandardScaler()), ('clf', multinomial_logreg)])

# Candidate values of C: ten powers of ten from 1e-5 to 1e4.
param_grid = {'clf__C': list(np.power(10.0, np.arange(-5, 5)))}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=1, verbose=1)
grid.fit(X_train, y_train)

In [None]:
# Keep the best pipeline found by the multiclass logistic-regression grid search.
best_estimator = grid.best_estimator_

In [None]:
# Full cross-validation results of the grid search, shown as the cell output.
grid.cv_results_

In [None]:
# Take the best pipeline apart: the scaler and classifier are applied
# separately in the hold-out cells below.
fitted_steps = best_estimator.named_steps
best_estimator_scale = fitted_steps['scale']
best_estimator_clf = fitted_steps['clf']

# Show the fitted coefficient matrix.
best_estimator_clf.coef_

# Hold out set testing

In [None]:
# Hold-out target, plus hold-out predictors passed through the fitted scaler.
y_f = data_final_test['Label']
X_f = best_estimator_scale.transform(data_final_test.drop(columns=['Label']))

In [None]:
# Predict hold-out labels with the logistic-regression classifier on the scaled features.
y_pred = best_estimator_clf.predict(X_f)

In [None]:
# Labels predicted for the hold-out rows by the logistic-regression model.
y_pred

In [None]:
# True hold-out labels, for comparison with the predictions.
y_f.values

In [None]:
# Confusion matrix of the logistic-regression hold-out predictions (true labels first, predictions second).
confusion_matrix(y_f,y_pred)

In [None]:
# Confusion-matrix heatmap for the logistic-regression hold-out predictions.
# The class order is passed to confusion_matrix explicitly so the rows and
# columns are guaranteed to line up with the tick labels, even if a class is
# absent from the hold-out set.
class_labels = grid.classes_
cm = confusion_matrix(y_f, y_pred, labels=class_labels)

ax = sns.heatmap(cm,
                 annot=True,
                 fmt='d',
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Logistic regression: hold-out confusion matrix')
plt.show()

In [None]:
# Classification report for the logistic-regression hold-out predictions.
print(classification_report(y_f, y_pred))

# Model 2 Decision tree

In [None]:
# Fit a decision tree on the raw (unscaled) multiclass training features.
# The seed is fixed because the tree permutes the candidate features at every
# split, so an unseeded fit can pick a different split between runs. A later
# cell counts rows on one side of a specific split read from the exported tree.
tre = DecisionTreeClassifier(random_state=42)
tre.fit(X_train, y_train)

# Write the fitted tree to a Graphviz .dot file for visual inspection.
export_graphviz(
    tre,
    out_file='test1.dot',
    class_names=tre.classes_,
    rounded=True,
    filled=True,
)

In [None]:
# Hold-out target and unscaled predictors; the tree was fitted on unscaled features.
y_f = data_final_test['Label']
X_f = data_final_test.drop(columns=['Label'])

In [None]:
# Predict hold-out labels with the fitted decision tree.
y_pred_tree = tre.predict(X_f)

In [None]:
# Labels predicted for the hold-out rows by the decision tree.
y_pred_tree

In [None]:
# True hold-out labels, for comparison with the predictions.
y_f.values

In [None]:
# Confusion-matrix heatmap for the decision-tree hold-out predictions.
# The class order is passed to confusion_matrix explicitly so the rows and
# columns are guaranteed to line up with the tick labels, even if a class is
# absent from the hold-out set.
class_labels = tre.classes_
cm = confusion_matrix(y_f, y_pred_tree, labels=class_labels)

ax = sns.heatmap(cm,
                 annot=True,
                 fmt='d',
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Decision tree: hold-out confusion matrix')
plt.show()

In [None]:
# Number of rows whose feature 153 is at most -0.39.
# NOTE(review): 153 / -0.39 presumably come from a split in the exported tree — confirm against test1.dot.
int((data[153] <= -0.39).sum())

# Model 3 SVM

In [None]:
# Standardise the features, then fit a linear SVM. The classifier is seeded,
# as both logistic-regression pipelines are, so the search result is the same
# on every run. max_iter is set very high to give the solver room to converge.
steps = [
    ('scaler', StandardScaler()),
    ('SVM', LinearSVC(max_iter=1000000, random_state=42)),
]
pipeline = Pipeline(steps)

# Candidate values of C, one per decade from 1e-3 to 1e3.
tuned_parameters = [{'SVM__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}]

grid = GridSearchCV(pipeline, param_grid=tuned_parameters, cv=5)
grid.fit(X_train, y_train)

In [None]:
# Parameter setting selected by the SVM grid search.
grid.best_params_

In [None]:
# Put the grid-search results into a DataFrame so they can be shown as a table.
result_df = pd.DataFrame(grid.cv_results_)

In [None]:
# Rank, C value and mean test score of each candidate.
summary_columns = ['rank_test_score', 'param_SVM__C', 'mean_test_score']
result_df[summary_columns]

In [None]:
# Take the best SVM pipeline apart into its fitted scaler and classifier.
best_estimator_svm = grid.best_estimator_
svm_steps = best_estimator_svm.named_steps
best_estimator_svm_scale = svm_steps['scaler']
best_estimator_svm_clf = svm_steps['SVM']

# Hold out set testing

In [None]:
# Hold-out target, plus hold-out predictors passed through the SVM pipeline's fitted scaler.
y_f = data_final_test['Label']
X_f = best_estimator_svm_scale.transform(data_final_test.drop(columns=['Label']))

In [None]:
# Predict hold-out labels with the linear SVM on the scaled features.
y_pred_SVM = best_estimator_svm_clf.predict(X_f)

In [None]:
# Labels predicted for the hold-out rows by the linear SVM.
y_pred_SVM

In [None]:
# True hold-out labels, for comparison with the predictions.
y_f.values

In [None]:
# Confusion matrix of the SVM hold-out predictions. True labels go first and
# predictions second, as in every other confusion_matrix call in this
# notebook; the reverse order would transpose the matrix.
confusion_matrix(y_f, y_pred_SVM)

In [None]:
# Confusion-matrix heatmap for the linear-SVM hold-out predictions.
# The class order is passed to confusion_matrix explicitly so the rows and
# columns are guaranteed to line up with the tick labels, even if a class is
# absent from the hold-out set.
class_labels = grid.classes_
cm = confusion_matrix(y_f, y_pred_SVM, labels=class_labels)

ax = sns.heatmap(cm,
                 annot=True,
                 fmt='d',
                 xticklabels=class_labels,
                 yticklabels=class_labels)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Linear SVM: hold-out confusion matrix')
plt.show()

In [None]:
# Classification report for the linear-SVM hold-out predictions.
print(classification_report(y_f, y_pred_SVM))

# Predict final points

In [None]:
# Refit the linear SVM on the complete labelled multiclass data set and use it
# to label the unseen samples.
X = pd.read_csv(r'X_multi.csv', header=None)
y = pd.read_csv(r'y_multi.csv', header=None)
y[0] = y[0].map({0.0: 'air', 1.0: 'book', 2.0: 'hand',
                 3.0: 'knife', 4.0: 'plastic case'})

X_to_classify = pd.read_csv(r'XToClassify_multi.csv', header=None)

# NOTE(review): C=0.1 is hard-coded, presumably copied from the grid search above —
# confirm it still matches grid.best_params_ before exporting.
# The classifier is seeded so the exported predictions are the same on every run.
linear_kernel_svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm_clf', LinearSVC(C=0.1, max_iter=1000000, random_state=42)),
])
linear_kernel_svm_clf.fit(X.values, y[0])

# Predict on a plain array, the same input type the pipeline was fitted on.
y_pred = linear_kernel_svm_clf.predict(X_to_classify.values)
y_pred = pd.DataFrame({'y': y_pred})

# Store each unseen sample next to its predicted label.
data = pd.concat([X_to_classify, y_pred], axis=1)

data.to_csv(r'Multi_SVM.csv')