# MANE 4333
## Homework 8 Solutions
## Dr. Timmer

In [18]:
# Python cell to read data from file into dataframe df
import pandas as pd
import numpy as np
df=pd.read_csv('column_2C.DAT',delimiter=' ')
df.columns=['pIncidence','pTilt','llAngle','sSlope','pRadius','spon','classification']
df.head()
# create endogenous and exogenous variables
X = np.array(df.iloc[:, 0:6])
#print(X)
y = np.array(df['classification'])
#print(y)

In [19]:
# Problem 1
'''
Use the RandomizedSearchCV algorithm with the MLPClassifier 
to develop a model to classify the state of a person's spine. 
Display the parameters of the best MLPClassifier model, 
the accuracy of the best model for the training and test sets, 
and the confusion matrices for the training and testing data sets.
'''

print("Problem 1 Solutions")
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from scipy.stats import uniform
from sklearn.metrics import confusion_matrix

# suppress convergence warnings
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1114)


# Define the pipeline with MinMaxScaler and MLPClassifier
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('mlp', MLPClassifier(max_iter=100))
])

# Define parameter space for RandomizedSearchCV
param_dist = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'mlp__activation': ['tanh', 'relu'],
    'mlp__solver': ['sgd', 'adam'],
    'mlp__alpha': uniform(0.0001, 0.1),
    'mlp__learning_rate': ['constant', 'adaptive'],
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    verbose=2,
    random_state=1114,
)

# Fit the model
random_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

# Test the best model on the test set
best_model = random_search.best_estimator_

# Generate predictions and confusion matrices for both training and test sets
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Compute and display confusion matrix for training set
train_cm = confusion_matrix(y_train, y_train_pred)
print("Confusion matrix for training set")
print(train_cm)
print("The accuracy for the training set is %f"%best_model.score(X_train,y_train))

# Compute and display confusion matrix for test set
test_cm = confusion_matrix(y_test, y_test_pred)
print("Confusion matrix for set set")
print(test_cm)
print("The accuracy for the test set is %f"%best_model.score(X_test,y_test))


Problem 1 Solutions
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END mlp__activation=relu, mlp__alpha=0.05318145368308734, mlp__hidden_layer_sizes=(100, 100), mlp__learning_rate=constant, mlp__solver=adam; total time=   0.1s
[CV] END mlp__activation=relu, mlp__alpha=0.05318145368308734, mlp__hidden_layer_sizes=(100, 100), mlp__learning_rate=constant, mlp__solver=adam; total time=   0.1s
[CV] END mlp__activation=relu, mlp__alpha=0.05318145368308734, mlp__hidden_layer_sizes=(100, 100), mlp__learning_rate=constant, mlp__solver=adam; total time=   0.1s
[CV] END mlp__activation=relu, mlp__alpha=0.05318145368308734, mlp__hidden_layer_sizes=(100, 100), mlp__learning_rate=constant, mlp__solver=adam; total time=   0.1s
[CV] END mlp__activation=relu, mlp__alpha=0.05318145368308734, mlp__hidden_layer_sizes=(100, 100), mlp__learning_rate=constant, mlp__solver=adam; total time=   0.1s
[CV] END mlp__activation=tanh, mlp__alpha=0.0752766502483093, mlp__hidden_layer_sizes=(50,), 

In [20]:
# Problem 2
'''
Fit a SVM Linear Classifier model to the data. 
Display the accuracy of the model for the training and 
test sets, and the confuion matrices for the training and 
testing data sets.
'''

print("Problem 2 Solution")

from sklearn.svm import LinearSVC

# Transform training data
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_transformed=scaler.transform(X_train)
X_test_transformed=scaler.transform(X_test)

# Define the LinearSVC model
linear_svc = LinearSVC(C=1.0, max_iter=1000, random_state=42)

# Fit the model to the training data
linear_svc.fit(X_train, y_train)


#
y_train_pred = linear_svc.predict(X_train_transformed)
y_test_pred = linear_svc.predict(X_test_transformed)
# produce results
print("Confusion matrix for training set")
print(confusion_matrix(y_train,y_train_pred))
print("The accuracy for the training set is %f"%linear_svc.score(X_train_transformed,y_train))
print("Confusion matrix for test set")
print(confusion_matrix(y_test,y_test_pred))
print("The accuracy for the test set is %f"%linear_svc.score(X_test_transformed,y_test))


Problem 2 Solution
Confusion matrix for training set
[[171   0]
 [ 76   0]]
The accuracy for the training set is 0.692308
Confusion matrix for test set
[[38  0]
 [24  0]]
The accuracy for the test set is 0.612903


In [21]:
# Problem 3
'''
Use the GridSearchCV algorithm to find the best 
SVM Classifier with a RBF kernel. Display the parameters 
of the best MLPClassifier model, the accuracy of the 
best model for the training and test sets, and the 
confusion matrices for the training and testing data sets.
'''

print("Problem 3 Solution")

from sklearn.svm import SVC

# Define the SVM model with an RBF kernel
svm_rbf = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Fit the model to the training data
svm_rbf.fit(X_train, y_train)
#
y_train_pred = svm_rbf.predict(X_train_transformed)
y_test_pred = svm_rbf.predict(X_test_transformed)
# produce results
print("Confusion matrix for training set")
print(confusion_matrix(y_train,y_train_pred))
print("The accuracy for the training set is %f"%svm_rbf.score(X_train_transformed,y_train))
print("Confusion matrix for test set")
print(confusion_matrix(y_test,y_test_pred))
print("The accuracy for the test set is %f"%svm_rbf.score(X_test_transformed,y_test))

Problem 3 Solution
Confusion matrix for training set
[[171   0]
 [ 76   0]]
The accuracy for the training set is 0.692308
Confusion matrix for test set
[[38  0]
 [24  0]]
The accuracy for the test set is 0.612903


## Problem 4

>State which model from Problems 1-3 provides the bet fit to the data and support your conclusion with facts from the three problems.

A summary of the results is provided in the table below.

| Model | Training Accuracy | Test Accuracy|
| --- | --- | --- |
| MLP Classifier with RandomizedSearchCV (Problem 2) | 0.878543 | 0.790323|
| SVM Linear Classifier (Problem 2) | 0.6923 | 0.6129|
| Grid Search SVM Classifier with RBF Kernel (Problem 3) | 0.6923 | 0.6129 |

Based upon the test accuracy, the best model is the MLP Classifier with RandomizedSearchCV.


