In [14]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, classification_report, confusion_matrix

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'Iris.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
iris = pd.read_csv(file_content_stream)
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [15]:
## creating frequency table of Species
iris['Species'].value_counts() / iris.shape[0]

Iris-setosa        0.333333
Iris-versicolor    0.333333
Iris-virginica     0.333333
Name: Species, dtype: float64

In [16]:
iris['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

The distribution of the target column is evenly balanced. No need to stratify the target column when splitting the data.

In [17]:
## changing labels to numbers
iris['Species_numb'] = np.where(iris['Species'] == 'Iris-virginica', 1,
                                     np.where(iris['Species']== 'Iris-versicolor', 2, 3))

In [18]:
iris

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,Species_numb
0,1,5.1,3.5,1.4,0.2,Iris-setosa,3
1,2,4.9,3.0,1.4,0.2,Iris-setosa,3
2,3,4.7,3.2,1.3,0.2,Iris-setosa,3
3,4,4.6,3.1,1.5,0.2,Iris-setosa,3
4,5,5.0,3.6,1.4,0.2,Iris-setosa,3
...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica,1
146,147,6.3,2.5,5.0,1.9,Iris-virginica,1
147,148,6.5,3.0,5.2,2.0,Iris-virginica,1
148,149,6.2,3.4,5.4,2.3,Iris-virginica,1


In [19]:
## defining inputs and target variables
x = iris.drop(columns = ['Id', 'Species', 'Species_numb'], axis = 1)
y = iris['Species_numb']

## splitting the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [20]:
## defining the scaler
scaler = MinMaxScaler()

## scaling the data
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [24]:
## building the models using ONE VS REST
one_vs_all_rf = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators = 500, max_depth = 3)).fit(x_train, y_train)
one_vs_all_svc = OneVsRestClassifier(estimator = SVC(kernel = 'rbf', C = 0.1, probability = True)).fit(x_train, y_train)
                                    
## predicting on the testing dataset
one_vs_all_rf_preds = one_vs_all_rf.predict_proba(x_test)
one_vs_all_svc_preds = one_vs_all_svc.predict_proba(x_test)

## changing likelihoods to labels
one_vs_all_rf_preds = np.argmax(one_vs_all_rf_preds, axis = 1) + 1
one_vs_all_svc_preds = np.argmax(one_vs_all_svc_preds, axis = 1) + 1

## confusion matrix
print('RF:', confusion_matrix(y_test, one_vs_all_rf_preds))
print('SVC:', confusion_matrix(y_test, one_vs_all_svc_preds))

## classification report
print('RF:', classification_report(y_test, one_vs_all_rf_preds))
print('SVC:', classification_report(y_test, one_vs_all_svc_preds))

RF: [[ 9  1  0]
 [ 0 10  0]
 [ 0  0 10]]
SVC: [[ 9  1  0]
 [ 0 10  0]
 [ 0  0 10]]
RF:               precision    recall  f1-score   support

           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10
           3       1.00      1.00      1.00        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

SVC:               precision    recall  f1-score   support

           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10
           3       1.00      1.00      1.00        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



Based on these results, both of these models have the same performance.