In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, confusion_matrix

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'Iris.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the datafile
iris = pd.read_csv(file_content_stream)
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [2]:
#frequency table of species
iris['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [3]:
#changing labels to numbers
iris['Species_num']=np.where(iris['Species']=='Iris-virginica',1,np.where(iris['Species']=='Iris-versicolor',2,3))

In [4]:
#defining inputs
x=iris.drop(columns=['Id','Species','Species_num'])
y=iris['Species_num']

#splitting the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = .2,stratify=y)

In [5]:
#scaling the data
scaler=MinMaxScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.fit_transform(x_test)

**Random Forest**

In [9]:
one_vs_all_rf = OneVsRestClassifier(estimator = RandomForestClassifier(n_estimators=500,max_depth=3)).fit(x_train, y_train)

#predicting on test
one_vs_all_rf_pred = one_vs_all_rf.predict_proba(x_test)
one_vs_all_rf_pred = np.argmax(one_vs_all_rf_pred, axis=1) +1

print(confusion_matrix(y_test,one_vs_all_rf_pred))
print(classification_report(y_test,one_vs_all_rf_pred))

[[10  0  0]
 [ 7  3  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       0.59      1.00      0.74        10
           2       1.00      0.30      0.46        10
           3       1.00      1.00      1.00        10

    accuracy                           0.77        30
   macro avg       0.86      0.77      0.73        30
weighted avg       0.86      0.77      0.73        30



**Support Vector Machine**

In [11]:
one_vs_all_svm = OneVsRestClassifier(estimator = SVC(kernel='rbf',C=0.1,probability=True)).fit(x_train, y_train)

#predicting on test
one_vs_all_svm_pred = one_vs_all_svm.predict_proba(x_test)
one_vs_all_svm_pred = np.argmax(one_vs_all_svm_pred, axis=1) +1

print(confusion_matrix(y_test,one_vs_all_svm_pred))
print(classification_report(y_test,one_vs_all_svm_pred))

[[10  0  0]
 [ 6  4  0]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       0.62      1.00      0.77        10
           2       1.00      0.40      0.57        10
           3       1.00      1.00      1.00        10

    accuracy                           0.80        30
   macro avg       0.88      0.80      0.78        30
weighted avg       0.88      0.80      0.78        30



Based on my results, SVM performed better in classifying Iris Species.