In [1]:
# Get access to the iris dataset

import numpy as np
from sklearn import datasets
iris = datasets.load_iris()

# Get the X and y
X = iris.data
y = iris.target

In [19]:
# Q1. Zero-R is the baseline while returning the best accuracy from a 10 Fold cross validation

# Import everything needed
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

def ave_cross_val_score(classifier, x, y, n):
    sum = 0
    scores = cross_val_score(classifier, x, y, cv=n)
    for i in range(n):
        sum += scores[i]
    return sum/n

def run_bake_off(X, y, n_folds):
    # Baseline
    zero_r = DummyClassifier(strategy='most_frequent')
    print("0-R Score = " + str(ave_cross_val_score(zero_r, X, y, n_folds)))

    # Q1i).
    mnb = MultinomialNB()
    print("MultinomialNB Score = " + str(ave_cross_val_score(mnb, X, y, n_folds)))

    # Q1ii).
    gnb = GaussianNB()
    print("GaussianNB Score = " + str(ave_cross_val_score(gnb, X, y, n_folds)))

    # Q1iii).
    lin_svc = LinearSVC()
    print("LinearSVC Score = " + str(ave_cross_val_score(lin_svc, X, y, n_folds)))

    # Q1iv).
    svc = SVC()
    print("SVC Score = " + str(ave_cross_val_score(svc, X, y, n_folds)))

    # Q1v).
    lgr = LogisticRegression()
    print("Logistic Regression Score = " + str(ave_cross_val_score(lgr, X, y, n_folds)))
    
    return

run_bake_off(X, y, 10)


0-R Score = 0.6535799111906648
MultinomialNB Score = 0.6535799111906648
GaussianNB Score = 0.6767787683728615
LinearSVC Score = 0.7744707583215724
SVC Score = 0.7524462726469542
Logistic Regression Score = 0.7648967907014101


Q1. The SVC classifier has the best accuracy in its default configuration.

In [26]:
def convert_class(raw):
    if int(raw)<=10: 
        return 0
    else: 
        return 1

# Open up the Abalone dataset

f = open('abalone.csv', 'r')
X = []
y = []

for line in f:
    atts = line[:-1].split(",")
    X.append(atts[1:-1])
    y.append(convert_class(atts[-1]))

f.close()

# Convert to numpy array
X = np.array(X).astype(np.float)

In [27]:
# Q2a).

run_bake_off(X, y, 10)

# Now the linear SVC wins the bakeoff slightly beating logisitic regression. This might be because it has been
# setup as a 2-class problem and thus a linear SVM would perform nice on these

0-R Score = 0.6535799111906648
MultinomialNB Score = 0.6535799111906648
GaussianNB Score = 0.6767787683728615
LinearSVC Score = 0.7744707583215724
SVC Score = 0.7524462726469542
Logistic Regression Score = 0.7648967907014101


In [31]:
# Q2b). 

def convert_class_thrice(raw):
    if int(raw) <= 8:
        return 0
    elif 9 <= int(raw) <= 10:
        return 1
    else:
        return 2

f = open('abalone.csv', 'r') 
X = []
y = []

for line in f:
    atts = line[:-1].split(",")
    X.append(atts[1:-1])
    y.append(convert_class_thrice(atts[-1]))
    
# Convert to numpy array
X = np.array(X).astype(np.float)

run_bake_off(X, y, 10)

# Linear SVC still wins. Dunno why.

0-R Score = 0.3464207504591879
MultinomialNB Score = 0.5570936039372009
GaussianNB Score = 0.5772604926947189
LinearSVC Score = 0.644483665671723
SVC Score = 0.6236740758927042
Logistic Regression Score = 0.6320370619049335


In [35]:
# Q2c.

f = open('abalone.csv', 'r') 
X = []
y = []

for line in f:
    atts = line[:-1].split(",")
    X.append(atts[1:-1])
    y.append(int(atts[-1]))
    
# Convert to numpy array
X = np.array(X).astype(np.float)

run_bake_off(X, y, 10)

# Linear SVC is still winning for some reason.



0-R Score = 0.16500855703117906
MultinomialNB Score = 0.16525547061142595
GaussianNB Score = 0.23591700319832828
LinearSVC Score = 0.2583687378399511




SVC Score = 0.24785709729228578




Logistic Regression Score = 0.24687601563564093


In [39]:
# Q2d.)

from sklearn.preprocessing import OneHotEncoder

f = open('abalone.csv', 'r') 
X = []
y = []

def convert_gender(raw):
    if raw=="M": 
        return 0
    elif raw=="I": 
        return 1
    elif raw=="F": 
        return 2
    else: 
        return 3
    
for line in f:
    atts = line[:-1].split(",")
    atts[0]=convert_gender(atts[0])
    X.append(atts[0:-1])
    y.append(int(atts[-1]))

ohe = OneHotEncoder(categorical_features=[0])
X = ohe.fit_transform(X).toarray()

run_bake_off(X, y, 10)

# Logistic regression, LinearSVC and SVC are pretty resilient to the addition of these 3 attributes. I 
# don't really know why that would be.



0-R Score = 0.16500855703117906
MultinomialNB Score = 0.22008672943989835
GaussianNB Score = 0.09995373301345349
LinearSVC Score = 0.24991067442390813




SVC Score = 0.2348866302578127




Logistic Regression Score = 0.24658107691038467


In [None]:
# Q3).
