###### ### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2021 Semester 1

## Assignment 2: Duration Classification with Recipe Data


**Student ID(s): 1004503, 1005418

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
from collections import defaultdict
warnings.filterwarnings('ignore')

#testing

In [2]:
# Count Vectoriser process

n_vocab = pickle.load(open("ML2_data/recipe_text_features_countvec/train_name_countvectorizer.pkl", "rb"))
n_vocab_dict = n_vocab.vocabulary_

s_vocab = pickle.load(open("ML2_data/recipe_text_features_countvec/train_steps_countvectorizer.pkl", "rb"))
s_vocab_dict = s_vocab.vocabulary_

i_vocab = pickle.load(open("ML2_data/recipe_text_features_countvec/train_ingr_countvectorizer.pkl", "rb"))
i_vocab_dict = i_vocab.vocabulary_

train = pd.read_csv('ML2_data/recipe_train.csv')

X_train_raw = train.iloc[:,:-1]
y_train = train.iloc[:,-1]

X_test_raw = pd.read_csv('ML2_data/recipe_test.csv')


In [3]:
#fit countvec/doc2vec features onto text feature columns
#TODO: try doc2vec as well and compare performance

Xtrain_names = list(X_train_raw['name'])
Xtrain_ingredients = list(X_train_raw['ingredients'])
Xtrain_steps = list(X_train_raw['steps'])

Xtest_names = list(X_test_raw['name'])
Xtest_steps = list(X_test_raw['steps'])
Xtest_ingredients = list(X_test_raw['ingredients'])

#transforms each countvec into a dense matrix containing corresponding text features
Xtrain_names =  n_vocab.fit_transform(Xtrain_names).todense()
Xtest_names = n_vocab.transform(Xtest_names).todense()

Xtrain_steps =  s_vocab.fit_transform(Xtrain_steps).todense()
Xtest_steps = s_vocab.transform(Xtest_steps).todense()

Xtrain_ingredients =  i_vocab.fit_transform(Xtrain_ingredients).todense()
Xtest_ingredients = i_vocab.transform(Xtest_ingredients).todense()

In [4]:
# Switch columns with CountVec/Word2Vec fitted values

Xtrain = X_train_raw
Xtest = X_test_raw

Xtrain['name'] = Xtrain_names
Xtrain['steps'] = Xtrain_steps
Xtrain['ingredients'] = Xtrain_ingredients

Xtest['name'] = Xtest_names
Xtest['steps'] = Xtest_steps
Xtest['ingredients'] = Xtest_ingredients

Unnamed: 0,name,n_steps,n_ingredients,steps,ingredients
0,0,6,12,0,0
1,0,9,5,0,0
2,0,15,10,0,0
3,0,10,8,0,0
4,0,6,5,0,0
5,0,13,10,0,0
6,0,10,9,0,0
7,0,24,9,0,0
8,0,4,8,0,0
9,0,7,4,0,0


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn import svm
import time

#try different classifiers and evaluate individual model performance
#code adapated from Practical 8

models = [DummyClassifier(strategy='most_frequent'),
          LogisticRegression(),
          GaussianNB(),
          MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          KNeighborsClassifier()]

titles = ['0-R',
          'Logistic Regression',
          'GNB',
          'MNB',
          'LinearSVC',
          'Decision Tree',
          'KNN']

for title, model in zip(titles, models):
    
    output = []
    model.fit(Xtrain, y_train)
    y_predict = model.predict(Xtest)
    
    for i in range(len(y_predict)):
        row = [i+1, y_predict[i]]
        output.append(row)
    
    
    output = pd.DataFrame(output, columns = ['id', 'duration_label'])
    output.to_csv(title + '_CV.csv', index = False)
    


In [8]:
from sklearn.metrics import accuracy_score

#code excerpted from Practical 8
#creates a stacking classifier class given a set of classifiers and a meta-classifier
class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)

In [13]:
#initialise a meta classifier with Logistic Regression as the meta classifier

classifiers = [LogisticRegression(),
          GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(),
          KNeighborsClassifier()]

meta_clf_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_clf_lr)

#create stacked model output
output = []
stacker_lr.fit(Xtrain, y_train)
y_predict = stacker_lr.predict(Xtest)
    
for i in range(len(y_predict)):
    row = [i+1, y_predict[i]]
    output.append(row)
    
output = pd.DataFrame(output, columns = ['id', 'duration_label'])
output.to_csv('stacker' + '_CV.csv', index = False)