###### ### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2021 Semester 1

## Assignment 2: Duration Classification with Recipe Data


**Student ID(s): 1004503, 1005418

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
from collections import defaultdict
warnings.filterwarnings('ignore')

#testing

In [23]:
# Doc2Vec process

#train doc2vec features
n_train_doc2vec = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/train_name_doc2vec100.csv")
s_train_doc2vec = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/train_steps_doc2vec100.csv")
i_train_doc2vec = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv")

#test doc2vec features
n_test_doc2vec = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/test_name_doc2vec100.csv")
s_test_doc2vec = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/test_steps_doc2vec100.csv")
i_test_doc2vec = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/test_ingr_doc2vec100.csv")

#read raw test and train data
train = pd.read_csv('ML2_data/recipe_train.csv')

X_train_raw = train.iloc[:,:-1]
y_train = train.iloc[:,-1]

X_test_raw = pd.read_csv('ML2_data/recipe_test.csv')

In [24]:
#create 2d arrays of doc2vec dataframes
n_train_doc2vec = n_train_doc2vec.iloc[1:, :].to_numpy()
s_train_doc2vec = s_train_doc2vec.iloc[1:, :].to_numpy() 
i_train_doc2vec = i_train_doc2vec.iloc[1:, :].to_numpy() 

n_test_doc2vec = n_test_doc2vec.iloc[1:, :].to_numpy() 
s_test_doc2vec = s_test_doc2vec.iloc[1:, :].to_numpy()
i_test_doc2vec = i_test_doc2vec.iloc[1:, :].to_numpy()

In [25]:
#fit doc2vec features onto text feature columns

Xtrain = X_train_raw
Xtest = X_test_raw

Xtrain['name'] = n_train_doc2vec
Xtrain['steps'] = s_train_doc2vec
Xtrain['ingredients'] = i_train_doc2vec

Xtest['name'] = n_test_doc2vec
Xtest['steps'] = s_test_doc2vec
Xtest['ingredients'] = i_test_doc2vec

print(n_train_doc2vec.iloc[0, :10])

ValueError: Length of values does not match length of index

In [16]:
Xtrain['name'].head(10)

0    0.050503
1    0.017546
2   -0.192465
3    0.294862
4   -0.552746
5    0.027772
6   -0.009359
7    0.331828
8    0.025517
9    0.187977
Name: name, dtype: float64

In [4]:
# Switch columns with CountVec/Word2Vec fitted values

Xtrain = X_train_raw
Xtest = X_test_raw

Xtrain['name'] = Xtrain_names
Xtrain['steps'] = Xtrain_steps
Xtrain['ingredients'] = Xtrain_ingredients

Xtest['name'] = Xtest_names
Xtest['steps'] = Xtest_steps
Xtest['ingredients'] = Xtest_ingredients

Unnamed: 0,name,n_steps,n_ingredients,steps,ingredients
0,0,6,12,0,0
1,0,9,5,0,0
2,0,15,10,0,0
3,0,10,8,0,0
4,0,6,5,0,0
5,0,13,10,0,0
6,0,10,9,0,0
7,0,24,9,0,0
8,0,4,8,0,0
9,0,7,4,0,0


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn import svm
import time

#try different classifiers and evaluate individual model performance
#code adapated from Practical 8

models = [DummyClassifier(strategy='most_frequent'),
          LogisticRegression(),
          GaussianNB(),
          MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          KNeighborsClassifier()]

titles = ['0-R',
          'Logistic Regression',
          'GNB',
          'MNB',
          'LinearSVC',
          'Decision Tree',
          'KNN']

for title, model in zip(titles, models):
    
    output = []
    model.fit(Xtrain, y_train)
    y_predict = model.predict(Xtest)
    
    for i in range(len(y_predict)):
        row = [i+1, y_predict[i]]
        output.append(row)
    
    
    output = pd.DataFrame(output, columns = ['id', 'duration_label'])
    output.to_csv(title + '_D2V.csv', index = False)
    


In [8]:
from sklearn.metrics import accuracy_score

#code excerpted from Practical 8
#creates a stacking classifier class given a set of classifiers and a meta-classifier
class StackingClassifier():

    def __init__(self, classifiers, metaclassifier):
        self.classifiers = classifiers
        self.metaclassifier = metaclassifier

    def fit(self, X, y):
        for clf in self.classifiers:
            clf.fit(X, y)
        X_meta = self._predict_base(X)
        self.metaclassifier.fit(X_meta, y)
    
    def _predict_base(self, X):
        yhats = []
        for clf in self.classifiers:
            yhat = clf.predict_proba(X)
            yhats.append(yhat)
        yhats = np.concatenate(yhats, axis=1)
        assert yhats.shape[0] == X.shape[0]
        return yhats
    
    def predict(self, X):
        X_meta = self._predict_base(X)     
        yhat = self.metaclassifier.predict(X_meta)
        return yhat
    def score(self, X, y):
        yhat = self.predict(X)
        return accuracy_score(y, yhat)

In [13]:
#initialise a meta classifier with Logistic Regression as the meta classifier

classifiers = [LogisticRegression(),
          GaussianNB(),
          MultinomialNB(),
          DecisionTreeClassifier(),
          KNeighborsClassifier()]

meta_clf_lr = LogisticRegression()
stacker_lr = StackingClassifier(classifiers, meta_clf_lr)

#create stacked model output
output = []
stacker_lr.fit(Xtrain, y_train)
y_predict = stacker_lr.predict(Xtest)
    
for i in range(len(y_predict)):
    row = [i+1, y_predict[i]]
    output.append(row)
    
output = pd.DataFrame(output, columns = ['id', 'duration_label'])
output.to_csv('stacker' + '_D2V.csv', index = False)