### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2021 Semester 1

## Assignment 2: Duration Classification with Recipe Data


**Student ID(s):** 1004503, 1005418

## Import required libraries

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#import pickle
import scipy
#from xgboost import XGBClassifier
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any warning raised by the
# libraries used in later cells is hidden as well — consider narrowing it
# to specific categories.
warnings.filterwarnings('ignore')


#testing

## Pre-process for count vectorizer and doc2vec

In [37]:
# Count Vectoriser process
#
# Read the saved sparse count-vector matrices for the recipe name, steps and
# ingredients, and convert each one to a dense NumPy array.
# NOTE(review): count_vec_steps is loaded here but is not included in the
# train_data matrix built later in this notebook — confirm it is still needed.
count_vec_name, count_vec_steps, count_vec_ingr = (
    scipy.sparse.load_npz(npz_path).toarray()
    for npz_path in (
        "ML2_data/recipe_text_features_countvec/train_name_vec.npz",
        "ML2_data/recipe_text_features_countvec/train_steps_vec.npz",
        "ML2_data/recipe_text_features_countvec/train_ingr_vec.npz",
    )
)

In [38]:
# Doc2vec process
#
# Read the header-less doc2vec100 CSV files for the recipe name, steps and
# ingredients, keeping only the underlying NumPy arrays.
# NOTE(review): only doc2vec100_steps is used when train_data is built later
# in this notebook — confirm the name/ingredient embeddings are still needed.
doc2vec100_name, doc2vec100_steps, doc2vec100_ingr = (
    pd.read_csv(csv_path, header = None).values
    for csv_path in (
        "ML2_data/recipe_text_features_doc2vec100/train_name_doc2vec100.csv",
        "ML2_data/recipe_text_features_doc2vec100/train_steps_doc2vec100.csv",
        "ML2_data/recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv",
    )
)

## Extract data from recipe csv

In [39]:
# Load the training recipe table and extract the numeric features and the
# duration label from it.
csv_data = pd.read_csv("ML2_data/recipe_train.csv")

# Columns are addressed by position: index 1 and 2 feed n_steps / n_ingr and
# the last column is the label.
# NOTE(review): presumably these positions hold the step count, ingredient
# count and duration label respectively — verify against the CSV header.
n_steps = csv_data.iloc[:, 1].values
n_ingr = csv_data.iloc[:, 2].values
duration_array = csv_data.iloc[:, -1].values

# Turn each 1-D feature into an (n_rows, 1) column so it can be stacked
# horizontally with the 2-D text feature matrices. reshape does this in one
# vectorised step instead of appending one-element lists row by row.
n_steps_array = n_steps.reshape(-1, 1)
n_ingr_array = n_ingr.reshape(-1, 1)

In [40]:
# Assemble the full feature matrix by placing the feature groups side by
# side, in this column order: n_steps, n_ingr, doc2vec steps, count-vector
# name, count-vector ingredients.
train_data = np.hstack(
    [
        n_steps_array,
        n_ingr_array,
        doc2vec100_steps,
        count_vec_name,
        count_vec_ingr,
    ]
)

In [56]:
# Hold out 25% of the rows for evaluation. The split is seeded for
# repeatability and stratified on the duration label.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    duration_array,
    test_size=0.25,
    random_state=42,
    stratify=duration_array,
)

In [11]:
import gc

# Drop the names of the raw feature arrays and intermediate columns, then ask
# the garbage collector to run.
# Note: this cell is not re-runnable on its own — a second run raises
# NameError because the names no longer exist.
del (
    count_vec_name,
    count_vec_steps,
    count_vec_ingr,
    doc2vec100_name,
    doc2vec100_steps,
    doc2vec100_ingr,
    n_steps,
    n_ingr,
    duration_array,
    n_steps_array,
    n_ingr_array,
)

gc.collect()



28

In [57]:
#select k-best features
from sklearn.feature_selection import SelectKBest, f_classif

# Fit the selector on the training split only (scored with f_classif, keeping
# 500 columns), then apply that same column selection to both splits.
kbest = SelectKBest(score_func=f_classif, k=500).fit(X_train, y_train)

# Note: X_train / X_test are overwritten with their reduced versions here.
X_train = kbest.transform(X_train)
X_test = kbest.transform(X_test)


In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import svm
from xgboost import XGBClassifier

import time

#try different classifiers and evaluate individual model performance
#code adapted from Practical 8

# Each display name is written next to its estimator so the two lists derived
# below cannot drift out of alignment.
named_models = [
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('0-R', DummyClassifier(strategy='most_frequent')),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('GNB', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Linear SVC', svm.LinearSVC(random_state=42)),
]

models = [estimator for _, estimator in named_models]
titles = [name for name, _ in named_models]

# for title, model in zip(titles, models):
    
#     output = []
#     model.fit(X_train, y_train)
#     y_predict = model.predict(X_test)
    
#     for i in range(len(y_predict)):
#         row = [i+1, y_predict[i]]
#         output.append(row)
    
    
#      output = pd.DataFrame(output, columns = ['id', 'duration_label'])
#      output.to_csv(title + '_CV.csv', index = False)

# Fit every candidate on the training split and print its accuracy on the
# held-out split. The timed region covers both fitting and scoring.
for model, title in zip(models, titles):

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the wall clock, which can be
    # adjusted while a model is training and is coarse for very short fits.
    t_start = time.perf_counter()
    model.fit(X_train, y_train)
    print(title, "accuracy: ", model.score(X_test, y_test))
    t_end = time.perf_counter()

    print("Time elapsed: ", t_end - t_start)



Random Forest accuracy:  0.7864
Time elapsed:  21.77417540550232
0-R accuracy:  0.5062
Time elapsed:  0.002000093460083008
Logistic Regression accuracy:  0.7987
Time elapsed:  2.427544355392456
GNB accuracy:  0.6946
Time elapsed:  0.3120710849761963
Decision Tree accuracy:  0.716
Time elapsed:  8.770989656448364
Linear SVC accuracy:  0.7989
Time elapsed:  39.292195558547974


In [84]:
#XGBoost algo
# Fit an XGBoost classifier on the selected features and print its held-out
# accuracy together with the time spent fitting and scoring.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled XGBoost
# build — confirm this cell runs in the target environment.
# NOTE(review): this model is seeded with 0 while the other classifiers in
# this notebook use 42 — confirm that is intentional.

# perf_counter is a monotonic clock intended for measuring intervals, so the
# reported duration is unaffected by wall-clock adjustments during the fit.
t_start = time.perf_counter()
xgb = XGBClassifier(random_state=0, tree_method='gpu_hist').fit(X_train, y_train)
print("XGB accuracy: ", xgb.score(X_test, y_test))
t_end = time.perf_counter()
print("Time elapsed: ", t_end - t_start)

XGB accuracy:  0.8038
Time elapsed:  201.57535600662231


In [82]:
#choose classifiers for stacking algo
# Short labels for the base estimators, in the same order as `classifiers`.
class_titles = ['rf', 'lr', 'svc']

# Fresh, unfitted base estimators: random forest, logistic regression and
# linear SVC, all seeded with 42.
classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression(random_state=42),
    svm.LinearSVC(random_state=42),
]

# Show the (label, estimator) pairs that will be handed to the stacker.
print(list(zip(class_titles, classifiers)))


[('rf', RandomForestClassifier(random_state=42)), ('lr', LogisticRegression(random_state=42)), ('svc', LinearSVC(random_state=42))]


In [83]:
#use stacking classifier

from sklearn.ensemble import StackingClassifier

# Combine the chosen base estimators in a stacking ensemble (the final
# estimator is left at the library default), fit it on the training split and
# score it on the held-out split.
# perf_counter is a monotonic clock intended for measuring intervals, so the
# reported duration is unaffected by wall-clock adjustments during this long
# fit. The timed region covers both fitting and scoring.
t_start = time.perf_counter()
clf = StackingClassifier(estimators = list(zip(class_titles, classifiers)))
score = clf.fit(X_train, y_train).score(X_test, y_test)
t_end = time.perf_counter()

print("Stacker accuracy:", score)
print("Time elapsed:", t_end-t_start)

Stacker accuracy: 0.8018
Time elapsed: 300.223571062088
