###### ### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2021 Semester 1

## Assignment 2: Duration Classification with Recipe Data


**Student ID(s): 1004503, 1005418

## Import required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#import pickle
import scipy
#from xgboost import XGBClassifier
from collections import defaultdict
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

## Pre-process for count vectorizer and doc2vec

In [2]:
# Count Vectoriser process

count_vec_name = scipy.sparse.load_npz("ML2_data/recipe_text_features_countvec/train_name_vec.npz").toarray()
count_vec_steps = scipy.sparse.load_npz("ML2_data/recipe_text_features_countvec/train_steps_vec.npz").toarray()
count_vec_ingr = scipy.sparse.load_npz("ML2_data/recipe_text_features_countvec/train_ingr_vec.npz").toarray()

test_count_vec_name = scipy.sparse.load_npz("ML2_data/recipe_text_features_countvec/test_name_vec.npz").toarray()
test_count_vec_steps = scipy.sparse.load_npz("ML2_data/recipe_text_features_countvec/test_steps_vec.npz").toarray()
test_count_vec_ingr = scipy.sparse.load_npz("ML2_data/recipe_text_features_countvec/test_ingr_vec.npz").toarray()


In [3]:
# Doc2vec process

doc2vec100_name = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/train_name_doc2vec100.csv", header = None).values
doc2vec100_steps = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/train_steps_doc2vec100.csv", header = None).values
doc2vec100_ingr = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv", header = None).values

test_doc2vec100_name = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/test_name_doc2vec100.csv", header = None).values
test_doc2vec100_steps = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/test_steps_doc2vec100.csv", header = None).values
test_doc2vec100_ingr = pd.read_csv("ML2_data/recipe_text_features_doc2vec100/test_ingr_doc2vec100.csv", header = None).values

## Extract data from recipe csv

In [4]:
# Isolate duration label
csv_data = pd.read_csv("ML2_data/recipe_train.csv")
test_csv_data = pd.read_csv("ML2_data/recipe_test.csv")

#take raw features from train dataset
n_steps = csv_data.iloc[:,1]
n_steps = n_steps.values

n_ingr = csv_data.iloc[:,2]
n_ingr = n_ingr.values

#take raw features from test dataset
test_n_steps = test_csv_data.iloc[:,1]
test_n_steps = test_n_steps.values

test_n_ingr = test_csv_data.iloc[:,2]
test_n_ingr = test_n_ingr.values

#take class labels from train dataset as duration_array
duration_array = csv_data.iloc[:,-1]
duration_array = duration_array.values

n_steps_array = []
n_ingr_array = []
test_n_steps_array = []
test_n_ingr_array = []

for i in range(len(n_steps)):
    n_steps_array.append([n_steps[i]])
    n_ingr_array.append([n_ingr[i]])
    
for i in range(len(test_n_steps)):
    test_n_steps_array.append([test_n_steps[i]])
    test_n_ingr_array.append([test_n_ingr[i]])

In [5]:
train_data = np.hstack([n_steps_array,
                        n_ingr_array,
                        doc2vec100_steps,
                        doc2vec100_name,
                        doc2vec100_ingr,
                        count_vec_name,
                        count_vec_ingr])

In [6]:
test_data = np.hstack([test_n_steps_array,
                       test_n_ingr_array,
                       test_doc2vec100_steps,
                       test_doc2vec100_name,
                       test_doc2vec100_ingr,
                       test_count_vec_name,
                       test_count_vec_ingr])

In [7]:
X_train, X_test, y_train = train_data, test_data, duration_array

In [8]:
import gc

del count_vec_name
del count_vec_steps
del count_vec_ingr
del doc2vec100_name 
del doc2vec100_steps
del doc2vec100_ingr 
del n_steps
del n_ingr
del n_steps_array 
del n_ingr_array

gc.collect()



80

In [9]:
s_X_train, s_X_test, s_y_train, s_y_test = train_test_split(train_data, duration_array, test_size = 0.25)

In [10]:
#select k-best features
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

kbest = SelectKBest(score_func=f_classif, k=500)
kbest.fit(X_train, y_train)
X_train = kbest.transform(X_train)
X_test = kbest.transform(X_test)

s_X_train = kbest.transform(s_X_train)
s_X_test = kbest.transform(s_X_test)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from xgboost import XGBClassifier

import time

In [None]:

#try different classifiers and evaluate individual model performance
#code adapated from Practical 8

models = [RandomForestClassifier(n_estimators=100),
          LogisticRegression(),
          GaussianNB(),
          DecisionTreeClassifier(),
          svm.LinearSVC(),
          XGBClassifier(tree_method='gpu_hist')]

titles = ['Random Forest',
          'Logistic Regression',
          'GNB',
          'Decision Tree',
         'Linear SVC',
         'XGB']

#add output rows to csv files for eeach model
for title, model in zip(titles, models):
  
    output = []
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
  
    for i in range(len(y_predict)):
        row = [i+1, y_predict[i]]
        output.append(row)
  
    output = pd.DataFrame(output, columns = ['id', 'duration_label'])
    output.to_csv(title + '_CV.csv', index = False)

#for model, title in zip(models, titles):
#    
#    t_start = time.time()
#    model.fit(X_train, y_train)
#    print(title, "accuracy: ", model.score(X_test, y_test))
#    t_end = time.time()
#
#    print("Time elapsed: ", t_end - t_start)



In [13]:
#run models on train-test split

models = [RandomForestClassifier(n_estimators=64),
          LogisticRegression(),
          XGBClassifier(tree_method='gpu_hist')]

titles = ['Random Forest',
          'Logistic Regression',
         'XGB']

#add output rows to csv files for eeach model
for title, model in zip(titles, models):
  
    output = []
    model.fit(s_X_train, s_y_train)
    score = model.score(s_X_test, s_y_test)
    print(title, "accuracy:", score)
  
    

Random Forest accuracy: 0.7109
Logistic Regression accuracy: 0.7512
XGB accuracy: 0.7483


In [None]:
#XGBoost algo
#t_start = time.time()
#xgb = XGBClassifier(random_state=0, tree_method='gpu_hist').fit(X_train, y_train)
#
#
#t_end = time.time()
#print("Time elapsed: ", t_end - t_start)

In [None]:
#choose classifiers for stacking algo
classifiers = [RandomForestClassifier(n_estimators=100),
              LogisticRegression(),
              XGBClassifier(tree_method='gpu_hist')]

class_titles = ['rf',
               'lr',
               'xgb']

print(list(zip(class_titles, classifiers)))


In [None]:
#use stacking classifier

from sklearn.ensemble import StackingClassifier

t_start = time.time()
clf = StackingClassifier(estimators = list(zip(class_titles, classifiers)))
clf.fit(X_train, y_train)
t_end = time.time()

y_predict = clf.predict(X_test)
output = []

#print stacking classifier rows to csv file
for i in range(len(y_predict)):
    row = [i+1, y_predict[i]]
    output.append(row)
print("Time elapsed:", t_end-t_start)

output = pd.DataFrame(output, columns = ['id', 'duration_label'])
output.to_csv('Final' + '_Stacker.csv', index = False)