In [8]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
import time

In [13]:
# Count Vectoriser process
# Loads the three pickled vectoriser objects (one per text column) and the
# train/test CSV files.

import pickle


def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the object has been read, rather than being left open until garbage
    collection.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load .pkl files
    # that come from a trusted source.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Each loaded object exposes a `vocabulary_` attribute
# (presumably a fitted CountVectorizer, going by the file names -- confirm).
n_vocab = _load_pickle("train_name_countvectorizer.pkl")
n_vocab_dict = n_vocab.vocabulary_

s_vocab = _load_pickle("train_steps_countvectorizer.pkl")
s_vocab_dict = s_vocab.vocabulary_

i_vocab = _load_pickle("train_ingr_countvectorizer.pkl")
i_vocab_dict = i_vocab.vocabulary_

train = pd.read_csv('recipe_train.csv')
# Every column except the last is a feature; the last column is the target.
X_train_raw = train.iloc[:,:-1]
y_train = train.iloc[:,-1]

X_test_raw = pd.read_csv('recipe_test.csv')

In [10]:
# Vectorise the 'name', 'steps' and 'ingredients' text columns.
# For each column the vectoriser is fitted on the training text and then
# applied to the test text; both sparse results are converted with .todense().

# Raw text of each column as plain Python lists (train, then test).
Xtr_name_txt, Xtr_steps_txt, Xtr_ingr_txt = [
    list(X_train_raw[col]) for col in ('name', 'steps', 'ingredients')
]
Xte_name_txt, Xte_steps_txt, Xte_ingr_txt = [
    list(X_test_raw[col]) for col in ('name', 'steps', 'ingredients')
]

# Tuple elements are evaluated left to right, so every vectoriser is fitted on
# the training text before it transforms the test text.
Xtr_name, Xte_name = (
    n_vocab.fit_transform(Xtr_name_txt).todense(),
    n_vocab.transform(Xte_name_txt).todense(),
)

Xtr_steps, Xte_steps = (
    s_vocab.fit_transform(Xtr_steps_txt).todense(),
    s_vocab.transform(Xte_steps_txt).todense(),
)

Xtr_ingr, Xte_ingr = (
    i_vocab.fit_transform(Xtr_ingr_txt).todense(),
    i_vocab.transform(Xte_ingr_txt).todense(),
)

In [11]:
# Swapping the columns with the CountVectoriser fitted values

# Work on copies: a plain `X_train = X_train_raw` only creates a second name
# for the same DataFrame, so the assignments below would overwrite the raw text
# columns too (making the vectorising cell impossible to re-run) and, because
# X_train_raw is itself a slice of `train`, could trigger pandas
# SettingWithCopy warnings.
X_train = X_train_raw.copy()
X_test = X_test_raw.copy()

# NOTE(review): the Xtr_*/Xte_* values come from `.todense()` and so are 2-D
# document-term matrices; assigning a 2-D matrix to a single column may not
# behave as intended on every pandas version -- confirm.
X_train['name'] = Xtr_name
X_train['steps'] = Xtr_steps
X_train['ingredients'] = Xtr_ingr

X_test['name'] = Xte_name
X_test['steps'] = Xte_steps
X_test['ingredients'] = Xte_ingr

In [87]:
# Basic classifiers borrowed from the prac
# Each model is fitted on the training data, used to predict the test data, and
# its predictions are written to '<title>_CV.csv'.
models = [DummyClassifier(strategy='most_frequent'),
          GaussianNB(),
          MultinomialNB(),
          svm.LinearSVC(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LogisticRegression()]
# File-name prefixes, in the same order as `models`.
titles = ['Zero-R',
          'GNB',
          'MNB',
          'LinearSVC',
          'Decision Tree',
          'KNN',
          'Logistic Regression']

for title, model in zip(titles, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Named `submission` rather than `exit` so the builtin / IPython `exit`
    # is not shadowed for the rest of the session.
    # Ids are 1-based row positions of the test set.
    submission = pd.DataFrame({
        'id': range(1, len(y_pred) + 1),
        'duration_label': y_pred,
    })
    submission.to_csv(title + '_CV.csv', index = False)



In [26]:
# Stacking Classifiers
# Combines four base classifiers by a per-row majority vote over their
# predicted labels and writes the result to 'Combined_GNB_DTC_KNN_LR.csv'.
#
# An earlier variant also used a Zero-R model; its fit/predict lines had been
# commented out while a loop still read `y_pred_zr` / `ZR`, which raises
# NameError on a fresh kernel. That leftover loop has been removed.

gnb = GaussianNB()
dtc = DecisionTreeClassifier()
knn = KNeighborsClassifier()
lr = LogisticRegression()

# Fit in the same order as before: GNB, decision tree, KNN, logistic regression.
for clf in (gnb, dtc, knn, lr):
    clf.fit(X_train, y_train)

y_pred_gnb = gnb.predict(X_test)
y_pred_dtc = dtc.predict(X_test)
y_pred_knn = knn.predict(X_test)
y_pred_lr = lr.predict(X_test)

# Per-model [id, label] rows; ids are 1-based row positions of the test set.
GNB = [[i + 1, label] for i, label in enumerate(y_pred_gnb)]
DTC = [[i + 1, label] for i, label in enumerate(y_pred_dtc)]
KNN = [[i + 1, label] for i, label in enumerate(y_pred_knn)]
LR = [[i + 1, label] for i, label in enumerate(y_pred_lr)]

FINAL = []
for i in range(len(y_pred_gnb)):
    preds = [GNB[i][1], DTC[i][1], KNN[i][1], LR[i][1]]
    # Most frequent label wins. With four voters a tie is possible; `max`
    # then returns whichever tied label comes first when iterating the set.
    res = max(set(preds), key = preds.count)
    FINAL.append([i+1, res])

combined = pd.DataFrame(FINAL, columns = ['id', 'duration_label'])
combined.to_csv('Combined_GNB_DTC_KNN_LR.csv', index = False)

In [7]:
# Bag-of-Words process
# Loads the sparse matrices stored in the .npz files for each text column.
# NOTE: these assignments rebind the Xtr_*/Xte_* names that the CountVectoriser
# cell also assigns, so the values depend on which cell ran last.

# Import the subpackage explicitly: a bare `import scipy` does not itself
# import `scipy.sparse`, so `scipy.sparse.load_npz` would only work if some
# other library had already imported it.
import scipy.sparse

Xtr_name = scipy.sparse.load_npz('train_name_vec.npz')
Xte_name = scipy.sparse.load_npz('test_name_vec.npz')

Xtr_steps = scipy.sparse.load_npz('train_steps_vec.npz')
Xte_steps = scipy.sparse.load_npz('test_steps_vec.npz')

Xtr_ingr = scipy.sparse.load_npz('train_ingr_vec.npz')
Xte_ingr = scipy.sparse.load_npz('test_ingr_vec.npz')

In [None]:
for row in Xtr_ingr