In [9]:
import pandas as pd
import numpy as np
import pickle
import time
import scipy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Import train data

import recipe_train.csv

In [10]:
base_data = pd.read_csv("recipe_train.csv").values

seperate n_steps and n_ingredients

In [11]:
n_steps_ingr = []
for i in range(len(base_data)):
    lst = []
    for j in [1,2]:
        lst.append(base_data[i][j])
    n_steps_ingr.append(lst)
    
n_steps_ingr = np.array(n_steps_ingr)

seperate duration_label

In [12]:
duration_label = [i[-1] for i in base_data]
duration_label = np.array(duration_label)

import countvec

In [13]:
countvec_name = scipy.sparse.load_npz('recipe_text_features_countvec/train_name_vec.npz').toarray()
countvec_steps = scipy.sparse.load_npz('recipe_text_features_countvec/train_steps_vec.npz').toarray()
countvec_ingr = scipy.sparse.load_npz('recipe_text_features_countvec/train_ingr_vec.npz').toarray()

In [14]:
countvec_name = countvec_name.astype('int16')
countvec_steps = countvec_steps.astype('int16')
countvec_ingr = countvec_ingr.astype('int16')

import doc2vec100

In [15]:
doc2vec_name = pd.read_csv('recipe_text_features_doc2vec100/train_name_doc2vec100.csv', header=None).values
doc2vec_steps = pd.read_csv('recipe_text_features_doc2vec100/train_steps_doc2vec100.csv', header=None).values
doc2vec_ingr = pd.read_csv('recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv', header=None).values

Merge selected features

In [16]:
arrays = [countvec_name, doc2vec_name, countvec_steps, doc2vec_steps, countvec_ingr, doc2vec_ingr, n_steps_ingr]
data = np.hstack(arrays)

# Create test and train set

In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Rescale the input features
scaler = MinMaxScaler(feature_range=(0,1))
data = scaler.fit_transform(data)

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(data, duration_label, test_size=0.2, random_state=7)

Release memory

In [19]:
import gc

In [20]:
del countvec_name
del countvec_steps
del countvec_ingr
del doc2vec_name
del doc2vec_steps
del doc2vec_ingr
del base_data
del n_steps_ingr
del duration_label
del data

gc.collect()

110

# Export Function

In [35]:
def export_pred(prediction_data, filename):
    prediction_dict = {'id':  [i for i in range(1,len(prediction_data)+1)],'duration_label': prediction_data}
    pred_df = pd.DataFrame (prediction_dict, columns = ['id','duration_label'])
    filename = filename+".csv"
    pred_df.to_csv(filename, index=None)

    return pred_df

# Feature Selection

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif # use this for classification tasks
from sklearn.feature_selection import f_regression # use this for regression tasks
from sklearn.feature_selection import chi2

score_func = chi2

In [14]:
kbest = SelectKBest(score_func=chi2, k=750)
kbest.fit(X_train, Y_train)

SelectKBest(k=750, score_func=<function chi2 at 0x7fe4c621b1f0>)

In [15]:
# Transform (remove features not selected)
X_train = kbest.transform(X_train)
X_test = kbest.transform(X_test)

# Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

Create logistic regression classfier

In [17]:
lrc = LogisticRegression(max_iter = 1000)

Test lrc on test,train. (0.2=test)

In [18]:
start = time.time()

lrc.fit(X_train, Y_train)

fin = time.time()

print(fin-start, 'seconds')

8.602735042572021 seconds


In [19]:
lrc.score(X_test, Y_test)

0.798125

In [31]:
lrc_predict = lrc.predict(X_test)

In [48]:
export_pred(lrc_predict, "lrc_predict")

Unnamed: 0,id,duration_label
0,1,1.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0
...,...,...
7995,7996,2.0
7996,7997,1.0
7997,7998,2.0
7998,7999,1.0


# xGBoost

In [20]:
import xgboost as xgb

Create xgboost classifier

In [21]:
xgbc = xgb.XGBClassifier(learning_rate = 0.2, objective='multi:softmax')

Test xgbc on test,train. (0.2=test)

In [22]:
start = time.time()

xgbc.fit(X_train, Y_train)

fin = time.time()

print(fin-start, 'seconds')



141.43824100494385 seconds


In [23]:
xgbc.score(X_test, Y_test)



0.8065

# SVM

In [22]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [23]:
start = time.time()
svm_linear_clf = LinearSVC(random_state=1, C=1, max_iter=10000)
svm_linear_clf.fit(X_train, Y_train)
end = time.time()
print("time: "+str(end-start))

KeyboardInterrupt: 

In [None]:
svm_linear_clf.score(X_test, Y_test)

In [28]:
svm_predict = svm_linear_clf.predict(X_test)

In [47]:
export_pred(svm_predict, "svm_predict")

Unnamed: 0,id,duration_label
0,1,1.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0
...,...,...
7995,7996,2.0
7996,7997,1.0
7997,7998,2.0
7998,7999,1.0


# Stacking

Import results of base classifiers (if need)

In [24]:
lrc_csv = pd.read_csv('lrc_predict.csv')['duration_label']

In [25]:
svm_csv = pd.read_csv('svm_predict.csv')['duration_label']

In [26]:
xgb_csv = pd.read_csv('xgb_predict.csv')['duration_label']

In [27]:
meta_data = pd.concat([lrc_csv, svm_csv, xgb_csv], axis=1)


In [28]:
X_train_meta, X_test_meta, Y_train_meta, Y_test_meta = \
train_test_split(meta_data, Y_test, test_size=0.2, random_state=7)

In [31]:
meta_classifier = LogisticRegression(max_iter=1000)

In [32]:
meta_classifier.fit(X_train_meta, Y_train_meta)

LogisticRegression(max_iter=1000)

In [33]:
meta_classifier.score(X_test_meta, Y_test_meta)

0.8

In [34]:
meta_classifier.score(X_train_meta, Y_train_meta)

0.80265625

In [44]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

cv_meta_classifier = cross_val_score(LogisticRegression(max_iter=1000), meta_data, Y_test, cv=10)

In [45]:
print(cv_meta_classifier.mean())
print(cv_meta_classifier.min())
print(cv_meta_classifier.max())

0.8026249999999999
0.78625
0.83125


# Predicting Test Set

Train with train set

In [15]:
start = time.time()

xgbc.fit(data, duration_label)

fin = time.time()

print(fin-start, 'seconds')



2595.1640968322754 seconds


import recipe_test.csv

In [19]:
base_data_test = pd.read_csv("recipe_test.csv").values

seperate n_steps and n_ingredients

In [20]:
n_steps_ingr_test = []
for i in range(len(base_data_test)):
    lst = []
    for j in [1,2]:
        lst.append(base_data_test[i][j])
    n_steps_ingr_test.append(lst)
    
n_steps_ingr_test = np.array(n_steps_ingr_test)

import countvec

In [22]:
countvec_name_test = scipy.sparse.load_npz('recipe_text_features_countvec/test_name_vec.npz').toarray()
countvec_steps_test = scipy.sparse.load_npz('recipe_text_features_countvec/test_steps_vec.npz').toarray()
countvec_ingr_test = scipy.sparse.load_npz('recipe_text_features_countvec/test_ingr_vec.npz').toarray()

import doc2vec100

In [23]:
doc2vec_name_test = pd.read_csv('recipe_text_features_doc2vec100/test_name_doc2vec100.csv', header=None).values
doc2vec_steps_test = pd.read_csv('recipe_text_features_doc2vec100/test_steps_doc2vec100.csv', header=None).values
doc2vec_ingr_test = pd.read_csv('recipe_text_features_doc2vec100/test_ingr_doc2vec100.csv', header=None).values

Merge selected features

In [24]:
arrays_test = [countvec_name_test, doc2vec_name_test, countvec_steps_test, doc2vec_steps_test, countvec_ingr_test, doc2vec_ingr_test, n_steps_ingr_test]
data_test = np.hstack(arrays_test)

Release memory

In [25]:
del countvec_name_test
del countvec_steps_test
del countvec_ingr_test
del doc2vec_name_test
del doc2vec_steps_test
del doc2vec_ingr_test

gc.collect()

265

Predict test set

In [26]:
start = time.time()

prediction_data = xgbc.predict(data_test)

fin = time.time()

print(fin-start, 'seconds')

1.4767093658447266 seconds


In [27]:
prediction_dict = {'id':  [i for i in range(1,len(prediction_data)+1)],'duration_label': prediction_data}

pred_df = pd.DataFrame (prediction_dict, columns = ['id','duration_label'])

In [26]:
pred_df

Unnamed: 0,id,duration_label
0,1,2.0
1,2,1.0
2,3,1.0
3,4,1.0
4,5,1.0
...,...,...
9995,9996,2.0
9996,9997,1.0
9997,9998,1.0
9998,9999,2.0


In [28]:
pred_df.to_csv("prediction_xgboost_train_0.8_eta_0.2.csv", index=None)