In [1]:
from dataset.dataset import WhatsCookingDataset
from dataset.dataset import WhatsCookingStemmedDataset
from dataset.dataset import WhatsCookingStemmedSeparatedDataset

from base_solver import BaseSolver
from SVM import SVC_solver

from base_data_processor import BaseDataProcessor
from processors.simple_ingredients_encoder import SimpleIngredientsEncoder
from processors.tf_idf import TfIdf

from sklearn.feature_extraction.text import TfidfVectorizer

import importlib  # for reloading module

In [2]:
# dataset loading
# Instantiate the raw dataset; judging by the log printed under this cell,
# construction is what triggers reading of the training data.
dataset = WhatsCookingDataset()
# Translate each training entry's cuisine label into its integer id.
train_y = [
    dataset.cuisine2id[recipe.cuisine]
    for recipe in dataset.cuisines
]

# Entries returned by the dataset's test-file loader.
test_cuisines = dataset.load_test_file()

# "Stemmed + separated" flavour of the same data.
# NOTE(review): stem=False is passed even though every name here (and the log
# message) says "stemmed" - confirm whether stemming is meant to be on.
dataset_stemmed = WhatsCookingStemmedSeparatedDataset(stem=False)
train_x_stemmed = dataset_stemmed.cuisines

# Labels are rebuilt from this dataset's own cuisine2id mapping.
train_y_stemmed = [dataset_stemmed.cuisine2id[recipe.cuisine] for recipe in dataset_stemmed.cuisines]

test_cuisines_stemmed = dataset_stemmed.load_test_file()

Loading What's Cooking training dataset ...
100%|██████████| 39774/39774 [00:00<00:00, 58169.56it/s]
100%|██████████| 9944/9944 [00:00<00:00, 204579.12it/s]
Successfully loaded What's Cooking training dataset!
Loading What's Cooking testing dataset ...
Successfully loaded What's Cooking testing dataset!
Loading and stemming separated What's Cooking training dataset ...
100%|██████████| 39774/39774 [01:37<00:00, 409.01it/s]
  0%|          | 0/9944 [00:00<?, ?it/s]Successfully loaded stemmed and separated What's Cooking training dataset!
# of cuisines = 20; # of ingredients = 3082
Loading and stemming separated What's Cooking testing dataset ...
100%|██████████| 9944/9944 [00:18<00:00, 523.89it/s]Successfully loaded stemmed and separated What's Cooking testing dataset!



In [3]:
# pre-processing
# Collapse each entry's ingredient list into a single lower-cased,
# space-separated string so it can be tokenised by the text vectorizer.
train_as_text = [' '.join(cuisine.ingredients).lower() for cuisine in dataset_stemmed.cuisines]
test_as_text = [' '.join(cuisine.ingredients).lower() for cuisine in test_cuisines_stemmed]

# binary=True makes the term-frequency part 0/1 (presence/absence); the IDF
# weighting and normalisation are still applied, so outputs are not just 0/1.
tfidf_enc = TfidfVectorizer(binary=True)

# Learn vocabulary and IDF weights from the training text only, then apply the
# same fitted transform to the test text.
# Cast to float32 rather than float16: float16 is not among the dtypes
# scipy.sparse supports (recent SciPy releases reject it outright), and it
# would round the TF-IDF weights to roughly 3 significant digits.
train_x_tfidf = tfidf_enc.fit_transform(train_as_text).astype('float32')
test_x_tfidf = tfidf_enc.transform(test_as_text).astype('float32')

In [6]:
# training and testing
# Re-execute the solver module so that edits made to it after the initial
# import are picked up without restarting the kernel.
importlib.reload(SVC_solver)

# Build the solver around the raw dataset object; the variant is chosen by name.
s = SVC_solver.SVCSolver(
    dataset,
    method='lsvc_ovr',
)
print('Now solving using SVC with {}'.format(s.method))

# NOTE(review): features and labels come from dataset_stemmed, while the solver
# and the test entries come from dataset - confirm both datasets share the same
# cuisine2id mapping and row order.
s.train(train_x_tfidf, train_y_stemmed)
s.test(test_x_tfidf, test_cuisines)

Now solving using SVC with lsvc_ovr
Training started...
training time: 32.89413404464722 seconds
training score: 0.8804243978478403
Saving model...
Testing started...
