In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from keras.utils import np_utils


def split_dataset(df):
    train, test, ytrain, ytest = train_test_split(df['item_name'], df['category'], test_size=0.2, random_state=88)

    return train, test, ytrain, ytest


def create_labels(train_labels, test_labels, labels):
    encoder = LabelEncoder()

    encoder.fit(labels)

    y_train = encoder.transform(train_labels)
    y_val = encoder.transform(test_labels)

    return y_train, y_val
    

def create_one_hot_labels(Y_train, Y_test, labels):
    encoder = LabelEncoder()
    encoder.fit(labels)

    Y_train = encoder.transform(Y_train)
    Y_test = encoder.transform(Y_test)

    n_classes = np.max(Y_train) + 1

    Y_train = np_utils.to_categorical(Y_train, n_classes)
    Y_test = np_utils.to_categorical(Y_test, n_classes)

    return Y_train, Y_test, n_classes


def transform_data(train, val):
    vectorizer = CountVectorizer(max_features=5000)
    transformer = TfidfTransformer()

    X_train = vectorizer.fit_transform(train) # BoW
    X_train = transformer.fit_transform(X_train) # TF-IDF

    X_val = vectorizer.transform(val)
    X_val = transformer.transform(X_val)

    print(X_train.shape)

    return X_train, X_val


def get_nb_model(x,y):
    nb = MultinomialNB().fit(x, y)
    return nb


def get_svm_model(x,y):
    svm = LinearSVC(max_iter=1000).fit(x,y)
    return svm


def get_acc(m, x, y):
    predictions = m.predict(x)
    acc = np.mean(predictions == y)*100
    return acc

Get data

In [2]:
import pandas as pd

amazon_data = pd.read_csv("../cleaned/amazon-cleaned.csv",index_col=0)
amazon_labels = amazon_data.category.unique()

shopmania_data = pd.read_csv("../cleaned/shopmania-cleaned.csv", index_col=0)
shopmania_labels = shopmania_data.category.unique()

custom_data = pd.read_csv("../cleaned/custom-cleaned.csv", names=["store_name", "item_name", "category"])
custom_labels = custom_data.category.unique()

datasets = {'amazon': [amazon_data.copy(), amazon_labels], 'shopmania': [shopmania_data.copy(), shopmania_labels], 'custom': [custom_data.copy(), custom_labels]}

In [None]:
amazon_data.sample(3)

In [None]:
shopmania_data.sample(3)

In [None]:
custom_data.sample(3)

Models, experiments

In [3]:
from time import process_time

In [4]:
def run_nb(df, name, labels):

    df['item_name'] = df['item_name'].astype(str)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)

    X_train, X_test = transform_data(X_train, X_test)

    nb_model = get_nb_model(X_train, Y_train)

    nb_acc = get_acc(nb_model, X_test, Y_test)
    print(f"Naive Bayes accuracy on {name} dataset: {nb_acc:.2f}%")
    
    return nb_acc

In [None]:
for key, dataset in datasets.items():
    name = key

    print()

    start = process_time()

    lr_acc = run_nb(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

In [None]:
def run_svm(df, name, labels):

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)

    X_train, X_test = transform_data(X_train, X_test)

    svm_model = get_svm_model(X_train, Y_train)

    svm_acc = get_acc(svm_model, X_test, Y_test)
    print(f"SVM accuracy on {name} dataset: {svm_acc:.2f}%")
    
    return svm_acc

In [None]:
for key, dataset in datasets.items():
    name = key

    print()

    start = process_time()

    svm_acc = run_svm(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(svm_acc)

In [5]:
from keras.models import Sequential
import keras.layers as l

import tensorflow as tf

def get_nn_model(n_classes):
    m = Sequential()

    m.add(l.Dense(128, activation='relu'))
    m.add(l.Dense(n_classes, activation='softmax'))

    optim = tf.keras.optimizers.Adam()

    m.compile(
    loss="categorical_crossentropy", optimizer=optim, metrics=['accuracy']
    )

    return m

In [6]:
def get_metrics(history, name):
    metrics = [[name, history.history['val_accuracy'][-1], history.history['val_loss'][-1]]]
    
    return history.history['val_accuracy'][-1]

In [14]:
model_graph = Sequential()

In [31]:
def run_nn(df, name, labels):
    df['item_name'] = df['item_name'].astype(str)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test, n_classes = create_one_hot_labels(Y_train, Y_test, labels)

    X_train, X_test = transform_data(X_train, X_test)

    X_train = X_train.todense()
    X_test = X_test.todense()

    nn_model = get_nn_model(n_classes)

    history = nn_model.fit(X_train, Y_train, batch_size=2, epochs=1, validation_data=(X_test, Y_test))
    if name == 'amazon':
        model_graph = nn_model

    tf.keras.utils.plot_model(nn_model, to_file=name+'.png', show_shapes=True, show_layer_names=True)

    visualizer(nn_model, format='png', view=True)
    metrics = get_metrics(history, name)
    print(f"NN accuracy on {name} dataset: {metrics:.2f}%")
    
    return metrics

In [32]:
for key, dataset in datasets.items():
    name = key

    start = process_time()

    nn_acc = run_nn(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(nn_acc)

(8000, 5000)
NN accuracy on amazon dataset: 0.85%
Time taken to process amazon dataset: 0.15m
(8966, 5000)
NN accuracy on shopmania dataset: 0.72%
Time taken to process shopmania dataset: 0.16m
(400, 396)
NN accuracy on custom dataset: 0.55%
Time taken to process custom dataset: 0.01m


In [30]:
from keras_visualizer import visualizer

visualizer(model_graph, format='png', view=True)

ValueError: Keras Visualizer: Error while visualizing

In [None]:
results = pd.DataFrame({}, columns=["name", "nb", "svm", "nn"])

for key, dataset in datasets.items():
    combined = list([key] + dataset[2:])
    print(combined)
    new_line = pd.DataFrame([combined], columns=["name", "nb", "svm", "nn"])

    results = pd.concat([results, new_line])

In [None]:
def hack(n):
    n = n*100
    return n

In [None]:
results['nn'] = results['nn'].apply(hack)


In [None]:
results.insert(1, 'lr', [85.85, 74.933095, 67.326733])

In [None]:
results

In [None]:
results.to_csv("classic_models.csv")