In [1]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import string

stoplist = stopwords.words('english')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def lower(text):
    result = " ".join([word.lower() for word in text.split()])
    return result


def remove_punc(text):
    remove = str.maketrans('', '', (string.punctuation + '£'))
    return text.translate(remove)


def remove_nums(text):
    remove = str.maketrans('', '', string.digits)
    return text.translate(remove)


def stemmer_nltk(text):
    stemmed = " ".join(stemmer.stem(word) for word in text.split())
    return stemmed

def remove_stopwords(text):
    text = " ".join([word for word in text.split() if word not in stoplist])
    return text

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


def split_dataset(df):
    train, test, ytrain, ytest = train_test_split(df['item_name'], df['category'], test_size=0.2, random_state=88)

    return train, test, ytrain, ytest


def create_labels(train_labels, test_labels, labels):
    encoder = LabelEncoder()

    encoder.fit(labels)

    y_train = encoder.transform(train_labels)
    y_val = encoder.transform(test_labels)

    return y_train, y_val


def get_vectors(train_data, val_data):
    vectorizer = CountVectorizer()

    train_data = vectorizer.fit_transform(train_data)
    val_data = vectorizer.transform(val_data)

    return train_data, val_data


def get_lr_model(x, y, iter=100):
    lr = LogisticRegression(class_weight='balanced', max_iter=iter, n_jobs=8).fit(x, y)

    return lr

def get_nb_model(x,y):
    nb = MultinomialNB().fit(x, y)

    return nb

def get_svm_model(x,y):
    svm = SVC().fit(x,y)

    return svm


def get_acc(m, x, y):
    predictions = m.predict(x)

    acc = np.mean(predictions == y)*100

    return acc

Get data

In [3]:
import pandas as pd

amazon_data = pd.read_csv("amazon-pqa-reduced-100.csv",index_col=0)
amazon_labels = amazon_data.category.unique()
shopmania_data = pd.read_csv("shopmania-reduced-100.csv", index_col=0)
shopmania_labels = shopmania_data.category.unique()
custom_data = pd.read_csv("products_list_final.csv", index_col=0)
custom_labels = custom_data.category.unique()
custom_data.columns = ["store_name", "item_name", "category"]

datasets = {'amazon': [amazon_data.copy(), amazon_labels], 'shopmania': [shopmania_data.copy(), shopmania_labels], 'custom': [custom_data.copy(), custom_labels]}

In [4]:
amazon_data.sample(3)

Unnamed: 0,item_name,category
9720,P10 Projector Eight Core S912 HD DLP Mini Inte...,video_projectors
1249,Sony DCR-DVD910 4MP DVD Handycam Camcorder wit...,camcorders
3191,Weathertech W298-W302-W304 All Weather Floor Mats,floor_mats


In [5]:
shopmania_data.sample(3)

Unnamed: 0,item_name,category
6452,jane iredale liquid minerals a foundation 30ml...,Women Cosmetics
1886,morphic 3603 m36 series mens watch,Watches
4279,pyle hd smart projector with built in dual cor...,Projectors


In [6]:
custom_data.sample(3)

Unnamed: 0,store_name,item_name,category
529,TESCO,Tesco Medium Free Range Eggs 12 Pack,groceries
324,TESCO,F/RANGE EGGS,groceries
142,TESCO,SMIRNOFF ICE,alcohol


Pre-processing, experiment 1

In [7]:
from time import process_time

In [8]:
def run_final(df, name, labels):
    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(remove_nums)
    df['item_name'] = df['item_name'].apply(remove_stopwords)
    df['item_name'] = df['item_name'].apply(stemmer_nltk)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)
    X_train, X_test = get_vectors(X_train, X_test)
    lr_model = get_lr_model(X_train, Y_train)

    lr_acc = get_acc(lr_model, X_test, Y_test)
    print(f"Logistic regression accuracy on {name} dataset: {lr_acc:.2f}%")

    return lr_acc

In [9]:
for key, dataset in datasets.items():
    name = key

    start = process_time()

    lr_acc = run_final(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

Logistic regression accuracy on amazon dataset: 84.85%
Time taken to process amazon dataset: 0.04m
Logistic regression accuracy on shopmania dataset: 78.99%
Time taken to process shopmania dataset: 0.03m
Logistic regression accuracy on custom dataset: 79.55%
Time taken to process custom dataset: 0.00m


In [10]:
results = pd.DataFrame({}, columns=["name", "remove numbers", "nums + stemmed", "nums + remove stopwords", "nums + stemmed + stopwords"])

for key, dataset in datasets.items():
    combined = list([key] + dataset[2:])
    print(combined)
    new_line = pd.DataFrame([combined], columns=["name", "remove numbers", "nums + stemmed", "nums + remove stopwords", "nums + stemmed + stopwords"])

    results = pd.concat([results, new_line])

['amazon', 84.85000000000001]


ValueError: 5 columns passed, passed data had 2 columns

In [None]:
results

Unnamed: 0,name,remove numbers,nums + stemmed,nums + remove stopwords,nums + stemmed + stopwords
0,amazon,84.4,85.05,84.85,84.75
0,shopmania,78.67975,78.813559,78.858162,78.902765
0,custom,81.060606,80.30303,79.545455,79.545455


In [11]:
def reset_data():
    datasets['amazon'][0] = amazon_data.copy()
    datasets['shopmania'][0] = shopmania_data.copy()
    datasets['custom'][0] = custom_data.copy()

In [12]:
reset_data()

for key, dataset in datasets.items():

    df = dataset[0]

    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(remove_nums)
    df['item_name'] = df['item_name'].apply(stemmer_nltk)

    csv_name = '../cleaned/' + key + '-cleaned.csv'

    df.to_csv(csv_name)

In [20]:
reset_data()

In [23]:
df = datasets['custom'][0]

df['item_name'] = df['item_name'].astype(str)
df['item_name'] = df['item_name'].apply(lower)
df['item_name'] = df['item_name'].apply(remove_punc)
df['item_name'] = df['item_name'].apply(remove_nums)
#df['item_name'] = df['item_name'].apply(remove_stopwords)
#df['item_name'] = df['item_name'].apply(stemmer_nltk)

csv_name = '../cleaned/custom-cleaned.csv'

df.to_csv(csv_name)