In [1]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string

stoplist = stopwords.words('english')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def lower(text):
    result = " ".join([word.lower() for word in text.split()])
    return result


def remove_punc(text):
    remove = str.maketrans('', '', (string.punctuation + '£'))
    return text.translate(remove)


def remove_nums(text):
    remove = str.maketrans('', '', string.digits)
    return text.translate(remove)


def remove_stopwords(text):
    text = " ".join([word for word in text.split() if word not in stoplist])
    return text


def stemmer_nltk(text):
    stemmed = " ".join(stemmer.stem(word) for word in text.split())
    return stemmed


def lemmatizer_nltk(text):
    lemmatized = " ".join(lemmatizer.lemmatize(word) for word in text.split())
    return lemmatized

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np


def split_dataset(df):
    train, test, ytrain, ytest = train_test_split(df['item_name'], df['category'], test_size=0.2, random_state=88)

    return train, test, ytrain, ytest


def create_labels(train_labels, test_labels, labels):
    encoder = LabelEncoder()

    encoder.fit(labels)

    y_train = encoder.transform(train_labels)
    y_val = encoder.transform(test_labels)

    return y_train, y_val


def get_vectors(train_data, val_data):
    vectorizer = CountVectorizer()

    train_data = vectorizer.fit_transform(train_data)
    val_data = vectorizer.transform(val_data)

    return train_data, val_data


def get_lr_model(x, y, iter=100):
    lr = LogisticRegression(class_weight='balanced', max_iter=iter, n_jobs=8).fit(x, y)

    return lr

def get_nb_model(x,y):
    nb = MultinomialNB().fit(x, y)

    return nb

def get_svm_model(x,y):
    svm = SVC().fit(x,y)

    return svm


def get_acc(m, x, y):
    predictions = m.predict(x)

    acc = np.mean(predictions == y)*100

    return acc

Get data

In [4]:
import pandas as pd

amazon_data = pd.read_csv("amazon-pqa-reduced-100.csv",index_col=0)
amazon_labels = amazon_data.category.unique()
shopmania_data = pd.read_csv("shopmania-reduced-100.csv", index_col=0)
shopmania_labels = shopmania_data.category.unique()
custom_data = pd.read_csv("products_list_final.csv", index_col=0)
custom_labels = custom_data.category.unique()
custom_data.columns = ["store_name", "item_name", "category"]

datasets = {'amazon': [amazon_data.copy(), amazon_labels], 'shopmania': [shopmania_data.copy(), shopmania_labels], 'custom': [custom_data.copy(), custom_labels]}

In [5]:
custom_data.sample(5, random_state=115)

Unnamed: 0,store_name,item_name,category
574,TESCO,CKN NUGGETS,groceries
557,TESCO,SCRATCHINGS,snacks
56,TESCO,SOFT CHEESE,groceries
380,lidl,Green Tee Pomegrana,drinks
496,LiDL,Tenderstem Broccoli,groceries


In [6]:
amazon_data.sample(3)

Unnamed: 0,item_name,category
3452,MSI AMD Radeon R9 290 4GB GDDR5 2DVI/HDMI/Disp...,graphics_cards
2121,Silverstone Technology CS380B Silverstone DIY ...,computer_cases
5444,Baja Designs Ford F150 2017 Raptor S2 Reverse ...,light_bars


In [7]:
shopmania_data.sample(3)

Unnamed: 0,item_name,category
928,ncaa lightweight water resistant economy canop...,Toys
781,plum organics baby food organic pumpkin and ba...,Feeding
4487,douglas contour 4 .30 1 10 twist ss 4 contour ...,Digital Camera and Camcorder Accessories


In [8]:
custom_data.sample(3)

Unnamed: 0,store_name,item_name,category
235,TESCO,CAFFE LATTE,drinks
7,TESCO,KITCHEN TOWELS,cleaning & laundry
110,T.K.maxx,ACCESSORIES & LIFESTYLE,clothes & accessories


Pre-processing, experiment 1

In [9]:
from time import process_time

In [10]:
def run_remove_punctuation(df, name, labels):
    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)

    X_train, X_test, Y_train, Y_test = split_dataset(df)
    print("Checkpoint: split dataset")

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)
    print("Checkpoint: encoded labels")

    X_train, X_test = get_vectors(X_train, X_test)
    print("Checkpoint: vectorised inputs")

    print("Checkpoint: starting LR")
    lr_model = get_lr_model(X_train, Y_train)


    lr_acc = get_acc(lr_model, X_test, Y_test)
    print(f"Logistic regression accuracy on {name} dataset: {lr_acc:.2f}%")
    
    return lr_acc

In [11]:
for key, dataset in datasets.items():
    name = key

    start = process_time()

    lr_acc = run_remove_punctuation(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

Checkpoint: split dataset
Checkpoint: encoded labels
Checkpoint: vectorised inputs
Checkpoint: starting LR
Logistic regression accuracy on amazon dataset: 84.20%
Time taken to process amazon dataset: 0.00m
Checkpoint: split dataset
Checkpoint: encoded labels
Checkpoint: vectorised inputs
Checkpoint: starting LR
Logistic regression accuracy on shopmania dataset: 78.19%
Time taken to process shopmania dataset: 0.00m
Checkpoint: split dataset
Checkpoint: encoded labels
Checkpoint: vectorised inputs
Checkpoint: starting LR
Logistic regression accuracy on custom dataset: 79.55%
Time taken to process custom dataset: 0.00m


In [12]:
def reset_data():
    datasets['amazon'][0] = amazon_data.copy()
    datasets['shopmania'][0] = shopmania_data.copy()
    datasets['custom'][0] = custom_data.copy()

In [13]:
def run_remove_numbers(df, name, labels):
    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(remove_nums)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)
    X_train, X_test = get_vectors(X_train, X_test)
    lr_model = get_lr_model(X_train, Y_train)

    lr_acc = get_acc(lr_model, X_test, Y_test)
    print(f"Logistic regression accuracy on {name} dataset: {lr_acc:.2f}%")

    return lr_acc

In [15]:
reset_data()

# Reset dataset to compare
for key, dataset in datasets.items():
    name = key

    start = process_time()

    lr_acc = run_remove_numbers(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

Logistic regression accuracy on amazon dataset: 84.40%
Time taken to process amazon dataset: 0.00m
Logistic regression accuracy on shopmania dataset: 78.68%
Time taken to process shopmania dataset: 0.00m
Logistic regression accuracy on custom dataset: 81.06%
Time taken to process custom dataset: 0.00m


In [26]:
def run_stemmer(df, name, labels):
    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(stemmer_nltk)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)
    X_train, X_test = get_vectors(X_train, X_test)
    lr_model = get_lr_model(X_train, Y_train)

    lr_acc = get_acc(lr_model, X_test, Y_test)
    print(f"Logistic regression accuracy on {name} dataset: {lr_acc:.2f}%")

    return lr_acc

In [27]:
reset_data()

# Reset dataset to compare
for key, dataset in datasets.items():
    name = key

    start = process_time()

    lr_acc = run_stemmer(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

Logistic regression accuracy on amazon dataset: 84.65%
Time taken to process amazon dataset: 0.04m
Logistic regression accuracy on shopmania dataset: 79.13%
Time taken to process shopmania dataset: 0.03m
Logistic regression accuracy on custom dataset: 78.03%
Time taken to process custom dataset: 0.00m


In [28]:
def run_lemmatizer(df, name, labels):
    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(lemmatizer_nltk)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)
    X_train, X_test = get_vectors(X_train, X_test)
    lr_model = get_lr_model(X_train, Y_train,5)

    lr_acc = get_acc(lr_model, X_test, Y_test)

    print(f"Logistic regression accuracy on {name} dataset: {lr_acc:.2f}%")

    return lr_acc

In [29]:
reset_data()

# Reset dataset to compare
for key, dataset in datasets.items():
    name = key

    start = process_time()

    lr_acc = run_lemmatizer(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

Logistic regression accuracy on amazon dataset: 83.25%
Time taken to process amazon dataset: 0.03m
Logistic regression accuracy on shopmania dataset: 69.89%
Time taken to process shopmania dataset: 0.01m
Logistic regression accuracy on custom dataset: 77.27%
Time taken to process custom dataset: 0.00m


In [30]:
def run_stopword_removal(df, name, labels):
    
    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(remove_stopwords)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)
    X_train, X_test = get_vectors(X_train, X_test)
    lr_model = get_lr_model(X_train, Y_train)

    lr_acc = get_acc(lr_model, X_test, Y_test)

    print(f"Logistic regression accuracy on {name} dataset: {lr_acc:.2f}%")

    return lr_acc

In [31]:
reset_data()

# Reset dataset to compare
for key, dataset in datasets.items():
    name = key

    start = process_time()

    lr_acc = run_stopword_removal(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

Logistic regression accuracy on amazon dataset: 84.00%
Time taken to process amazon dataset: 0.01m
Logistic regression accuracy on shopmania dataset: 78.23%
Time taken to process shopmania dataset: 0.01m
Logistic regression accuracy on custom dataset: 79.55%
Time taken to process custom dataset: 0.00m


In [32]:
results = pd.DataFrame({}, columns=["name", "remove punctuation", "remove numbers", "stemmed", "lemmatized", "remove stopwords"])

for key, dataset in datasets.items():
    combined = list([key] + dataset[2:])
    print(combined)
    new_line = pd.DataFrame([combined], columns=["name", "remove punctuation", "remove numbers", "stemmed", "lemmatized", "remove stopwords"])

    results = pd.concat([results, new_line])

['amazon', 84.2, 84.39999999999999, 84.65, 83.25, 84.0]
['shopmania', 78.18911685994647, 78.67975022301516, 79.1257805530776, 69.89295272078502, 78.23371989295272]
['custom', 79.54545454545455, 81.06060606060606, 78.03030303030303, 77.27272727272727, 79.54545454545455]


In [33]:
results.to_csv("results2.csv")

In [34]:
results

Unnamed: 0,name,remove punctuation,remove numbers,stemmed,lemmatized,remove stopwords
0,amazon,84.2,84.4,84.65,83.25,84.0
0,shopmania,78.189117,78.67975,79.125781,69.892953,78.23372
0,custom,79.545455,81.060606,78.030303,77.272727,79.545455


In [35]:
def run_test(df, name, labels):
    
    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(remove_nums)
    df['item_name'] = df['item_name'].apply(stemmer_nltk)

    X_train, X_test, Y_train, Y_test = split_dataset(df)

    Y_train, Y_test = create_labels(Y_train, Y_test, labels)
    X_train, X_test = get_vectors(X_train, X_test)
    lr_model = get_lr_model(X_train, Y_train)

    lr_acc = get_acc(lr_model, X_test, Y_test)

    print(f"Logistic regression accuracy on {name} dataset: {lr_acc:.2f}%")

    return lr_acc

In [36]:
reset_data()

# Reset dataset to compare
for key, dataset in datasets.items():
    name = key

    start = process_time()

    lr_acc = run_test(dataset[0], name, dataset[1])

    stop = process_time()

    print(f"Time taken to process {name} dataset: {(stop-start)/60:.2f}m")
    datasets[key].append(lr_acc)

Logistic regression accuracy on amazon dataset: 85.05%
Time taken to process amazon dataset: 0.04m
Logistic regression accuracy on shopmania dataset: 78.81%
Time taken to process shopmania dataset: 0.02m
Logistic regression accuracy on custom dataset: 80.30%
Time taken to process custom dataset: 0.00m


In [37]:
reset_data()

for key, dataset in datasets.items():

    df = dataset[0]

    df['item_name'] = df['item_name'].astype(str)
    df['item_name'] = df['item_name'].apply(lower)
    df['item_name'] = df['item_name'].apply(remove_punc)
    df['item_name'] = df['item_name'].apply(remove_nums)
    df['item_name'] = df['item_name'].apply(stemmer_nltk)

    csv_name = '../cleaned/' + key + '.csv'

    df.to_csv(csv_name)

OSError: Cannot save file into a non-existent directory: 'cleaned'