In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import numpy as np
import pandas as pd
import csv
import seaborn as sns
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
import re
import warnings
import sys

In [8]:
# download the dataset
# !wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip

## Data preprocessing

In [6]:
df1  = pd.read_csv('dataset/train.csv', escapechar='\\', quoting=csv.QUOTE_NONE)

In [7]:
df1.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [8]:
df1.shape

(2903024, 5)

In [9]:
#check for null value
df1.isnull().any()

TITLE              True
DESCRIPTION        True
BULLET_POINTS      True
BRAND              True
BROWSE_NODE_ID    False
dtype: bool

In [10]:
#check for min and max label
max_label = np.max(df1.iloc[:, -1].unique())
min_label = np.min(df1.iloc[:, -1].unique())
max_label, min_label

(2893574, 0)

__Removal of duplicated rows is also a crucial part of data preprocessing. Lets check for duplicated rows__

In [11]:
#check for duplicated rows

duplicates = df1[df1.duplicated()]

print("Total number of duplicate rows = ", duplicates.shape[0])
duplicates.head()

Total number of duplicate rows =  101494


Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
2530,Sonari Omania Women's T-Shirt Bra Pack of 2,Omania-wire-free for comfort and support. 3/4t...,"[Wire-free for comfort and support.,3/4th+ cov...",Sonari,773
4914,La Verve Cotton Print Cushion Cover 16X16,Lounging at home gets a stylish dimension cour...,"[Premium Cotton Fabric,Long Washes Durability,...",LA VERVE,912
9615,BeautyNeeds Armband Big Temporary Tattoo Stick...,Type: Temporary Tattoo.Note: Not intended for ...,"[Type: Temporary Tattoo,Easy wear and easy clean]",BeautyNeeds,3482
10201,hangup Sherwani Set one top and one Bottom Set...,,[MOISTURE ABSORBENT::It absorbs twice as much ...,hangup,10200
10237,EVALUZE Printed Soft Silicone Rubber Cover for...,Soft Silicone rubber cover having design print...,[Kindly Check this case is For Oppo A5s launch...,EVALUZE,1045


In [12]:
# remove the duplicate rows
df2 = df1.copy().drop_duplicates(ignore_index=True)
assert df2.shape[0] == df1.shape[0]-duplicates.shape[0]
df2.shape

(2801530, 5)

In [13]:
df2.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [14]:
#### Remove the rows whose label count is less than a given threshold say 50 in this case

In [20]:
#concatenate all the columns together except the target column
def create_concatenated_col(df2):
    df3 = df2.copy()
    df3['CONCATENATED'] = pd.Series(df2[df2.columns[:-1]].fillna('').values.tolist()).str.join(' ')
    return df3

In [23]:
# dfg = create_concatenated_col(df2)
# dfg['CONCATENATED'][0]

In [24]:
df2.columns

Index(['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND', 'BROWSE_NODE_ID'], dtype='object')

In [25]:
#clean the CONCATENATED  column
stopword = set(stopwords.words('english'))
word_len  = WordNetLemmatizer()

def cleanPunctuation(sentence):
    '''clean the punctuations'''

    cl_sentence  = re.sub(r'[?|!|\'|"|#|\_]',r'',sentence)
    cl_sentence = re.sub(r'[.|,|)|(|\|/]',r'',cl_sentence)
    cl_sentence = re.sub(r'\\',r'',cl_sentence)
    cl_sentence = re.sub(r'[\[\]]',r'',cl_sentence)
    cl_sentence = cl_sentence.strip()
    cl_sentence = cl_sentence.replace("\n"," ")
    return cl_sentence

def keepAlphabet(sentence):
    '''keep only alphabets'''
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-z]+',' ',word)
        alpha_sent +=alpha_word
        alpha_sent+=" "
    alpha_sent =alpha_sent.strip()
    
    return alpha_sent

def remove_stopwords(sentence):
    '''removes stopwords'''
    sentence = sentence.lower()
    re_stop_words = re.compile(r"\b("+"|".join(stopword) + ")\\W", re.I)
    return re_stop_words.sub(" ", sentence)

def lemmatize(sentence):
    '''lemmatization'''
    stemSentence =[]
    
    for word in sentence.split():
        stem  = word_len.lemmatize(word)
        stemSentence.append(stem)
        

    return " ".join(stemSentence)

In [26]:
def clean_sentences(df2):
    df3 = create_concatenated_col(df2)
    #punctuaion cleaning
    df3['CONCATENATED'] = df3['CONCATENATED'].apply(cleanPunctuation)
    print("punctuation cleaning done.")
    
    #remove unncecessary alphabets
    df3['CONCATENATED'] = df3['CONCATENATED'].apply(keepAlphabet)
    print("Unnecessary alphabets removed.")
    
    #remove stopwords
    df3['CONCATENATED'] = df3['CONCATENATED'].apply(remove_stopwords)
    print("Stopwords removed.")
    
    #lemmatization
    df3['CONCATENATED'] = df3['CONCATENATED'].apply(lemmatize)
    print('Lemmatization done.')
    return df3
    

In [27]:
%%time
# clean the sentences
df3 = clean_sentences(df2)

punctuation cleaning done.
Unnecessary alphabets removed.
Stopwords removed.
Lemmatization done.
CPU times: user 30min 56s, sys: 22.1 s, total: 31min 18s
Wall time: 31min 24s


In [33]:
df3.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID,CONCATENATED
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0,pete cat bedtime blue doll inch pete cat coole...
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1,new yorker nyhm refrigerator magnet x new york...
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2,ultimate self sufficiency handbook complete gu...
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3,amway nutrilite kid chewable iron tablet nutri...
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4,teacher planner company lesson academic teache...


Since we have done data cleaning, lets save it for future use.

In [34]:
#save tha cleaned dataframe
temp_df = df3[['BROWSE_NODE_ID', 'CONCATENATED']] # without normalized labels
temp_df.to_csv('cleaned_df_final.csv', index=False)

## Baseline Model

In [72]:
#read the cleaned dataset
df3 = pd.read_csv('cleaned_df_final.csv')

In [73]:
df3.head()

Unnamed: 0,BROWSE_NODE_ID,CONCATENATED
0,0,pete cat bedtime blue doll inch pete cat coole...
1,1,new yorker nyhm refrigerator magnet x new york...
2,2,ultimate self sufficiency handbook complete gu...
3,3,amway nutrilite kid chewable iron tablet nutri...
4,4,teacher planner company lesson academic teache...


In [11]:
df3.shape

(2801530, 2)

In [75]:
len(np.unique(df3.BROWSE_NODE_ID))

9919

In [12]:
from collections import Counter

In [76]:
# #find the count for each labels
counter = Counter(df3['BROWSE_NODE_ID'])

In [77]:
# make a note of all the labels whose count is less than 50
drop_labels = []
cnt = 0
num_rows_total = 0
for a, b in counter.most_common()[::-1]:
    if b < 60:
        cnt+=1
        num_rows_total+=b
        drop_labels.append(a)
print("Total number of labels whose count is less than 10 : ",cnt)
print("Total number of rows to be dropped : ", num_rows_total)

Total number of labels whose count is less than 10 :  6643
Total number of rows to be dropped :  105909


In [78]:
new_df = df3.set_index('BROWSE_NODE_ID')

In [79]:
new_df.shape

(2801530, 1)

In [80]:
#drop the rows whose count is less than a given threshold
df3 = new_df.drop(drop_labels)
df3.shape

(2695621, 1)

In [81]:
df3.reset_index(inplace=True)
df3.head()

Unnamed: 0,BROWSE_NODE_ID,CONCATENATED
0,0,pete cat bedtime blue doll inch pete cat coole...
1,1,new yorker nyhm refrigerator magnet x new york...
2,3,amway nutrilite kid chewable iron tablet nutri...
3,4,teacher planner company lesson academic teache...
4,5,men full sleeve raglan shirt denim shirt size ...


In [82]:
df3.shape

(2695621, 2)

In [83]:
#sanity check
assert df3.shape[0] == new_df.shape[0]-num_rows_total

In [84]:
#print number of missing rows for each col
columns = df3.columns[1:]
print('columns\t\t\tnum_missing_value\n') 
for col in columns:
       print('{0:<15} : {1:>17}'.format(col, df3[col].isna().sum()))

columns			num_missing_value

CONCATENATED    :                50


In [85]:
max_label = np.max(df3.iloc[:, 0].unique())
min_label = np.min(df3.iloc[:, 0].unique())
max_label, min_label

(335582, 0)

In [86]:
## normlaize the labels
def label_normalisation(unique_labels, labels):
    label = labels.copy()
    dict_map = {}
    for i, ele in enumerate(unique_labels):
        dict_map[ele] = i
    for i in range(len(label)):
        label[i] = dict_map[label[i]]
    return label, dict_map
        

In [87]:
#total number of unique labels
unique_labels = df3.iloc[:, 0].unique()
num_labels = len(unique_labels)
num_labels

3276

In [88]:
# map normalized labels

orig_labels = df3['BROWSE_NODE_ID'].to_numpy()
new_labels, dict_map = label_normalisation(unique_labels, orig_labels)
df4 = df3.copy()
df4['BROWSE_NODE_ID'] = new_labels
df4.head()

Unnamed: 0,BROWSE_NODE_ID,CONCATENATED
0,0,pete cat bedtime blue doll inch pete cat coole...
1,1,new yorker nyhm refrigerator magnet x new york...
2,2,amway nutrilite kid chewable iron tablet nutri...
3,3,teacher planner company lesson academic teache...
4,4,men full sleeve raglan shirt denim shirt size ...


In [89]:
max_label = np.max(df4.iloc[:, 0].unique())
min_label = np.min(df4.iloc[:, 0].unique())
max_label, min_label

(3275, 0)

In [90]:
df4.isna().any()

BROWSE_NODE_ID    False
CONCATENATED       True
dtype: bool

In [91]:
df4.dropna(inplace=True)
df4.isna().any()

BROWSE_NODE_ID    False
CONCATENATED      False
dtype: bool

In [92]:
from sklearn.feature_extraction.text import HashingVectorizer

In [93]:
full_text = pd.concat([df4['CONCATENATED'], test_text])

In [94]:
%%time
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,
                               alternate_sign=False)
# vectorizer = TfidfVectorizer(analyzer='word')

vectorizer.fit(full_text)
word_embeddings = vectorizer.transform(df4.CONCATENATED)

CPU times: user 1min 23s, sys: 1.61 s, total: 1min 25s
Wall time: 1min 26s


In [95]:
word_embeddings.shape

(2695571, 262144)

In [96]:
import scipy

In [97]:
#save word embeddings
scipy.sparse.save_npz('/tmp/word_embeddings.npz', word_embeddings)


In [98]:
# split into train and test
x_train, x_dev, y_train, y_dev = train_test_split(word_embeddings, df4['BROWSE_NODE_ID'], random_state=4, test_size = 0.05, shuffle = True)

In [99]:
print("Training data shape : ", x_train.shape, y_train.shape)
print("Development data shape : ", x_dev.shape, y_dev.shape)

Training data shape :  (2560792, 262144) (2560792,)
Development data shape :  (134779, 262144) (134779,)


### dimensionality reduction

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score,accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

In [101]:
def size_mb(docs):
    return sum(len(s) for s in docs) / 1e6

In [102]:
#get the size of training and dev data
train_size = size_mb(df4.CONCATENATED.values)
print("train data size : {}Mb".format(train_size))

train data size : 1827.200374Mb


In [103]:
import random

In [104]:
import joblib

## Classification

In [105]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB

In [106]:
partial_fit_classifiers = {
#     'SGD': SGDClassifier(loss='log', max_iter=5, n_jobs=-1),
#     'Perceptron': Perceptron(n_jobs=-1),
#     'NB Multinomial': MultinomialNB(alpha=0.01),
    'Passive-Aggressive': PassiveAggressiveClassifier(n_jobs=-1, random_state=42)
}

In [108]:
def get_minibatches(x_train, y_train):
    # Since the dataset size is huge, we will go for partial fit
    CHUNK_SIZE = 40000
    classes = np.unique(df4['BROWSE_NODE_ID'])
    n_iter = (x_train.shape[0]//CHUNK_SIZE)-1
    batches = []
    for i in range(0 ,n_iter):
        x = x_train[i*CHUNK_SIZE:(i+1)*CHUNK_SIZE]
        y = y_train[i*CHUNK_SIZE:(i+1)*CHUNK_SIZE]
        batches.append([x, y])
    return batches



In [109]:
%%time
batches = get_minibatches(x_train, y_train)

CPU times: user 464 ms, sys: 686 ms, total: 1.15 s
Wall time: 1.4 s


In [110]:
len(batches)

63

In [111]:
%%time
classes = np.unique(df4['BROWSE_NODE_ID'])
print(len(classes))
for i, (x,y) in enumerate(batches):
    for name, cls in partial_fit_classifiers.items():
        cls.partial_fit(x, y, classes=classes)
        
        if i%3==0:
            acc = cls.score(x_dev, y_dev)
            print("iter[{}]\t\tclassifier ({})\t\taccuracy ({})".format(i, name, acc))
        else:
            print("learning from step {}".format(i))
        

3276
iter[0]		classifier (Passive-Aggressive)		accuracy (0.5994554047737407)
learning from step 1
learning from step 2
iter[3]		classifier (Passive-Aggressive)		accuracy (0.6765000482270976)
learning from step 4
learning from step 5
iter[6]		classifier (Passive-Aggressive)		accuracy (0.69951550315702)
learning from step 7
learning from step 8
iter[9]		classifier (Passive-Aggressive)		accuracy (0.7099696540262207)
learning from step 10
learning from step 11
iter[12]		classifier (Passive-Aggressive)		accuracy (0.7194444238345736)
learning from step 13
learning from step 14
iter[15]		classifier (Passive-Aggressive)		accuracy (0.7248606978832014)
learning from step 16
learning from step 17
iter[18]		classifier (Passive-Aggressive)		accuracy (0.7300321266666172)
learning from step 19
learning from step 20
iter[21]		classifier (Passive-Aggressive)		accuracy (0.735448400715245)
learning from step 22
learning from step 23
iter[24]		classifier (Passive-Aggressive)		accuracy (0.737859755599908)


In [112]:
clf = partial_fit_classifiers['Passive-Aggressive']
clf

PassiveAggressiveClassifier(n_jobs=-1, random_state=42)

In [113]:
#save the best model
joblib.dump(clf, 'model5.pkl')

['model5.pkl']

In [114]:
#load  model
model = joblib.load('model5.pkl')

## Test Data

In [115]:
df_test = pd.read_csv('dataset/test.csv', escapechar='\\', quoting=csv.QUOTE_NONE)
df_test.head()

Unnamed: 0,PRODUCT_ID,TITLE,DESCRIPTION,BULLET_POINTS,BRAND
0,1,"Command 3M Small Kitchen Hooks, White, Decorat...",Sale Unit: PACK,[INCLUDES - 9 hooks and 12 small indoor strips...,Command
1,2,O'Neal Jump Hardware JAG Unisex-Adult Glove (B...,Synthetic leather palm with double-layer thumb...,[Silicone printing for a better grip. Long las...,O'Neal
2,3,"NFL Detroit Lions Portable Party Fridge, 15.8 ...",Boelter Brands lets you celebrate your favorit...,[Runs on 12 Volt DC Power or 110 Volt AC Power...,Boelter Brands
3,4,Panasonic Single Line KX-TS880MX Corded Phone ...,Features: 50 Station Phonebook Corded Phone Al...,Panasonic Landline Phones doesn't come with a ...,Panasonic
4,5,Zero Baby Girl's 100% Cotton Innerwear Bloomer...,"Zero Baby Girl Panties Set. 100% Cotton, Breat...","[Zero Baby Girl Panties, Pack of 6, 100% Cotto...",Zero


In [116]:
df_test.shape

(110775, 5)

In [117]:
df_test.isna().any()

PRODUCT_ID       False
TITLE             True
DESCRIPTION       True
BULLET_POINTS     True
BRAND             True
dtype: bool

In [118]:
# #clean the test data as well
# df_test_new = clean_sentences(df_test)
# df_test_new.head()

In [119]:
#save the cleaned dataset
# tdf = df_test_new[['PRODUCT_ID', 'CONCATENATED']] # without normalized labels
# tdf.to_csv('cleaned_test_df.csv', index=False)

In [120]:
# read the saved test dataset
test_data = pd.read_csv('cleaned_test_df.csv')
test_data.head()

Unnamed: 0,PRODUCT_ID,CONCATENATED
0,1,Command Small Kitchen Hooks White Decorate Dam...
1,2,ONeal Jump Hardware JAG Unisex Adult Glove Bla...
2,3,NFL Detroit Lions Portable Party Fridge Quart ...
3,4,Panasonic Single Line KX TS MX Corded Phone Wh...
4,5,Zero Baby Girls Cotton Innerwear Bloomer Drawe...


In [121]:
test_text = test_data['CONCATENATED']

In [122]:
%%time
#vectorize it
x_test = vectorizer.transform(test_text)

CPU times: user 4.22 s, sys: 69.3 ms, total: 4.29 s
Wall time: 4.33 s


In [123]:
%%time
pred = model.predict(x_test)

CPU times: user 12.9 s, sys: 4.07 s, total: 17 s
Wall time: 19.3 s


In [124]:
max(pred)

3275

In [125]:
def inverse_dict_map(predicted, dict_map):
    inv_map = {v: k for k, v in dict_map.items()}
    for i in range(len(predicted)):
        predicted[i] = inv_map[predicted[i]]
    return predicted, inv_map

In [126]:
predicted_labels, inv_map = inverse_dict_map(pred.copy(), dict_map)

In [127]:
len(predicted_labels)

110775

In [128]:
df_sub = pd.DataFrame({'PRODUCT_ID' : df_test.PRODUCT_ID, 'BROWSE_NODE_ID':predicted_labels})
df_sub.head()

Unnamed: 0,PRODUCT_ID,BROWSE_NODE_ID
0,1,1140
1,2,15772
2,3,11
3,4,125
4,5,1682


In [129]:
df_sub.to_csv('submission6.csv', index=False)