In [0]:
import gensim
import pandas as pd
import numpy as np
import nltk 
nltk.download("popular")
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, make_scorer
from math import sqrt
from nltk.stem.snowball import SnowballStemmer
import time
import sklearn.feature_extraction.text as sktf
from scipy.spatial.distance import cosine

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [0]:
# Shared text-normalisation resources used by str_cleaner_stemmer below.

# English Snowball stemmer and the NLTK English stop-word list
# (held in a set so membership tests in the cleaning step are fast).
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english')) 

def str_cleaner_stemmer(s):
    """Strip noise from ``s``, drop English stop words and stem what is left.

    Every @-mention, every URL-like token (``scheme://...``) and every single
    character that is not an ASCII letter, digit, space or tab is replaced by
    a space. The result is lower-cased, split on whitespace, filtered against
    the module-level ``stop_words`` and stemmed with the module-level
    ``stemmer``.

    Returns the surviving stems joined by single spaces.
    """
    noise_free = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)', " ", s)
    stems = []
    for token in noise_free.lower().split():
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return " ".join(stems)
 
#Correct spelling mistakes using the pre-defined dictionary of known misspellings (a dictionary lookup is used for efficiency).
def spell_check(text, corrections=None):
    """Replace every known misspelling found in ``text`` with its correction.

    Args:
        text: The string to correct.
        corrections: Optional mapping of misspelling -> replacement. When
            omitted, the module-level ``spell_check_dict`` is used (looked up
            at call time, so it may be defined after this function).

    Returns:
        ``text`` after substituting each key of the mapping, in the mapping's
        iteration order, with its value. Substitution is by substring, so a
        later key can match text produced by an earlier replacement.
    """
    if corrections is None:
        corrections = spell_check_dict
    for wrong, right in corrections.items():
        # str.replace("", x) would insert x between every character of the
        # text, so an empty key is never a meaningful correction.
        if not wrong:
            continue
        text = text.replace(wrong, right)
    return text

In [0]:
# Load the raw CSV inputs. The data directory and file encoding are named once
# here instead of being repeated in every read_csv call.
DATA_DIR = '/content/drive/My Drive/IHU/NLP/Coursework/data'
CSV_ENCODING = 'ISO-8859-1'

df_train_data = pd.read_csv(DATA_DIR + '/train.csv', encoding=CSV_ENCODING)
df_test_data = pd.read_csv(DATA_DIR + '/test.csv', encoding=CSV_ENCODING)
df_attr = pd.read_csv(DATA_DIR + '/attributes.csv', encoding=CSV_ENCODING)
df_pro_desc_data = pd.read_csv(DATA_DIR + '/product_descriptions.csv', encoding=CSV_ENCODING)

In [0]:
# Build a "bullets" dataframe: one row per product_uid holding all of its
# bullet attributes from attributes.csv joined into a single text column.

# Attribute names Bullet01 .. Bullet22 (zero-padded to two digits).
# Bullet22 is the highest bullet attribute found in this dataset.
MAX_BULLET = 22
bullet_list = ['Bullet{:02d}'.format(i) for i in range(1, MAX_BULLET + 1)]

# Keep only the rows that carry a bullet attribute.
df_bullets_rows = df_attr.loc[df_attr['name'].isin(bullet_list),
                              ['product_uid', 'name', 'value']]

# Pivot so that every bullet name becomes a column and every row is one
# product with its value for each bullet (if any).
# reindex guarantees that all BulletNN columns exist, in order, even when a
# bullet number never occurs in the data (otherwise the column selection below
# would raise KeyError). Missing bullets become '' because not every product
# has a value for every bullet.
df_bullets_t = (
    df_bullets_rows
    .pivot(index='product_uid', columns='name', values='value')
    .reindex(columns=bullet_list)
    .fillna('')
    .reset_index()
)

# Aggregate all bullet columns into one space-separated text column.
df_bullets_t['bullets'] = df_bullets_t[bullet_list].agg(' '.join, axis=1)

# Keep only the product id and the combined text.
df_bullets = df_bullets_t[['product_uid', 'bullets']]

In [0]:
# Load the ready-made dictionary of spelling corrections for search terms.
# Each line of the file is read as "<misspelling>:<correction>".
spell_check_dict = {}

with open("/content/drive/My Drive/IHU/NLP/Coursework/data/spelling_dictionary.txt") as f:
  for line in f:
    # Split once, on the first colon only, so a correction that itself
    # contains ':' is kept whole instead of being truncated.
    wrong, sep, right = line.rstrip("\n").partition(":")
    if not sep:
      # Blank or malformed line (no colon): skip it rather than crash with
      # an IndexError while indexing the split result.
      continue
    spell_check_dict[wrong] = right

In [0]:
# Stack the train and test data into one dataframe so every feature is
# computed once for both. sort=False leaves the columns in their original
# order; ignore_index=True gives the combined frame a fresh 0..n-1 index.
df_all = pd.concat((df_train_data, df_test_data), axis=0, ignore_index=True, sort=False)

# Attach the product description by product_uid (left join keeps every row).
df_all = pd.merge(df_all, df_pro_desc_data, how='left', on='product_uid')

# Attach the combined bullets text by product_uid.
df_all = pd.merge(df_all, df_bullets, how='left', on='product_uid')

# A left join yields NaN for products with no match; the text-cleaning step
# expects strings, so fill both merged text columns with a blank.
df_all['bullets'] = df_all['bullets'].fillna(" ")
df_all['product_description'] = df_all['product_description'].fillna(" ")

In [0]:
# Correct known misspellings in the search terms (each value is coerced to str first)
df_all['search_term'] = df_all['search_term'].map(str).map(spell_check)

In [0]:
# Clean and stem every free-text column, overwriting each column with its cleaned form
for text_col in ('search_term', 'product_title', 'product_description', 'bullets'):
  df_all[text_col] = df_all[text_col].map(str_cleaner_stemmer)

In [0]:
# Build the "product_info" column: description, search term, title and bullets
# of each row joined by single spaces. It is the text used to generate the
# vocabulary.
df_all['product_info'] = (
    df_all['product_description'] + " "
    + df_all['search_term'] + " "
    + df_all['product_title'] + " "
    + df_all['bullets']
)

# One document per row: the whitespace-separated tokens of its product_info
documents = [info.split() for info in df_all['product_info']]

In [0]:
# Build the vocabulary and train a Word2Vec model on the tokenised documents.
# NOTE(review): `size` and `iter` are the gensim < 4.0 parameter names
# (renamed `vector_size` / `epochs` in gensim 4) - confirm the installed version.
model = gensim.models.Word2Vec(
        documents,
        size=150,     # dimensionality of the word vectors
        window=25,    # maximum distance between a word and its context words
        min_count=0,  # no frequency cut-off: every token keeps a vector
        workers=10,   # number of training threads
        iter=5)       # number of passes over the corpus

In [0]:
# Run 5 training epochs over the documents.
# NOTE(review): the Word2Vec constructor was already given `documents` and
# iter=5, which presumably trained the model once; this call would then add
# 5 further epochs on the same corpus - confirm the extra passes are intended.
model.train(documents,total_examples=len(documents),epochs=5)

(178847859, 188828655)

In [0]:
# Create the mean word2vec vector of the search term, product title,
# product description and bullets, using the model trained above.

def _mean_vector(text):
  """Average the word2vec vectors of the whitespace-separated tokens in ``text``.

  Vectors are read through ``model.wv`` rather than by indexing the model
  object directly. Tokens without a vector are ignored. When no token has a
  vector (for example an empty string) a vector of NaN with the model's
  dimensionality is returned, so every row holds an array of the same shape
  instead of the scalar NaN (and RuntimeWarning) that ``np.mean([])`` gives.
  """
  wv = model.wv
  vectors = [wv[token] for token in text.split() if token in wv]
  if not vectors:
    return np.full(wv.vector_size, np.nan)
  return np.mean(vectors, axis=0)

for source_col, vector_col in (
    ('search_term', 'search_term_mean_vector'),
    ('product_title', 'prod_title_mean_vector'),
    ('product_description', 'prod_desc_mean_vector'),
    ('bullets', 'bullets_mean_vector'),
):
  df_all[vector_col] = df_all[source_col].map(_mean_vector)

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """


In [0]:
# Cosine similarity (1 - cosine distance) between the search-term mean vector
# and each of the product-side mean vectors, computed row by row
for similarity_col, product_vector_col in (
    ('st_pt_vector_similarity', 'prod_title_mean_vector'),
    ('st_pd_vector_similarity', 'prod_desc_mean_vector'),
    ('st_bt_vector_similarity', 'bullets_mean_vector'),
):
  df_all[similarity_col] = df_all.apply(
      lambda row: 1 - cosine(row['search_term_mean_vector'], row[product_vector_col]),
      axis=1)

In [0]:
# Write the row id and the three similarity features to a CSV file
from google.colab import files

export_columns = [
    'id',
    'st_pt_vector_similarity',
    'st_pd_vector_similarity',
    'st_bt_vector_similarity',
]
df_all[export_columns].to_csv('vector_similarity.csv', index=False)

In [0]:
# files.download("vector_similarity.csv")

## Save csv directly to drive
# !cp vector_similarity.csv "drive/My Drive/IHU/NLP/Coursework/"