In [1]:
# Importing relevant packages and establishing the Exadata connection

import pandas as pd
import numpy as np
import getpass
import datetime as dt


In [44]:
df = pd.read_csv('textmining/products.csv')

prods = df.iloc[0:1000, :]
print(prods.head())

   product_id                                       product_name  aisle_id  \
0           1                         Chocolate Sandwich Cookies        61   
1           2                                   All-Seasons Salt       104   
2           3               Robust Golden Unsweetened Oolong Tea        94   
3           4  Smart Ones Classic Favorites Mini Rigatoni Wit...        38   
4           5                          Green Chile Anytime Sauce         5   

   department_id  
0             19  
1             13  
2              7  
3              1  
4             13  


In [45]:
import re as re
import nltk
from nltk.stem.porter import *
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

toker = TreebankWordTokenizer()

stemmer = SnowballStemmer("english")
prods['short_desc'] = prods.product_name.str.lower()

# Defining a function to clean the data and tokenize it 

def text_preprocessor(x):

    tmp = str(x)
    tmp = tmp.lower()
    x_cleaned = tmp.replace('/', ' ').replace('-', ' ').replace('"', '')
    
    stops = stopwords.words("english")   # removing stop words 
    
    tokens = toker.tokenize(x_cleaned)
    meaningful_words = [w for w in tokens if not w in [stops]]
    return( " ".join( meaningful_words ))  # Join the words back into one string separated by space, and return the result.



prods['desc']  = prods['short_desc'].apply(text_preprocessor)

# Removing numbers/special characters and other junk from the product descriptions using regular expressions
prods['new_desc'] = prods['desc'].map(lambda x: re.sub(r'\W+', ' ', x))
prods['wt_desc'] = prods['new_desc'].map(lambda x: re.sub("\d+(\s|g|mth|cm|hr|gram|s|kg|kilograms|cl|ml|ltr|pk| pk|gms|nb|up|\
                                                          |s|x|pack|l|lb|mg|d|in|yo|p|c|mm|qp|er|m|vol|h|oz|th|w|yr|tbsp|rd|nd)", '', x))
prods['final_desc'] = prods['wt_desc'].map(lambda x: re.sub('^\d+\s|\s\d+\s|\s\d+$', ' ', x))

prods_1 = prods[~prods.final_desc.isin([None])] # Removing null descriptions from the data set

prods_1.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-vers

Unnamed: 0,product_id,product_name,aisle_id,department_id,short_desc,desc,new_desc,wt_desc,final_desc
0,1,Chocolate Sandwich Cookies,61,19,chocolate sandwich cookies,chocolate sandwich cookies,chocolate sandwich cookies,chocolate sandwich cookies,chocolate sandwich cookies
1,2,All-Seasons Salt,104,13,all-seasons salt,all seasons salt,all seasons salt,all seasons salt,all seasons salt
2,3,Robust Golden Unsweetened Oolong Tea,94,7,robust golden unsweetened oolong tea,robust golden unsweetened oolong tea,robust golden unsweetened oolong tea,robust golden unsweetened oolong tea,robust golden unsweetened oolong tea
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,smart ones classic favorites mini rigatoni wit...,smart ones classic favorites mini rigatoni wit...,smart ones classic favorites mini rigatoni wit...,smart ones classic favorites mini rigatoni wit...,smart ones classic favorites mini rigatoni wit...
4,5,Green Chile Anytime Sauce,5,13,green chile anytime sauce,green chile anytime sauce,green chile anytime sauce,green chile anytime sauce,green chile anytime sauce


In [46]:
# Using Tf-idf vectorizer from sklearn to create the bag of words and convert them into features

from sklearn.feature_extraction.text import TfidfVectorizer
vector_new = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = "english",\
                             max_features = 40000)

product_bag = vector_new.fit_transform(prods_1.final_desc).toarray()
print(product_bag.shape)

vector_new.get_feature_names()

(1000, 1647)


['absorbency',
 'acai',
 'acaí',
 'acne',
 'acti',
 'acting',
 'action',
 'active',
 'added',
 'ade',
 'adobo',
 'adult',
 'advanced',
 'adzuki',
 'aged',
 'ahoy',
 'aid',
 'air',
 'albacore',
 'alcohol',
 'ale',
 'alfredo',
 'allergy',
 'almond',
 'almondmilk',
 'almonds',
 'aloe',
 'alongs',
 'alpha',
 'aluminum',
 'american',
 'americana',
 'aminos',
 'annie',
 'anti',
 'antibacterial',
 'antioxidant',
 'antiplaque',
 'antiseptic',
 'anytime',
 'apple',
 'apples',
 'applicator',
 'apricot',
 'apricots',
 'aquaphor',
 'argan',
 'armor',
 'aromatherapaes',
 'arrabbiata',
 'arrowroot',
 'artichokes',
 'artificial',
 'artihearts',
 'artisan',
 'arugula',
 'asiago',
 'asian',
 'asparagus',
 'aspirin',
 'assortment',
 'atlantic',
 'auroville',
 'australian',
 'authentic',
 'automatic',
 'autumn',
 'awesome',
 'babies',
 'baby',
 'bachelor',
 'bacon',
 'bag',
 'bagels',
 'bags',
 'baits',
 'bake',
 'baked',
 'bakeware',
 'ball',
 'balm',
 'bamboo',
 'banana',
 'bananas',
 'bar',
 'barbara'

In [47]:
# Concatenating bag of words representation with the original data 

products= pd.concat([prods_1.product_id, prods_1.product_name, prods_1.aisle_id, prods_1.department_id, prods_1.final_desc,
                               pd.DataFrame(product_bag, 
                                            columns= vector_new.get_feature_names(), index = prods_1.index)], axis=1)

prod_shape= pd.DataFrame(product_bag,columns= vector_new.get_feature_names(), index = prods_1.index)

# Looking at the most frequently occuring words and their cumulative score
prod_shape.sum().sort_values(ascending=False).head(100)

organic        27.847803
cheese         14.402114
chocolate      12.279438
chicken        11.538936
free           10.807617
sauce           9.563004
bar             9.432624
tea             9.286392
butter          9.095845
vanilla         8.913695
mix             8.723252
cream           8.614585
yogurt          8.394062
blueberry       8.205332
fruit           7.747023
almond          7.745187
juice           7.665321
original        7.599579
strawberry      7.536889
blend           7.331994
lemon           7.278775
honey           7.208633
turkey          7.198718
vegetable       7.161820
water           7.158457
garlic          7.083952
milk            7.038105
wheat           6.805448
red             6.718069
pasta           6.667271
                 ...    
oil             4.423390
size            4.418972
orange          4.402152
mint            4.359728
dry             4.315021
cheddar         4.212334
classic         4.188389
ice             4.176245
light           4.137568


In [48]:
# Importing relevant packages to calculate cosine distance 

import time as time 
import scipy.sparse
import random
import itertools
import numpy as np

from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

tfidf = vector_new.fit_transform(prods_1.final_desc).toarray() # Getting the tfidf representation again from vector

start_time = time.time()

# Iterating through the list of products to calculate the cosine similarity
b =[]
for i in range(tfidf.shape[0]): 
    a = linear_kernel(tfidf[i], tfidf).flatten() # linear kernel is used as the dot product gives the same result for normalized vetors in lesser time
    b.append(a)

print("--- RUNTIME : %s seconds ---" % (time.time() - start_time))

c = np.array(b)

start_time = time.time()

# Iterating throgh the matrix to pick products with similarity more than 0.5 and also ensuring to not look for the same product
a =[]
for (x,y), k in np.ndenumerate(c):
    if k > 0.5 and x!=y:
        q = products[products['product_id']==prods_1.iloc[x][0]]
        w = products[products['product_id']==prods_1.iloc[y][0]]
        u = np.array(q.iloc[0][10:], dtype=pd.Series)
        v = np.array(w.iloc[0][10:], dtype=pd.Series)
        s = (products.iloc[x][0],products.iloc[x][1], products.iloc[x][3],products.iloc[y][0], products.iloc[y][1], 
             products.iloc[y][3],products.iloc[x][4],products.iloc[y][4],k,scipy.spatial.distance.cosine(u, v))
        a.append(s)     

m = pd.DataFrame(a)

m.to_csv('prod_match.csv', sep=',')

print("--- RUNTIME : %s seconds ---" % (time.time() - start_time))


















--- RUNTIME : 1.566037654876709 seconds ---
--- RUNTIME : 6.506204843521118 seconds ---


In [49]:
# Assigning columns and looking at the results

m.columns = ['primary_product_id','primary_product','primary_dept','secondary_product_id','secondary_product','secondary_dept'
            ,'primary_desc','secondary_desc','cosine_sim','cosine_dist']

m.sort_values('cosine_sim', ascending = False)

Unnamed: 0,primary_product_id,primary_product,primary_dept,secondary_product_id,secondary_product,secondary_dept,primary_desc,secondary_desc,cosine_sim,cosine_dist
139,424,Linguine Pasta,9,972,Linguine No 7 Pasta,9,linguine pasta,linguine no pasta,1.000000,0.000000
318,972,Linguine No 7 Pasta,9,424,Linguine Pasta,9,linguine no pasta,linguine pasta,1.000000,0.000000
279,890,Organic Diced Tomatoes,15,405,100% Organic Diced Tomatoes,15,organic diced tomatoes,organic diced tomatoes,1.000000,0.000000
127,405,100% Organic Diced Tomatoes,15,890,Organic Diced Tomatoes,15,organic diced tomatoes,organic diced tomatoes,1.000000,0.000000
229,690,Vegetable Hearty Garden Soup,21,626,Organic Hearty Garden Vegetable Soup,15,vegetable hearty garden soup,organic hearty garden vegetable soup,0.960653,0.039347
210,626,Organic Hearty Garden Vegetable Soup,15,690,Vegetable Hearty Garden Soup,21,organic hearty garden vegetable soup,vegetable hearty garden soup,0.960653,0.039347
315,969,Petite Green Peas,1,327,"Green Peas, Organic, Petite",1,petite green peas,green peas organic petite,0.952177,0.047823
111,327,"Green Peas, Organic, Petite",1,969,Petite Green Peas,1,green peas organic petite,petite green peas,0.952177,0.047823
80,254,Honey Mustard,13,810,Organic Honey Mustard,13,honey mustard,organic honey mustard,0.925062,0.074938
262,810,Organic Honey Mustard,13,254,Honey Mustard,13,organic honey mustard,honey mustard,0.925062,0.074938
