* Import relevant libraries

In [3]:
import numpy as np
import seaborn as sns
import pandas as pd
from scipy import stats
import barnum
from pprint import pprint
import json

from sklearn.metrics import jaccard_score

from collections import Counter

import matplotlib
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
# import warnings

plt.style.use('fivethirtyeight')

* Import cleaned dataset

In [2]:
df = pd.read_csv('./datasets/cleaned-df.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm

In [4]:
cust_names = list(df.customer_name.unique())
brand_names = list(df.customer_name.unique())

In [5]:
prod_sales = df.apply(lambda x: list([x['customer_name'], x['product_name']]),axis=1) 

* Creation of a dictionary with all customers and the products they have purchased

In [6]:
cust_prods = {}
for cust, prod in prod_sales:
    cust_prods.setdefault(cust, set()).add(prod)

In [7]:
cust_sales = {k:list(cust_prods[k]) for k in list(cust_prods)}

In [8]:
cust_sales

{'Rodrigo Keefe': ['limit Lisa'],
 'Julianna Queen': ['limit Lisa'],
 'Palmer Bankston': ['limit Lisa'],
 'Lupe Pettigrew': ['limit Lisa'],
 'Genaro Cheatham': ['limit Lisa'],
 'Rod Nesbitt': ['limit Lisa'],
 'Jarod Morrow': ['limit Lisa'],
 'Betty Nestor': ['aunt leg', 'limit Lisa'],
 'Ellis Whittle': ['limit Lisa'],
 'Major Geer': ['archeology curler'],
 'Stan Mattos': ['archeology curler'],
 'Joseph Guerin': ['archeology curler'],
 'Jamey Cloud': ['archeology curler'],
 'Inez Beauchamp': ['archeology curler'],
 'Carla Dellinger': ['aunt leg'],
 'Chris Baldwin': ['aunt leg', 'January physician'],
 'Lavern Medley': ['rotate macrame'],
 'Caren Whitt': ['rotate macrame'],
 'Joesph Beeler': ['rotate macrame'],
 'Elden Eddy': ['rotate macrame'],
 'Tyree Lytle': ['rotate macrame'],
 'Billie Millard': ['rotate macrame'],
 'Sheri Stanfield': ['rotate macrame'],
 'Dorene Croft': ['apple lentil'],
 'Louisa Leclair': ['apple lentil'],
 'Philip Boyles': ['apple lentil'],
 'Gale Bloom': ['apple l

* Adjust the dictionary to be able to be used by the TFIDF vectorizer

In [9]:
cust_vecs = pd.Series(cust_sales)
cust_vecs = cust_vecs.apply(lambda x : [', '.join(x) for word in [x]][0])

In [10]:
cust_vecs.head(10)

Rodrigo Keefe                limit Lisa
Julianna Queen               limit Lisa
Palmer Bankston              limit Lisa
Lupe Pettigrew               limit Lisa
Genaro Cheatham              limit Lisa
Rod Nesbitt                  limit Lisa
Jarod Morrow                 limit Lisa
Betty Nestor       aunt leg, limit Lisa
Ellis Whittle                limit Lisa
Major Geer            archeology curler
dtype: object

In [11]:
tfidf = TfidfVectorizer(ngram_range=(1,2),vocabulary=prod_names)

* Creation of TFIDF dataframe for comparison of customers and the products they purchased

In [12]:
cust_tfidfvecs =  pd.DataFrame(tfidf.fit_transform(cust_vecs).todense(),columns=tfidf.get_feature_names())

In [13]:
cust_tfidfvecs.set_index(cust_vecs.index,inplace=True)

In [14]:
cust_tfidfvecs.head(20)

Unnamed: 0,limit Lisa,archeology curler,aunt leg,rotate macrame,apple lentil,ceramic barber,cafe advertisement,gum chronometer,segment ocean,grip decade,...,effect tent,line skin,statistic trail,squirrel paint,Taurus sky,married metal,tortellini respect,check hardboard,conga Susan,larch chair
Rodrigo Keefe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Julianna Queen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Palmer Bankston,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lupe Pettigrew,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Genaro Cheatham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rod Nesbitt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jarod Morrow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Betty Nestor,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ellis Whittle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Major Geer,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* Now I want to do an SVD on the matrix to reduce the dimensionality and therefor create closer similarities

In [15]:
svd = TruncatedSVD(n_components=1000)

In [16]:
cust_svd = svd.fit(tfidf.fit_transform(cust_vecs)) 

print(cust_svd.explained_variance_ratio_.sum())
print(cust_svd.components_.shape)

0.38539454547111807
(1000, 31332)


In [17]:
cust_svds = pd.DataFrame(cust_svd.transform(cust_tfidfvecs))

In [18]:
cust_svds.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-3.416648e-15,3.854359e-15,-3.468721e-14,1.535174e-13,1.004826e-13,-1.853119e-13,4.592571e-14,-1.442148e-13,1.664121e-13,4.040649e-13,...,5e-06,4e-05,-5e-06,-1.5e-05,-3.1e-05,-6.1e-05,2.1e-05,-5.8e-05,4.4e-05,1.4e-05
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-8.293517e-12,2.039476e-11,4.662492e-11,-5.063094e-11,-1.13056e-11,-2.793769e-11,2.869147e-11,5.053856e-10,-2.129898e-10,-4.603291e-10,...,-0.004298,0.007961,0.020484,0.033568,-0.025538,-0.004673,-0.0025,-0.027174,0.021277,0.024631


* I want to round the values in the matrix to 5 decimal places so that the similarity calculations will be more efficient and will hopefully not take as long to compute

In [19]:
cust_svds = cust_svds.apply(lambda x: round(x,5))

In [20]:
cust_svds['cust_names'] = cust_names

cust_svds.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,cust_names
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Rodrigo Keefe
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Julianna Queen
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Palmer Bankston
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lupe Pettigrew
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Genaro Cheatham
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Rod Nesbitt
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Jarod Morrow
7,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,...,4e-05,-1e-05,-2e-05,-3e-05,-6e-05,2e-05,-6e-05,4e-05,1e-05,Betty Nestor
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Ellis Whittle
9,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,...,0.00796,0.02048,0.03357,-0.02554,-0.00467,-0.0025,-0.02717,0.02128,0.02463,Major Geer


In [21]:
from itertools import islice

In [None]:
# started below at 10:10pm, should run for ~ 10 hours, so should finish at 6am 

In [22]:
cust_cosine = {}
for cust in tqdm(islice(cust_prods,7,18)):
    cosine_scores = {}
    for n in cust_prods:
        cosine_scores[n] = cosine_similarity(cust_svds[cust_svds.cust_names == cust].iloc[:,0:-1],
                                             cust_svds[cust_svds.cust_names == n].iloc[:,0:-1])[0][0]
    cust_cosine[cust] = cosine_scores    

11it [3:57:44, 1286.77s/it]


In [23]:
with open('/Users/elkehansen/Documents/general_assembly/Capstone-Subsmission/Elke/datasets/cust_cosine.json', 'w') as fp:
    json.dump(cust_cosine, fp)
    

In [4]:
with open('./datasets/cust_cosine.json', 'r') as fp:
    data = json.load(fp)

* Now I have 11 customers in a dictionary that I make predictions for

In [9]:
data.keys()

dict_keys(['Betty Nestor', 'Ellis Whittle', 'Major Geer', 'Stan Mattos', 'Joseph Guerin', 'Jamey Cloud', 'Inez Beauchamp', 'Carla Dellinger', 'Chris Baldwin', 'Lavern Medley', 'Caren Whitt'])

In [25]:
def recommend_for_prods2(customer, customer_dict = data):
    """
    Return top five recommended products
      based on the brands in 'prods_set'.
      a
    Takes a set of product names.  
    """
    #creation of a dictionary of each users weighted score against the brands set
    weights = cust_cosine[customer]

    
    #sorting the weighted dictionary by the weight    
    sorted_w = sorted(weights.items(), key=lambda kv: kv[1], reverse=True) 

    
    #creation of a set so the same product doesn't get added multiple times
    unique_prods = set()
    
    for i in sorted_w:
        if i[1] == 1:
            pass
        else:
            for prod in cust_prods[i[0]]:
                    if len(unique_prods) == 5:
                        break
                    elif prod in cust_prods[customer]:
                        break
                    else:
                        unique_prods.add(prod)
                                       
    return unique_prods
    

In [27]:
recommend_for_prods2('Betty Nestor')

{'baritone lyocell',
 'footnote juice',
 'rifle thunderstorm',
 'silica stretch',
 'softball organisation'}