In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from database import Dataset

TOPIC RECOMMENDER - NMFT

In [2]:
reviewPath = "datasets/yelp_academic_dataset_review.json"
datasetJson = Dataset.get_dataset_json(reviewPath)
reviewDataset = Dataset.construct_dataset(Dataset, datasetJson)
reviewDataset.head()

Getting json lines...
Constructing dataset...
Making column 0
Making column 1
Making column 2
Making column 3
Making column 4
Making column 5
Making column 6
Making column 7
Making column 8


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [3]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial
import math 

class NMFT:
    
    '''
    The data have to be in pattern format.
    '''
    def __init__(self, data:pd.DataFrame, min_df:int = 50, stop_words:str = 'english', n_topics:int = 10,
                 random_state:int = 5) -> None:
        self.data = data
        self.min_df = min_df
        self.stop_words = stop_words
        self.n_topics = n_topics
        self.random_state = random_state
        
    '''
    tdfic_vectorizer() - Create text topics
    parameters = (
        text: List of texts to create topics
        min_df: Minimum to appear in text to be consider relevant
        stop_words: Text language
    )
    '''
    def tdfid_vectorizer(self):
        try:
            review_vector = TfidfVectorizer(min_df = self.min_df, stop_words=self.stop_words)
            embeddings = review_vector.fit_transform(self.data.text)
            self.embeddings = embeddings
        except:
            print("Could not create embeddings.")
        
        
    def create_NMF_model(self):
        embeddings = self.tdfid_vectorizer(self)
        
        model = NMF(n_components=self.n_topics, random_state=self.random_state)
        model.fit(self.embeddings)
        nmf_features = model.transform(self.embeddings)
        self.nmf_features = nmf_features
    
    
    def calculate_user_similarity(self):
        self.features_rows, self.features_cols = self.nmf_features.shape
        current_user = 0
        user_pivot = self.nmf_features[current_user, :]
        most_similar_users = {}
        
        for user in range(self.features_rows):
            if user == current_user: continue
            
            user_similarity = self.nmf_features[user, :]
            sim = 1 - spatial.distance.cosine(user_pivot, user_similarity)
            most_similar_users[user] = sim
            
        self.similar_users = most_similar_users


In [4]:
reviewDatasetPivot = reviewDataset[['user_id', 'text']].copy()
reviewDatasetPivot = reviewDatasetPivot.iloc[:100000, :]

In [5]:
reviewVect = TfidfVectorizer(min_df = 50, stop_words='english')

variables = reviewVect.fit_transform(reviewDatasetPivot.text)

In [6]:
revModel = NMF(n_components=10, random_state=5)

revModel.fit(variables)
nmf_features = revModel.transform(variables)



In [38]:
featuresDf = pd.DataFrame(nmf_features)
featuresDf['user_id'] = reviewDataset['user_id']
featuresDf = featuresDf.groupby(by=['user_id']).sum().reset_index()
featuresDf

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,9
0,---2PmXbF47D870stH1jqA,0.000000,0.000000,0.009972,0.001063,0.019891,0.011636,0.044543,0.000000,0.002177,0.001766
1,--4AjktZiHowEIBCMd4CZA,0.001797,0.006000,0.009745,0.000000,0.000000,0.002521,0.037759,0.007558,0.000000,0.008747
2,--E0uVPphTORm_OiZ5KCvA,0.003359,0.004576,0.041667,0.001687,0.028668,0.003519,0.002110,0.010422,0.000000,0.009426
3,--KMTwCrhKKUmr7riuS4WQ,0.010176,0.002182,0.054440,0.000000,0.008168,0.000000,0.000000,0.004011,0.000000,0.008889
4,--S8M395r8NtOCvS2LRfDw,0.000000,0.003000,0.000000,0.062008,0.004424,0.000364,0.001660,0.016118,0.000385,0.000416
...,...,...,...,...,...,...,...,...,...,...,...
79340,zzljYL9c22HWRO3VvJ1_Pg,0.000472,0.000000,0.018928,0.000248,0.000000,0.000203,0.000000,0.004802,0.000194,0.007794
79341,zzsPf8xNW11nd0B6MZqfRw,0.011304,0.001525,0.008314,0.000567,0.045596,0.000000,0.000000,0.000386,0.001263,0.000516
79342,zzsqjDvanJhH9tn8NautOQ,0.000000,0.000000,0.000000,0.000000,0.030752,0.000000,0.000000,0.023311,0.000000,0.016905
79343,zztkCqqgR6VntYbqio4UTQ,0.014656,0.000000,0.024004,0.000732,0.034627,0.000000,0.001471,0.011206,0.000000,0.004721


In [45]:
import math 

def user_similarity(u1, u2):
    u1Mean = u1.mean()
    u2Mean = u2.mean()
    
    numerator = 0
    denominator = 0
    for item in range(len(u1)):
        numerator += (u1[item] - u1Mean)*(u2[item] - u2Mean)
        
    for item in range(len(u1)):
        denominator += math.pow(u1[item] - u1Mean, 2) * math.pow(u2[item] - u2Mean, 2)
        
    denominator = math.sqrt(denominator)
    sim = numerator / denominator
    return sim
    

In [61]:
from mf import MF
from scipy import spatial

# Pre definition
mostSimilarDataset = reviewDataset[['user_id', 'business_id', 'stars']].copy()
mostSimilarDataset = mostSimilarDataset.iloc[:100000, :]
nb_features_rows = featuresDf.shape[0]

precision_metric = []
for u in range(nb_features_rows):
    print(f"Calculating metrics from user {u}")
    userToRec = featuresDf.iloc[u, 1:]
    usersSim = []

    for user in range(nb_features_rows):
        if user == u: continue
        userS = featuresDf.iloc[user, 1:]
        sim = 1 - spatial.distance.cosine(userToRec, userS)
        usersSim.append(sim)
        
    similarsUsers = []

    for user in range(10):
        maximum = max(usersSim)
        maximum = usersSim.index(maximum)
        usersSim.pop(maximum)
        similarsUsers.append(maximum)
        
    for user in range(len(similarsUsers)):
        similarsUsers[user] = featuresDf.iloc[user, 0]

        msd = mostSimilarDataset.loc[mostSimilarDataset['user_id'].isin(similarsUsers)]
        
    mostSimDataset = pd.pivot_table(msd, index='user_id', columns='business_id', values='stars')
    pivotMatrix = mostSimDataset.values
    pivotMatrix[np.isnan(pivotMatrix)] = 0

    print("Creating matrix fatorization")
    mfModel = MF(pivotMatrix, K = 10, alpha=0.1, beta=0.01, iterations=20)
    mfModel.train()
    
    predictMatrix = mfModel.full_matrix()
    
    recomenToUser = pd.DataFrame(predictMatrix, columns = msd['business_id'].drop_duplicates(keep='first'))
    recomenToUser = recomenToUser.iloc[0, :]
    recomenToUser = recomenToUser.sort_values(ascending=False)
    
    businessToRec = list(recomenToUser[:10].index)
    
    userToRec = featuresDf.iloc[u, 0]
    btr = reviewDataset.loc[reviewDataset['user_id'] == userToRec]
    btr = list(btr['business_id'].values)

    count = 0
    for item in businessToRec:
        if item in btr:
            count += 1
            continue
        
    print(f"Count of user {u} is {count}")
        
    precision_metric.append(count)
    
precision_metric
    

Calculating metrics from user 0


  dist = 1.0 - uv / np.sqrt(uu * vv)


Creating matrix fatorization
Iteration: 10 ; error = 0.3478
Iteration: 20 ; error = 0.0574
Count of user 0 is 1
Calculating metrics from user 1


  dist = 1.0 - uv / np.sqrt(uu * vv)


Creating matrix fatorization
Iteration: 10 ; error = 0.3489
Iteration: 20 ; error = 0.0515
Count of user 1 is 0
Calculating metrics from user 2


  dist = 1.0 - uv / np.sqrt(uu * vv)


Creating matrix fatorization
Iteration: 10 ; error = 0.3159
Iteration: 20 ; error = 0.0551
Count of user 2 is 1
Calculating metrics from user 3


  dist = 1.0 - uv / np.sqrt(uu * vv)


KeyboardInterrupt: 

In [46]:
similarsUsers = []

for u in range(10):
    maximum = max(usersSim)
    maximum = usersSim.index(maximum)
    usersSim.pop(maximum)
    similarsUsers.append(maximum)
    
similarsUsers

[66833, 6576, 31564, 23573, 64781, 45178, 25253, 74215, 44891, 25177]

In [50]:
for u in range(len(similarsUsers)):
    similarsUsers[u] = featuresDf.iloc[u, 0]


mostSimilarDataset = reviewDataset[['user_id', 'business_id', 'stars']]
mostSimilarDataset = mostSimilarDataset.loc[mostSimilarDataset['user_id'].isin(similarsUsers)]
mostSimilarDataset.head()

Unnamed: 0,user_id,business_id,stars
2442,--4AjktZiHowEIBCMd4CZA,EtKSTHV5Qx_Q7Aur9o4kQQ,4.0
5247,--_r6E98SNIrGU7weyNxbw,EC2huvu74EMjrpWdEizbmw,5.0
22278,---2PmXbF47D870stH1jqA,hKameFsaXh9g8WQbv593UA,5.0
33958,--E0uVPphTORm_OiZ5KCvA,dz_aIFbATP2PLWQSOBnMfw,2.0
53154,--ccVMj2PN6Z9qtdOdlung,bSWL0YxfawjS03_g2kgujA,3.0


In [48]:
mostSimilarDataset = pd.pivot_table(mostSimilarDataset, index='user_id', columns='business_id', values='stars')
mostSimilarDataset.fillna(0)

business_id,-IQsXtexaUmIfSBkhBZrKQ,-hn_albK3cs_V8hTUw0x1A,0-pY7oEuTT6j9fa1k-YBnw,06Fw40aGMI_sInznweYLow,0ZsqqzHu1HHkDdIKoivi5g,0dIlxPUw9JXFwVbabWxRvQ,0w_FJ-oa1NnjWYiUQdG-Ow,1An4DxtMmvvSe0HX4viRCA,1Efjww8n7WtKtKTgDqjgdA,1Fs3_bjPQwvSXZHGGlFE-A,...,wCluBbW9nzS7MEMFltMwJw,wUnLSg_GKfEIQ5CQQ770_g,weU3ci5UJGa1T1o5gzNJQg,wm5mQ4cSpvko9WlCq07RFw,wnWdf_IeR8wCnesA1p-3nw,xNFrxpr8lh5RogqFKotUcg,xo4z2wCKcdK7FQ_6EYDhvA,y6HvbaoglrJilu1D28ADVw,yKdBTZEoJssmojLNmPrjig,zLm2PaYrpHkcr63ekwvtBQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---2PmXbF47D870stH1jqA,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--4AjktZiHowEIBCMd4CZA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,5.0
--E0uVPphTORm_OiZ5KCvA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--KMTwCrhKKUmr7riuS4WQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--S8M395r8NtOCvS2LRfDw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--UizzbnQlZg7bEv2oXEyg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--WhhlIim6ISY5yhFYZreg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--_H9j6ggxvqhh9nPofZwg,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0
--_r6E98SNIrGU7weyNxbw,0.0,5.0,5.0,3.0,0.0,0.0,0.0,0.0,1.0,4.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
--ccVMj2PN6Z9qtdOdlung,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
pivotMatrix = mostSimilarDataset.values
pivotMatrix[np.isnan(pivotMatrix)] = 0

In [51]:
from mf import MF

mfModel = MF(pivotMatrix, K = 10, alpha=0.1, beta=0.01, iterations=20)
mfModel.train()

Iteration: 10 ; error = 0.6273
Iteration: 20 ; error = 0.2711


[(0, 14.083091020520444),
 (1, 11.805130565870437),
 (2, 8.072501718229725),
 (3, 5.159408149923858),
 (4, 3.4983149999668988),
 (5, 2.24872793245387),
 (6, 1.5414616113389497),
 (7, 1.0680840405986984),
 (8, 0.8049539079996405),
 (9, 0.6272597874933992),
 (10, 0.5107785903794008),
 (11, 0.43626074019522393),
 (12, 0.3921064964095028),
 (13, 0.33217744228943064),
 (14, 0.3279126447623499),
 (15, 0.30466748671547156),
 (16, 0.3058080595231721),
 (17, 0.31682776417969644),
 (18, 0.3097579469447212),
 (19, 0.271073849419836)]

In [52]:
predictMatrix = mfModel.full_matrix()

In [53]:
recomenToUser = pd.DataFrame(predictMatrix, columns = mostSimilarDataset['business_id'].drop_duplicates(keep='first'))
recomenToUser = recomenToUser.iloc[0, :]
recomenToUser = recomenToUser.sort_values(ascending=False)
recomenToUser

business_id
s_8QFKyS5nZ6bT9wlojY_Q    5.728823
jqd4ymljpjd39KtyQB9MJw    5.512980
bncTqUdA8ZPcUkDDmUbqyA    5.477215
igC3UWYb9RF5CXOQOVypMw    5.460546
J8GCrwX8EDT7ULl9iM7cmg    5.444498
                            ...   
weU3ci5UJGa1T1o5gzNJQg    3.806028
9QUkLC8IV2pkNG7PqqQCWA    3.695280
dEpvByPMG0BZ3E9JOem0NQ    3.599024
9pc7HKeHRkBIcrjqcP_OeQ    3.425473
VgFnAjjhK3AnV3u1-UVB2A    3.408284
Name: 0, Length: 192, dtype: float64

In [32]:
businessToRec = list(recomenToUser[:10].index)
businessToRec

['s_8QFKyS5nZ6bT9wlojY_Q',
 'J8GCrwX8EDT7ULl9iM7cmg',
 'bncTqUdA8ZPcUkDDmUbqyA',
 'jqd4ymljpjd39KtyQB9MJw',
 'ZwtD0fpLScdoIKtIGdG26Q',
 'sL6fC0P4C-gyL4E5gacUeQ',
 '0-pY7oEuTT6j9fa1k-YBnw',
 '3IXtb1t1O0U-TRNv1a_VtA',
 'hVji-GdwHIxONtbi7K6dzQ',
 'igC3UWYb9RF5CXOQOVypMw']

In [37]:
userToRec = featuresDf.iloc[0, 0]
btr = reviewDataset.loc[reviewDataset['user_id'] == userToRec]
btr = list(btr['business_id'].values)

precision_metric = []
count = 0
for item in businessToRec:
    if item in btr:
        count += 1
        continue
    
precision_metric.append(count)

1