In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [2]:
df = pd.read_csv('dataset_food_online.txt', encoding = "ISO-8859-1")

In [3]:
#Check Null values in Dataframe
df.isnull().sum()

business_id    0
date           0
review_id      0
stars          0
text           0
type           0
user_id        0
cool           0
useful         0
funny          0
dtype: int64

In [4]:
import string
import re
def clean_text(text):
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text

In [5]:
yelp_data = df[['business_id', 'user_id', 'stars', 'text']]

In [6]:
%%time
yelp_data['text'] = yelp_data['text'].apply(clean_text)

CPU times: user 3.04 s, sys: 330 ms, total: 3.37 s
Wall time: 3.4 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
userid_df = yelp_data[['user_id','text']]
business_df = yelp_data[['business_id', 'text']]

In [8]:
userid_df = userid_df.groupby('user_id').agg({'text': ' '.join})
business_df = business_df.groupby('business_id').agg({'text': ' '.join})

In [9]:
#userid vectorizer
userid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=1000)
userid_vectors = userid_vectorizer.fit_transform(userid_df['text'])
userid_vectors.shape
# print(userid_vectors)
# pd.DataFrame(userid_vectors.toarray(), columns=userid_vectorizer.get_feature_names())


(6403, 1000)

In [10]:
#Business id vectorizer
businessid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=1000)
businessid_vectors = businessid_vectorizer.fit_transform(business_df['text'])
businessid_vectors.shape

(4174, 1000)

# Matrix Factorization

In [11]:
userid_rating_matrix = pd.pivot_table(yelp_data, values='stars', index=['user_id'], columns=['business_id'])
userid_rating_matrix.shape

(6403, 4174)

In [12]:
P = pd.DataFrame(userid_vectors.toarray(), index=userid_df.index, columns=userid_vectorizer.get_feature_names())
Q = pd.DataFrame(businessid_vectors.toarray(), index=business_df.index, columns=businessid_vectorizer.get_feature_names())


In [19]:
def matrix_factorization(R, P, Q, steps=100, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        print('Hello')
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    #Sum of squares of the errors in the rating
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        print(e)
        if e<0.001:
            break
        
        print(step)
    return P,Q


In [20]:
%%time
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=100, gamma=0.001,lamda=0.02)

Hello
94410.43066203894
0
Hello
93381.72060093234
1
Hello
92350.63352391933
2
Hello
91317.81695811883
3
Hello
90283.91629822039
4
Hello
89249.57185507845
5
Hello
88215.41603573406
6
Hello
87182.07068339906
7
Hello
86150.14460202609
8
Hello
85120.23128572013
9
Hello
84092.90686874308
10
Hello
83068.72830728494
11
Hello
82048.23179970503
12
Hello
81031.93144763845
13
Hello
80020.31815639765
14
Hello
79013.85876952707
15
Hello
78012.99542920396
16
Hello
77018.14515152547
17
Hello
76029.69960356972
18
Hello
75048.02506743163
19
Hello
74073.46257524427
20
Hello
73106.3281984401
21
Hello
72146.91347415502
22
Hello
71195.48595165205
23
Hello
70252.28984199131
24
Hello
69317.54675466583
25
Hello
68391.45650572582
26
Hello
67474.19798280421
27
Hello
66565.93005348009
28
Hello
65666.79250452477
29
Hello
64776.907000711806
30
Hello
63896.37805302121
31
Hello
63025.29398721684
32
Hello
62163.7279048977
33
Hello
61311.73863017446
34
Hello
60469.37163616137
35
Hello
59636.659946404325
36
Hello
58813

(4174, 1000)

In [21]:
#Testing
print('Predict for the given text')
words = 'I am intrested in vegeterian restaruents and I want eat falafal oh my god'
test_df= pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(clean_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=userid_vectorizer.get_feature_names())

predictItemRating=pd.DataFrame(np.dot(test_v_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:3]
topRecommendations


Unnamed: 0_level_0,Rating
business_id,Unnamed: 1_level_1
WNy1uzcmm_UHmTyR--o5IA,0.536477
vaabk6CYXX1dYVQ1xkyPUg,0.532536
OovMUso3GHEuvwDObeHy0Q,0.462432
R8VwdLyvsp9iybNqRvm94g,0.436837
GfMRsE7rYqNyfC6wUYZkzA,0.432662


In [1]:
#Store P, Q and vectorizer in pickle file
import pickle
output = open('recommendation.pkl', 'wb')
pickle.dump(P,output)
pickle.dump(Q,output)
pickle.dump(userid_vectorizer,output)
output.close()

NameError: name 'P' is not defined