In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from collections import defaultdict
from surprise import SVD
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split


# filterwarnings to ignore all unnecessary warnings and logs
import warnings
warnings.filterwarnings('ignore')

In [2]:
pur1 = pd.read_csv("phone_user_review_file_1.csv", encoding = "ISO-8859-1")
pur2 = pd.read_csv("phone_user_review_file_2.csv", encoding = "ISO-8859-1")
pur3 = pd.read_csv("phone_user_review_file_3.csv", encoding = "ISO-8859-1")
pur4 = pd.read_csv("phone_user_review_file_4.csv", encoding = "ISO-8859-1")
pur5 = pd.read_csv("phone_user_review_file_5.csv", encoding = "ISO-8859-1")
pur6 = pd.read_csv("phone_user_review_file_6.csv", encoding = "ISO-8859-1")

In [3]:
df = pd.concat([pur1,pur2,pur3,pur4,pur5,pur6])

In [4]:
df.head()

Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


In [5]:
df.shape

(1415133, 11)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1415133 entries, 0 to 163836
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   phone_url  1415133 non-null  object 
 1   date       1415133 non-null  object 
 2   lang       1415133 non-null  object 
 3   country    1415133 non-null  object 
 4   source     1415133 non-null  object 
 5   domain     1415133 non-null  object 
 6   score      1351644 non-null  float64
 7   score_max  1351644 non-null  float64
 8   extract    1395772 non-null  object 
 9   author     1351931 non-null  object 
 10  product    1415132 non-null  object 
dtypes: float64(2), object(9)
memory usage: 129.6+ MB


In [7]:
df.score.max()

10.0

In [8]:
df.score_max.max()

10.0

#### score and score_max have max value of 10.0, therefore we can reduce type of both these fields to float32 to save the memory.

In [9]:
df['score'] = df['score'].astype('float32')
df['score_max'] = df['score_max'].astype('float32')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1415133 entries, 0 to 163836
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   phone_url  1415133 non-null  object 
 1   date       1415133 non-null  object 
 2   lang       1415133 non-null  object 
 3   country    1415133 non-null  object 
 4   source     1415133 non-null  object 
 5   domain     1415133 non-null  object 
 6   score      1351644 non-null  float32
 7   score_max  1351644 non-null  float32
 8   extract    1395772 non-null  object 
 9   author     1351931 non-null  object 
 10  product    1415132 non-null  object 
dtypes: float32(2), object(9)
memory usage: 118.8+ MB


#### We reduced some memory usage 

In [11]:
df.head()

Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


In [12]:
df.isnull().sum()

phone_url        0
date             0
lang             0
country          0
source           0
domain           0
score        63489
score_max    63489
extract      19361
author       63202
product          1
dtype: int64

### Product has 1 null value and can be removed as we need random 1000 samples. 

In [13]:
df_sample = df[df['product'].notna()]

In [14]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1415132 entries, 0 to 163836
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   phone_url  1415132 non-null  object 
 1   date       1415132 non-null  object 
 2   lang       1415132 non-null  object 
 3   country    1415132 non-null  object 
 4   source     1415132 non-null  object 
 5   domain     1415132 non-null  object 
 6   score      1351643 non-null  float32
 7   score_max  1351643 non-null  float32
 8   extract    1395771 non-null  object 
 9   author     1351931 non-null  object 
 10  product    1415132 non-null  object 
dtypes: float32(2), object(9)
memory usage: 118.8+ MB


In [15]:
df_sample.duplicated().sum()

6412

### There are 6412 duplicate records is the current dataframe. We can drop duplicate rows.

In [16]:
dfnew = df_sample.drop_duplicates()

In [17]:
dfnew.shape

(1408720, 11)

In [18]:
dfnew.isnull().sum()

phone_url        0
date             0
lang             0
country          0
source           0
domain           0
score        63093
score_max    63093
extract      19014
author       61816
product          0
dtype: int64

In [19]:
data = dfnew.fillna(dfnew.median())

In [20]:
data.isnull().sum()

phone_url        0
date             0
lang             0
country          0
source           0
domain           0
score            0
score_max        0
extract      19014
author       61816
product          0
dtype: int64

### For Author and Extract, we can replace NaN values with NA(Not Available).

In [21]:
data.extract = data.extract.replace(np.NaN, 'NA')
data.author = data.author.replace(np.NaN, 'NA')

In [22]:
data.isnull().sum()

phone_url    0
date         0
lang         0
country      0
source       0
domain       0
score        0
score_max    0
extract      0
author       0
product      0
dtype: int64

In [23]:
data.score_max.value_counts()

10.0    1408720
Name: score_max, dtype: int64

#### We have only 1 value in score_max. We can round off values in Score feature.

In [24]:
data.score = data.score.round()

In [25]:
data.score.value_counts()

10.0    662591
8.0     304221
2.0     128332
6.0     117865
9.0     103518
4.0      73039
7.0       8737
5.0       4700
1.0       3182
3.0       2493
0.0         42
Name: score, dtype: int64

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1408720 entries, 0 to 163836
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   phone_url  1408720 non-null  object 
 1   date       1408720 non-null  object 
 2   lang       1408720 non-null  object 
 3   country    1408720 non-null  object 
 4   source     1408720 non-null  object 
 5   domain     1408720 non-null  object 
 6   score      1408720 non-null  float32
 7   score_max  1408720 non-null  float32
 8   extract    1408720 non-null  object 
 9   author     1408720 non-null  object 
 10  product    1408720 non-null  object 
dtypes: float32(2), object(9)
memory usage: 118.2+ MB


In [27]:
data['score'] = data['score'].astype('int32')
data['score_max'] = data['score_max'].astype('int32')

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1408720 entries, 0 to 163836
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   phone_url  1408720 non-null  object
 1   date       1408720 non-null  object
 2   lang       1408720 non-null  object
 3   country    1408720 non-null  object
 4   source     1408720 non-null  object
 5   domain     1408720 non-null  object
 6   score      1408720 non-null  int32 
 7   score_max  1408720 non-null  int32 
 8   extract    1408720 non-null  object
 9   author     1408720 non-null  object
 10  product    1408720 non-null  object
dtypes: int32(2), object(9)
memory usage: 118.2+ MB


In [29]:
data1 =data.copy()

In [30]:
data.drop(['phone_url','date','lang','country','source','domain','score_max','extract'], axis = 1, inplace = True)

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1408720 entries, 0 to 163836
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   score    1408720 non-null  int32 
 1   author   1408720 non-null  object
 2   product  1408720 non-null  object
dtypes: int32(1), object(2)
memory usage: 37.6+ MB


In [32]:
data2 = data.copy()

In [33]:
dataset = data.sample(n=1000000, random_state=612)

In [34]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 232368 to 314560
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   score    1000000 non-null  int32 
 1   author   1000000 non-null  object
 2   product  1000000 non-null  object
dtypes: int32(1), object(2)
memory usage: 26.7+ MB


## Most rated Feature

In [35]:
dataset.groupby('product')['score'].mean().sort_values(ascending=False).head()  

product
Nokia 6060 - TÃ©lÃ©phone cellulaire - GSM                                                                                  10.0
Nokia Unlocked Nokia 1320 RM 995 Yellow 8 GB (LTE Version)                                                                 10.0
Nokia XL Black                                                                                                             10.0
Nokia XL - White                                                                                                           10.0
Nokia XL - Smartphone libre Android (pantalla 5", cÃ¡mara 5 Mp, 4 GB, Dual-Core 1 GHz, 768 MB RAM), naranja [importado]    10.0
Name: score, dtype: float64

### Nokia 6060 - TÃ©lÃ©phone cellulaire - GSM is the most rated features

## Users with most number of reviews 

In [36]:
#Need to remove NA as an Author where NA means information not available
nna = dataset[dataset['author'] != 'NA']
nna['author'].value_counts().head()

Amazon Customer    54543
Cliente Amazon     13630
e-bit               5965
Client d'Amazon     5501
Amazon Kunde        3295
Name: author, dtype: int64

### Amazon Customer is the user with most number of reviews

In [37]:
df_auth = pd.DataFrame(columns=['author', 'counter'])

df_auth['author']=dataset['author'].value_counts().index.tolist() 
df_auth['counter'] = list(dataset['author'].value_counts() > 50)

In [38]:
indexes = df_auth[df_auth['counter'] == False].index

In [39]:
#dropping those indexes
df_auth.drop(indexes, inplace = True) 
df_auth.head()

Unnamed: 0,author,counter
0,Amazon Customer,True
1,,True
2,Cliente Amazon,True
3,e-bit,True
4,Client d'Amazon,True


In [40]:
#Similar steps for product
df_prod = pd.DataFrame(columns=['product', 'counter'])
df_prod['product']=dataset['product'].value_counts().index.tolist() 
df_prod['counter'] = list(dataset['product'].value_counts() > 50)
indexes = df_prod[df_prod['counter'] == False].index
df_prod.drop(indexes, inplace = True)
df_prod.head()

Unnamed: 0,product,counter
0,"Lenovo Vibe K4 Note (White,16GB)",True
1,"Lenovo Vibe K4 Note (Black, 16GB)",True
2,"OnePlus 3 (Graphite, 64 GB)",True
3,"OnePlus 3 (Soft Gold, 64 GB)",True
4,Samsung Galaxy Express I8730,True


In [41]:
#products from dataset
products = dataset[dataset['product'].isin(df_prod['product'])]

## Products having more than 50 ratings 

In [42]:
products.shape

(557576, 3)

In [87]:
products

Unnamed: 0,score,author,product
80369,2,Nickolas Gudmundson,"Apple iPhone 4S Verizon Cellphone, 16GB, White"
283355,6,vinod kumar,Nokia 5030
114568,8,Amazon Customer,ZTE Z Max 2 16GB Unlocked GSM 4G LTE Quad-Core...
74871,8,Rita Goes,Samsung Smartphone Samsung Galaxy S III Neo Du...
105717,8,NEVORA,Nokia 6170
...,...,...,...
117481,8,shivangi,"OnePlus 3T (Soft Gold, 6GB RAM + 64GB memory)"
88941,2,Sandra,"Samsung Galaxy S5 mini Smartphone (4,5 Zoll (1..."
17231,10,Arj,DOOGEE X5 Max Pro 5.0'' IPS 4G Smartphone Andr...
279967,6,Lloyd,"LG G4 US991 32GB Smartphone (Unlocked, Black L..."


In [43]:
authors =  dataset[dataset['author'].isin(df_auth['author'])]

## Users who have given more than 50 ratings

In [88]:
authors

Unnamed: 0,score,author,product
61365,10,Giorgio,"Huawei P9 Plus Smartphone, LTE, Display 5.5'' ..."
114568,8,Amazon Customer,ZTE Z Max 2 16GB Unlocked GSM 4G LTE Quad-Core...
368948,6,Amazon Customer,"Samsung Galaxy Grand Prime SM-G530H (White, 8GB)"
91785,6,Amazon Customer,BLU NEO 4.5 Unlocked (White Black)
311114,9,,Samsung Galaxy S3 I9300 16 GB Beyaz Cep Telefonu
...,...,...,...
237171,10,Amazon Customer,Asus Zenfone 2 Laser 5.5 ZE550KL-1B122IN (White)
331699,10,,Sony Ericsson W880i
338246,6,Amazon Customer,Vodafone Smart First 6 Pay As You Go Handset S...
58733,8,,Lenovo A2010


In [44]:
authors.shape

(211174, 3)

## A popularity based model with top 5 models

In [45]:
#avg ratings
ratings = pd.DataFrame(dataset.groupby('product')['score'].mean()) 
#count ratings
ratings['rating_counts'] = pd.DataFrame(dataset.groupby('product')['score'].count())
ratings.sort_values(by=['score','rating_counts'], ascending=[False,False]).head()

Unnamed: 0_level_0,score,rating_counts
product,Unnamed: 1_level_1,Unnamed: 2_level_1
Motorola Smartphone Motorola Moto X Desbloqueado Preto Android 4.2.2 CÃ¢mera 10MP e Frontal 2MP MemÃ³ria Interna de 16GB GSM,10.0,141
Motorola Smartphone Motorola Moto G Dual Chip Desbloqueado TIM Android 4.3 Tela 4.5 8GB 3G Wi-Fi CÃ¢mera 5MP - Preto,10.0,128
Samsung Galaxy Note5,10.0,127
Samsung Smartphone Dual Chip Samsung Galaxy SIII Duos Desbloqueado Claro Azul Android 4.1 3G/Wi-Fi CÃ¢mera 5MP,10.0,121
Nokia Smartphone Nokia Lumia 520 Desbloqueado Oi Preto Windows Phone 8 CÃ¢mera 5MP 3G Wi-Fi MemÃ³ria Interna 8G GPS,10.0,119


## Build a collaborative filtering model using SVD 

In [46]:
cols = ['author','product','score']
rev_df = data2.reindex(columns=cols)
sampledf = rev_df.sample(n=5000, random_state=612)

In [47]:
ratings = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(sampledf,reader = ratings)

In [48]:
train = data.build_full_trainset()

In [49]:
train.ur

defaultdict(list,
            {0: [(0, 7.0)],
             1: [(1, 2.0)],
             2: [(2, 10.0)],
             3: [(3, 6.0)],
             4: [(4, 10.0), (3591, 10.0)],
             5: [(5, 8.0),
              (7, 6.0),
              (12, 6.0),
              (85, 10.0),
              (87, 8.0),
              (115, 6.0),
              (118, 10.0),
              (137, 10.0),
              (146, 2.0),
              (188, 4.0),
              (197, 2.0),
              (216, 10.0),
              (270, 10.0),
              (118, 8.0),
              (301, 2.0),
              (342, 2.0),
              (357, 4.0),
              (368, 8.0),
              (374, 2.0),
              (393, 2.0),
              (197, 8.0),
              (433, 10.0),
              (451, 2.0),
              (197, 2.0),
              (485, 10.0),
              (488, 10.0),
              (197, 10.0),
              (523, 2.0),
              (197, 10.0),
              (548, 10.0),
              (137, 2.0),
             

In [50]:
svd = SVD()
svd.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e48d41ea88>

In [51]:
test = train.build_anti_testset()

In [52]:
preds = svd.test(test)
preds

[Prediction(uid='drprashams', iid='Apple iPhone 4S Verizon Cellphone, 16GB, White', r_ui=8.0768, est=7.546143346863421, details={'was_impossible': False}),
 Prediction(uid='drprashams', iid='Samsung Galaxy Note 3 Neo Smartphone (13,94 cm (5,49 Zoll) Super AMOLED-Touchscreen, 1,3 GHz Quad-Core-Prozessor, 8 Megapixel Kamera, Android 4.3) schwarz', r_ui=8.0768, est=7.936269327746195, details={'was_impossible': False}),
 Prediction(uid='drprashams', iid='Nokia 5030', r_ui=8.0768, est=7.802506064541559, details={'was_impossible': False}),
 Prediction(uid='drprashams', iid="Huawei P9 Plus Smartphone, LTE, Display 5.5'' FHD, 64 GB Memoria Interna, 4 GB RAM, Fotocamera 12 MP, Batteria 3400 mAh, Grigio", r_ui=8.0768, est=8.203464931533066, details={'was_impossible': False}),
 Prediction(uid='drprashams', iid='ZTE Z Max 2 16GB Unlocked GSM 4G LTE Quad-Core Android Smartphone w/ 8MP Camera - Black', r_ui=8.0768, est=8.185966407544337, details={'was_impossible': False}),
 Prediction(uid='drprasham

In [53]:
#recommendation of top 5 products for each user
topper = defaultdict(list)
for uid, iid, nos_true, est, _ in preds:
    topper[uid].append((iid, est))
    
for uid, ratings in topper.items():
    ratings.sort(key=lambda x: x[1], reverse=True)
    topper[uid] = ratings[:5]

In [54]:
topper

defaultdict(list,
            {'drprashams': [('Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]',
               8.845794111762599),
              ('Samsung Galaxy S7 32GB (Verizon)', 8.82527924796739),
              ('Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)',
               8.820030140398947),
              ('Nokia 6131', 8.70015355210577),
              ('Huawei Ascend P6', 8.68878879436581)],
             'Nickolas Gudmundson': [('Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)',
               8.542765755747185),
              ('Samsung Galaxy Express I8730', 8.414897204233128),
              ('Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]',
               8.40933617680253),
              ('Sams

In [55]:
#print recommendation for user's
for uid, ratings in topper.items():
    print(uid, [iid for (iid, _) in ratings])

drprashams ['Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'Samsung Galaxy S7 32GB (Verizon)', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Nokia 6131', 'Huawei Ascend P6']
Nickolas Gudmundson ['Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Samsung Galaxy Express I8730', 'Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'Samsung Galaxy S5 16GB (Verizon)', 'Samsung Galaxy S4 GT-I9500 16GB (Ñ\x87ÐµÑ\x80Ð½Ñ\x8bÐ¹)']
BulldoZer ['LG GX200', 'Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'Samsung Galaxy S7 edge Smartphone, 

Alexander Winter ['Samsung Galaxy S7 32GB (Verizon)', 'Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Samsung Galaxy S5 16GB (Verizon)', 'Huawei Ascend P6']
JOSÃ ANTONIO ['Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'LG GX200', 'Samsung Galaxy S5 16GB (Verizon)', 'Samsung Galaxy S7 edge 32GB (T-Mobile)', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)']
CatZapp ['Huawei Ascend P6', 'Samsung Galaxy Express I8730', 'LG GX200', 'Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Samsung Galaxy S7 edge 32GB (T-Mobile)']
JAFRACOSMETICOS ['Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, 

Melvin77 ['Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'LG Optimus Exceed 2 (Verizon Prepaid) (Discontinued by Manufacturer)', 'LG GX200']
FRACOUTINHOALMEIDA ['Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Huawei Ascend P6', 'Samsung Galaxy S5 16GB (Verizon)', 'Samsung Galaxy S7 edge 32GB (AT&T)', 'LG GX200']
Vivek Rafaliya ['Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Huawei Ascend P6', 'Nokia 5800', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Samsung Galaxy Express I8730']
peto71 ['Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Samsung Galaxy S5 16GB (Verizon)', 'ZTE Axon 7 Unlocked Smartphone,64GB Ion Gold (US Warranty)', 'Samsung Galaxy Express I873

Dietrich Ahrend ['Samsung Galaxy S7 edge 32GB (T-Mobile)', 'Samsung Galaxy S5 16GB (Verizon)', 'LG Optimus Exceed 2 (Verizon Prepaid) (Discontinued by Manufacturer)', 'Samsung Galaxy S7 32GB (Verizon)', 'Apple iPhone 3G 16Gb']
Cari ['Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Samsung Galaxy S7 edge 32GB (T-Mobile)', 'LG GX200', 'Samsung Galaxy S5 16GB (Verizon)']
lilsurfergirl ['Samsung Galaxy S5 16GB (Verizon)', 'Huawei Ascend P6', 'Samsung Galaxy S7 32GB (Verizon)', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Samsung Galaxy S7 edge 32GB (T-Mobile)']
MARINAB2004 ['Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'Samsung Galaxy S5 16GB (Verizon)', 'Smartphone Motorola Novo Moto G DTV Colors, Dual Chip, 3G, 16GB - XT1069', 'Samsung Galaxy S7 edge 32GB

teplyy ['Samsung Galaxy S5 16GB (Verizon)', 'Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Huawei P8 Lite Smartphone, Display 5" IPS, Processore Octa-Core 1.5 GHz, Memoria Interna da 16 GB, 2 GB RAM, Fotocamera 13 MP, monoSIM, Android 5.0, Bianco [Italia]', 'Huawei Honor 5X Unlocked Smartphone, 16GB Dark Grey (US Warranty)']
mirella ['Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)', 'Samsung Galaxy S7 32GB (Verizon)', 'Samsung Galaxy S7 edge 32GB (T-Mobile)', 'Huawei Ascend P6', 'Samsung Galaxy S5 16GB (Verizon)']
izzet ['Samsung Galaxy S5 16GB (Verizon)', 'Samsung Galaxy Express I8730', 'Samsung Galaxy S7 edge 32GB (T-Mobile)', 'Huawei Ascend P6', 'Samsung Galaxy S7 edge Smartphone, 13,9 cm (5,5 Zoll) Display, LTE (4G)']
dj_yeti ['Samsung Galaxy S5 16GB (Verizon)', 'Samsung Galaxy S6 64 GB UK SIM-Free Smartphone - Gold', 'Huawei Ascend P6', 'Huawei P8 Lite Smartphone, Display

## RMSE for Collaborative model of SVD 

In [57]:
#RMSE of SVD
accuracy.rmse(preds, verbose=True)

RMSE: 0.3311


0.3311180449452922

In [58]:
cross_validate(svd, data, measures=['RMSE'], cv=3, verbose=False)

{'test_rmse': array([2.59055552, 2.56286238, 2.52740383]),
 'fit_time': (0.1951451301574707, 0.20819520950317383, 0.2011566162109375),
 'test_time': (0.01559591293334961,
  0.008100509643554688,
  0.010134220123291016)}

In [59]:
#User based recommendations
collab = pd.DataFrame(preds, columns=['uid', 'iid', 'rui', 'est', 'details'])

In [60]:
def get_Iu(uid):
    try:
        return len(train.ur[train.to_inner_uid(uid)])
    except ValueError:
        return 0

In [61]:
collab['Iu'] = collab.uid.apply(get_Iu)

In [62]:
def get_Ui(iid):
    try: 
        return len(train.ir[train.to_inner_iid(iid)])
    except ValueError:
        return 0

In [63]:
collab['Ui'] = collab.iid.apply(get_Ui)

In [64]:
collab['err'] = abs(collab.est - collab.rui)

In [65]:
best = collab.sort_values(by='err')[:10]

In [66]:
best

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
11896488,Traxxas7,1100,8.0768,8.0768,{'was_impossible': False},1,1,3.020272e-08
15404956,P. BÃ¶ttcher,Huawei Honor 8 Unlocked Smartphone 32 GB Dual ...,8.0768,8.0768,{'was_impossible': False},1,2,4.582778e-08
5510377,Mr. J. S. Dare,×××¤×× ×¡××××¨× Apple iPhone SE 64GB S...,8.0768,8.0768,{'was_impossible': False},1,1,5.877269e-08
14223031,TheConsole,Samsung Galaxy S6 edge goud / 32 GB,8.0768,8.0768,{'was_impossible': False},1,1,6.580126e-08
7035448,JohnnyJohnJohn,Nokia 9110,8.0768,8.0768,{'was_impossible': False},1,1,2.292444e-07
8989147,Pokandi,Siemens CF75,8.0768,8.0768,{'was_impossible': False},1,1,2.370492e-07
3206113,Ð¡Ð¾Ð»Ð´Ð°ÑÐµÐ½ÐºÐ¾Ð² ÐÐ¸ÐºÑÐ¾Ñ,"LG Nexus 5X LG-H791 (16GB, Carbon)",8.0768,8.0768,{'was_impossible': False},1,1,2.425113e-07
15711136,Jazznj25,Apple iPhone 5C 32GB bianco,8.0768,8.0768,{'was_impossible': False},1,1,2.444823e-07
5323782,Iciar Burgos Iribarren,"Motorola Moto G 3rd Generation (Black, 16GB)",8.0768,8.0768,{'was_impossible': False},1,4,2.460611e-07
9538404,Nas??r,Apple iPhone 6s Plus 64GB 4G Oro,8.0768,8.0768,{'was_impossible': False},1,1,2.642508e-07


## KNN With Means 

### Item based 

In [67]:
rating = Reader(rating_scale=(1, 10))
knnData = Dataset.load_from_df(sampledf,reader = rating)

In [68]:
train_knn, testset_knn = train_test_split(knnData, test_size=.15)

In [69]:
knn = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
knn.fit(train_knn)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1e738bb0808>

In [72]:
knn_preds = knn.test(testset_knn)
knn_preds

[Prediction(uid='Cliente Amazon', iid='Asus ZenFone 2 Selfie Smartphone, 32 GB, Dual SIM, Aqua Blu [Italia]', r_ui=10.0, est=10, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='Pooh', iid='LG Muziq', r_ui=6.0, est=8.063764705882353, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='Yan Eva', iid='Nokia Asha 501 Dual Sim', r_ui=10.0, est=8.063764705882353, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='Ð\x90Ñ\x81Ñ\x82Ð°Ñ\x85Ð¾Ð² Ð\x9aÐ¸Ñ\x80Ð¸Ð»Ð»', iid='Alcatel PIXI 3(4.5) 4027D', r_ui=4.0, est=8.063764705882353, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='patriaconlon', iid='Alcatel One Touch Club', r_ui=8.0, est=8.063764705882353, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='MICHELLE', iid='Samsung Galaxy S5 Charcoal Black - Unlocked', r_ui=9.0, est=8.063764705882353, details={

In [74]:
#rmse
accuracy.rmse(knn_preds, verbose=True)

RMSE: 2.5133


2.5132541366999903

### User based 

In [76]:
ratings = Reader(rating_scale=(1, 10))
knndata_user = Dataset.load_from_df(sampledf,reader = ratings)

In [77]:
train_knnU, test_knnU = train_test_split(knndata_user, test_size=.15)

In [78]:
knnU = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
knnU.fit(train_knnU)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1e73804a9c8>

## Score prediction 

In [89]:
pred_knnU = knnU.predict('Amazon Customer', 'Lenovo Phab 2 Plus Smartphone (Grey, JBL earphones)', verbose=True)

user: Amazon Customer item: Lenovo Phab 2 Plus Smartphone (Grey, JBL earphones) r_ui = None   est = 7.50   {'actual_k': 5, 'was_impossible': False}


In [81]:
test_knn_U = knnU.test(test_knnU)
test_knn_U

[Prediction(uid='RICHARD', iid='Samsung Galaxy S5 Shimmery White - Unlocked', r_ui=8.0, est=8.115764705882352, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='Nikesh Jain', iid='Nokia Asha 305 - Dark Grey', r_ui=8.0, est=8.115764705882352, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='N. Stein', iid='Blackberry Q5 Smartphone (7,84 cm (3.1 Zoll) Display, QWERTZ-Tastatur, 5 MP Kamera, 8 GB interner Speicher, NFC, Blackberry 10.1 Betriebssystem) weiÃ\x9f', r_ui=10.0, est=8.115764705882352, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='BB', iid='BLU Tank II T193 Unlocked GSM Dual-SIM Cell Phone w/ Camera and 1900 mAh Big Battery - Unlocked Cell Phones - Retail Packaging - Black Yellow', r_ui=10.0, est=8.115764705882352, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'}),
 Prediction(uid='Soukki44', iid='Yousave Accessories Co

In [82]:
#rmse
accuracy.rmse(test_knn_U, verbose=True)

RMSE: 2.7249


2.724925222309068

In [86]:
cross_validate(knnU,knndata_user, measures=['RMSE'], cv=3, verbose=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([2.58966046, 2.64267714, 2.55421837]),
 'fit_time': (0.17769622802734375, 0.1802055835723877, 0.1625368595123291),
 'test_time': (0.018187761306762695, 0.0, 0.0)}

## Findings and inferences

Motorola Smartphone Motorola Moto X Desbloqueado Preto Android 4.2.2 CÃ¢mera 10MP e Frontal 2MP MemÃ³ria Interna de 16GB GSM is the most popular product.

Amazon Customer is the most active author who writes reviews.

Lenovo Vibe K4 Note (White,16GB) was rated by most of the authors.

## In what business scenario you should use popularity based Recommendation Systems 

Popularity based recommendation system relies on the popularity,trends and frequency counts of which items were most purchased. These are used for new customers or for users, to recommend them the most used, viewed or trending products.

## In what business scenario you should use CF based Recommendation Systems 

Collaborative Filtering is used to building intelligent recommender systems that can learn to give better recommendations as more information about users is collected. Recommendations are made based on the past behaviour of the user. Most websites like Netflix, Amazon prime, etc. use collaborative filtering as a recommendation system.

## Possible methods can further improve the recommendation for diﬀerent users

1. Content based recommendation systems
2. Geographic (country based or nationality based)
3. Demographic e.g. age, gender, etc.
4. Knowledge based (knowledge about users)