# String Content Based Recommender System


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
songs=pd.read_csv('music.csv')
songs.head()

Unnamed: 0,title,genres,year
0,I Didn't Mean To,Jazz,1997
1,Soul Deep,Jazz|Hate|Rock,1969
2,Amor De Cabaret,Folk|Romance,1997
3,Something Girls,Folk|Classic|Romance,1982
4,Face the Ashes,Folk,2007


In [3]:
songs.shape

(9643, 3)

In [4]:
songs.isna().sum()

title     0
genres    0
year      0
dtype: int64

In [5]:
import warnings
warnings.filterwarnings('ignore')
songs['genres']=songs['genres'].str.replace('|', ' ')

In [6]:
songs.head(5)

Unnamed: 0,title,genres,year
0,I Didn't Mean To,Jazz,1997
1,Soul Deep,Jazz Hate Rock,1969
2,Amor De Cabaret,Folk Romance,1997
3,Something Girls,Folk Classic Romance,1982
4,Face the Ashes,Folk,2007


# Term Frequency and Inverse Document Frequency Matrix

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
tfidf_vector=TfidfVectorizer(stop_words='english')
tfidf_matrix=tfidf_vector.fit_transform(songs['genres'])
print(tfidf_matrix[:5])

  (0, 12)	1.0
  (1, 17)	0.5940761164298742
  (1, 9)	0.6233790368007639
  (1, 12)	0.5084015581850143
  (2, 18)	0.819170276063785
  (2, 7)	0.5735503977974231
  (3, 3)	0.46583696216345366
  (3, 18)	0.724859668658722
  (3, 7)	0.5075178671084329
  (4, 7)	1.0


In [11]:
print(tfidf_vector.get_feature_names())

['art', 'base', 'bass', 'classic', 'comb', 'documentary', 'film', 'folk', 'genres', 'hate', 'heavymetal', 'imax', 'jazz', 'listed', 'love', 'musical', 'rap', 'rock', 'romance', 'solo', 'war', 'western']


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
cosine_similarity(tfidf_matrix)

array([[1.        , 0.50840156, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.50840156, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.5735504 ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.5735504 , ..., 0.        , 0.        ,
        1.        ]])

In [13]:
songs.head()

Unnamed: 0,title,genres,year
0,I Didn't Mean To,Jazz,1997
1,Soul Deep,Jazz Hate Rock,1969
2,Amor De Cabaret,Folk Romance,1997
3,Something Girls,Folk Classic Romance,1982
4,Face the Ashes,Folk,2007


## Read more about fuzzy string matching here
https://marcobonzanini.com/2015/02/25/fuzzy-string-matching-in-python/

https://towardsdatascience.com/fuzzy-string-matching-algorithms-e0d483c2a9ea

https://en.wikipedia.org/wiki/Levenshtein_distance

In [None]:
!pip install fuzzywuzzy

In [16]:
from fuzzywuzzy import fuzz
fuzz.ratio("songs","music")

20

In [20]:
searchterm = "Iowa"

In [21]:
# Calculate fuzzy scores.
fuzzy_scores = list(fuzz.ratio(songs['title'][i], searchterm) for i in range(songs.shape[0]))
songs["song_scores"] = fuzzy_scores
songs.head()

Unnamed: 0,title,genres,year,song_scores
0,I Didn't Mean To,Jazz,1997,20
1,Soul Deep,Jazz Hate Rock,1969,15
2,Amor De Cabaret,Folk Romance,1997,21
3,Something Girls,Folk Classic Romance,1982,11
4,Face the Ashes,Folk,2007,11


In [22]:
recommended = list(songs.nlargest(10, 'song_scores' ).title)
print("Here is the list of songs similar to "  + searchterm)
for item in recommended:
    print(item)

Here is the list of songs similar to Iowa
Rowena
Wow
Isolate
Chowbay
Into A Swan
Snowball
Loba
Iron Man
I'm Down
I Know A Man


In [None]:
# Try putting above together as a function to recommend songs.

# Import Libraries

In [None]:
import os
import time
import math
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

import matplotlib.pyplot as plt


%matplotlib inline

# Item Based Collaborative Filtering

# Load Dataset

In [23]:
booksratings = pd.read_csv('books_ratings.csv',
                           sep=';', 
                           error_bad_lines=False, 
                           encoding="latin-1")
booksratings.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [24]:
booksratings.shape

(1149780, 3)

In [25]:
users = pd.read_csv('users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
users.head(3)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [26]:
users.shape

(278858, 3)

In [27]:
books = pd.read_csv('books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
books.head(3)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...


In [28]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [29]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [30]:
booksratings.isna().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [31]:
booksratings.columns = ["userid", "isbn", "rating" ]
booksratings.head()

Unnamed: 0,userid,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [32]:
booksratings.shape

(1149780, 3)

In [33]:
exampledf = booksratings.sample(n=8)
utility_matrix = pd.pivot_table(exampledf, values='rating', index='userid', columns='isbn')
utility_matrix

isbn,0066211611,0140049401,0345300785,0486284735,075640021X,1551660571,2266092766,8478770194
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
32818,,0.0,,,,,,
53628,,,,,,,0.0,
78783,,,,,,0.0,,
98391,,,,,8.0,,,
153621,,,,,,,,0.0
204864,0.0,,,,,,,
228998,,,0.0,,,,,
233747,,,,10.0,,,,


In [34]:
exampledf.shape

(8, 3)

In [None]:
#frequent_users = list(idcounts['userid'])
#ratingdf = booksratings[booksratings["userid"].isin(frequent_users)]                              

In [35]:
len(list(set(list(booksratings.userid))))

105283

![](utilitymattrix.png)

In [36]:
counted = booksratings.userid.value_counts().to_frame().reset_index()
counted.columns = ["userid", "number_of_userids"]
mostfrequent = list(counted[counted.number_of_userids> 200].userid)
len(mostfrequent)

899

In [37]:
booksratings.shape

(1149780, 3)

In [38]:
filtered_booksratings = booksratings[booksratings["userid"].isin(mostfrequent)]
print(filtered_booksratings.shape)

(526356, 3)


In [39]:
newdf = filtered_booksratings 

## SurPRISE: Simple Python Recommender System Engine
## https://surpriselib.com/
##  !pip install scikit-surprise

In [40]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [41]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(newdf[['userid', 'isbn', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)
rs_svd = SVD(n_epochs=20, lr_all=0.005, reg_all=0.2)
rs_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x122806e80>

In [42]:
predictions = rs_svd.test(testset)
accuracy.rmse(predictions)

RMSE: 3.0965


3.096501361107497

In [43]:
from surprise.model_selection import cross_validate
cross_validate(rs_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.0956  3.0941  3.1033  3.1108  3.1097  3.1027  0.0069  
MAE (testset)     2.3471  2.3432  2.3535  2.3597  2.3552  2.3517  0.0059  
Fit time          4.71    4.88    4.79    4.11    4.44    4.59    0.28    
Test time         0.51    0.31    0.53    0.32    0.36    0.41    0.09    


{'test_rmse': array([3.09562115, 3.09406554, 3.10327972, 3.11078853, 3.10966801]),
 'test_mae': array([2.34706287, 2.34320361, 2.35345829, 2.3597373 , 2.35515134]),
 'fit_time': (4.713491916656494,
  4.8816750049591064,
  4.792021036148071,
  4.112600803375244,
  4.443594932556152),
 'test_time': (0.5094010829925537,
  0.3143491744995117,
  0.5255200862884521,
  0.3161332607269287,
  0.36098217964172363)}

In [44]:
testset[:10]

[(162738, '044017158X', 0.0),
 (29259, '0449006565', 0.0),
 (6251, '0399138250', 0.0),
 (170575, '0061097101', 0.0),
 (148744, '0553583441', 0.0),
 (69971, '039480001X', 0.0),
 (113519, '157324175X', 0.0),
 (222941, '0373825021', 0.0),
 (135149, '1573221112', 10.0),
 (75860, '0060083298', 0.0)]

In [45]:
testdf = pd.DataFrame(testset, columns=['userid', 'isbn', 'rating'])
test_umatrix = testdf.pivot(index='userid', columns='isbn', values='rating')
pd.DataFrame(test_umatrix).head()

isbn,0586045007,9022906116,9032803328,904492401X,9061002273,+0451197399,0 00 612183 7,000000000,0000000029841,0000000033220,...,N3612250612,O38080560X,O385509510,O39428178O,O451203771,O499139921,O59052528X,T110105705048,THEALLTRUETRA,"\2842053052\"""""
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,,,,,,,,,,,...,,,,,,,,,,
2276,,,,,,,,,,,...,,,,,,,,,,
2766,,,,,,,,,,,...,,,,,,,,,,
2977,,,,,,,,,,,...,,,,,,,,,,
3363,,,,,,,,,,,...,,,,,,,,,,


In [46]:
predictions[0]

Prediction(uid=162738, iid='044017158X', r_ui=0.0, est=1.3330621613089382, details={'was_impossible': False})

In [47]:
booksratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   userid  1149780 non-null  int64 
 1   isbn    1149780 non-null  object
 2   rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [48]:
userid = 254
isbn = "9044922564"
prediction = rs_svd.predict(userid, isbn)
print(prediction.est)

1.3858291905654019


In [49]:
isbn = list(pd.DataFrame(test_umatrix).columns)
len(isbn)

67028

In [50]:
predictions254 = [rs_svd.predict(userid, x).est for x in isbn]

In [51]:
predictions254[:10]

[0.7906689713878758,
 1.0316772647643795,
 0.942993390665936,
 1.3858291905654019,
 0.9673644588393557,
 0.9826829998038764,
 1.6542226922941787,
 1.3500320300865172,
 1.4665577973581967,
 1.2084598748174338]

In [53]:
predictions254[67020:]

[1.301396627228824,
 1.9586956311607586,
 1.3858291905654019,
 1.3858291905654019,
 1.3845866017998687,
 1.4184086680202224,
 1.5707778130971333,
 1.3630235306301497]