Data Science Assignment : Recommendation Systems

Dataset taken from: https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database

In [1]:
import pandas as pd
from math import sqrt
import numpy as np

I wanted to use the full rating dataset from Kaggle, but it was over 100MB, so I decided to slice the data down to around 5000 users.

I commented the code because I already saved the sliced dataset as a .csv file.

In [53]:
#for slicing the big dataset into a more manageable size
# ratings_df = pd.read_csv('rating.csv')
# ratings_df = ratings_df.iloc[0:491244, 0:3]
# ratings_df.to_csv("selected_user_anime_ratings.csv", index = False)
# ratings_df.tail()

Unnamed: 0,user_id,anime_id,rating
491239,5000,31043,9
491240,5000,31442,7
491241,5000,31637,8
491242,5000,31859,10
491243,5000,32245,8


In [91]:
#dropping unneeded data from the anime csv file
anime_df = pd.read_csv('anime.csv')
anime_df = anime_df.drop(["type", "episodes", "rating", "members"], 1)
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  12294 non-null  int64 
 1   name      12294 non-null  object
 2   genre     12232 non-null  object
dtypes: int64(1), object(2)
memory usage: 288.3+ KB


  anime_df = anime_df.drop(["type", "episodes", "rating", "members"], 1)


In [92]:
ratings_df = pd.read_csv('selected_user_anime_ratings.csv')
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491244 entries, 0 to 491243
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   491244 non-null  int64
 1   anime_id  491244 non-null  int64
 2   rating    491244 non-null  int64
dtypes: int64(3)
memory usage: 11.2 MB


In [123]:
anime_df.isnull().any()

anime_id    False
name        False
genre       False
dtype: bool

In [121]:
anime_df.dropna(subset= ["genre"], inplace= True)

In [124]:
anime_df.isnull().any()

anime_id    False
name        False
genre       False
dtype: bool

In [125]:
ratings_df.isnull().values.any()

False

In [95]:
userInput = [{'name':'Fullmetal Alchemist: Brotherhood', 'rating':9},
             {'name':'Mushishi', 'rating': 9},
             {'name':'Mob Psycho 100', 'rating':8},
             {'name':'Boku no Hero Academia', 'rating':5},
             {'name':'Acchi Kocchi (TV)', 'rating':4}]
inputAnime = pd.DataFrame(userInput)
print(inputAnime)

                               name  rating
0  Fullmetal Alchemist: Brotherhood       9
1                          Mushishi       9
2                    Mob Psycho 100       8
3             Boku no Hero Academia       5
4                 Acchi Kocchi (TV)       4


In [97]:
#dropping unneeded columns
inputId = anime_df[anime_df['name'].isin(inputAnime['name'].tolist())]
inputAnime = pd.merge(inputId, inputAnime)
inputAnime = inputAnime.drop("genre", 1)
inputAnime = inputAnime[['anime_id','name','rating']]
print(inputAnime)

   anime_id                              name  rating
0      5114  Fullmetal Alchemist: Brotherhood       9
1       457                          Mushishi       9
2     32182                    Mob Psycho 100       8
3     31964             Boku no Hero Academia       5
4     12291                 Acchi Kocchi (TV)       4


  inputAnime = inputAnime.drop("genre", 1)


In [98]:
userSubset = ratings_df[ratings_df['anime_id'].isin(inputAnime['anime_id'].tolist())]
print(userSubset.groupby('anime_id').count())

          user_id  rating
anime_id                 
457           405     405
5114         1849    1849
12291         302     302
31964         502     502
32182         277     277


In [99]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['user_id'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(492,        user_id  anime_id  rating
46098      492       457       8
46135      492      5114       7
46227      492     12291       8
46450      492     31964       8
46453      492     32182       8), (1176,         user_id  anime_id  rating
114655     1176       457       9
114768     1176      5114       9
114912     1176     12291       7
115228     1176     31964       8
115232     1176     32182       8), (1344,         user_id  anime_id  rating
133036     1344       457       8
133348     1344      5114      10
133682     1344     12291       6
134148     1344     31964       9
134157     1344     32182       8), (1504,         user_id  anime_id  rating
149250     1504       457      10
149382     1504      5114      10
149535     1504     12291       7
149771     1504     31964       9
149774     1504     32182       9), (1549,         user_id  anime_id  rating
155436     1549       457      10
155489     1549      5114      10
155568     1549     12291      10
155890     

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [100]:
pearsonCorrelationDict = {}
for name, group in userSubsetGroup:

    group = group.sort_values(by='anime_id')
    inputAnime = inputAnime.sort_values(by='anime_id')

    nRatings = len(group)

    temp_df = inputAnime[inputAnime['anime_id'].isin(group['anime_id'].tolist())]

    tempRatingList = temp_df['rating'].tolist()

    tempGroupList = group['rating'].tolist()
   
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [101]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarity_index']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarity_index  user_id
0         -0.476731      492
1          0.891883     1176
2          0.574960     1344
3          0.870388     1504
4         -0.238366     1549


In [102]:
topUsers=pearsonDF.sort_values(by='similarity_index', ascending=False)[0:50]
print(topUsers.head())

    similarity_index  user_id
76          1.000000      530
9           0.996116      562
78          0.989743      578
66          0.981981      244
74          0.981981      444


In [103]:
topUsersRating=topUsers.merge(ratings_df, left_on='user_id', right_on='user_id', how='inner')
print(topUsersRating.head(100))

    similarity_index  user_id  anime_id  rating
0                1.0      530         1       8
1                1.0      530        25       9
2                1.0      530        30      10
3                1.0      530        32      10
4                1.0      530        43      10
..               ...      ...       ...     ...
95               1.0      530      4654      10
96               1.0      530      4672      10
97               1.0      530      4715       9
98               1.0      530      4896       7
99               1.0      530      4938      10

[100 rows x 4 columns]


In [104]:
topUsersRating['weighted_rating'] = topUsersRating['similarity_index']*topUsersRating['rating']
print(topUsersRating.head())

   similarity_index  user_id  anime_id  rating  weighted_rating
0               1.0      530         1       8              8.0
1               1.0      530        25       9              9.0
2               1.0      530        30      10             10.0
3               1.0      530        32      10             10.0
4               1.0      530        43      10             10.0


In [105]:
tempTopUsersRating = topUsersRating.groupby('user_id').sum()[['similarity_index','weighted_rating']]
tempTopUsersRating.columns = ['sum_similarity_index','sum_weighted_rating']
print(tempTopUsersRating.head())

         sum_similarity_index  sum_weighted_rating
user_id                                           
17                 577.422123          2512.410733
38                 245.593512          1646.350182
123                322.419489          2274.964180
244                272.008600          2374.428864
271                326.278163          2405.416306


In [106]:
recommendation_df = pd.DataFrame()

recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weighted_rating']/tempTopUsersRating['sum_similarity_index']
recommendation_df['anime_id'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  anime_id
user_id                                                 
17                                    4.351082        17
38                                    6.703557        38
123                                   7.055914       123
244                                   8.729242       244
271                                   7.372287       271
294                                   7.922963       294
321                                   8.073090       321
444                                   7.611511       444
455                                   7.070175       455
530                                   8.301829       530


In [107]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  anime_id
user_id                                                 
1058                                  9.081013      1058
3657                                  8.856476      3657
244                                   8.729242       244
825                                   8.579882       825
680                                   8.564706       680
1753                                  8.541463      1753
530                                   8.301829       530
904                                   8.273585       904
3169                                  8.125000      3169
1504                                  8.123023      1504
3981                                  8.095541      3981
1015                                  8.081967      1015
321                                   8.073090       321
4740                                  8.000000      4740
963                                   7.974638       963
294                            

In [111]:
recommended_anime=anime_df.loc[anime_df['anime_id'].isin(recommendation_df['anime_id'])]

#we don't want to recommend the same movie
recommended_anime=recommended_anime.loc[~recommended_anime.anime_id.isin(userSubset['anime_id'])]

recommended_anime

Unnamed: 0,anime_id,name,genre
71,578,Hotaru no Haka,"Drama, Historical"
142,4081,Natsume Yuujinchou,"Drama, Fantasy, Shoujo, Slice of Life, Superna..."
614,2273,Mobile Suit Gundam Wing: Endless Waltz Movie,"Action, Drama, Mecha, Military, Sci-Fi, Space"
766,444,Maria-sama ga Miteru: Haru,"Drama, Romance, School, Shoujo, Shoujo Ai"
886,1015,Full Metal Panic! The Second Raid: Wari to Him...,Comedy
912,123,Fushigi Yuugi,"Adventure, Comedy, Drama, Fantasy, Historical,..."
976,17,Hungry Heart: Wild Striker,"Comedy, Shounen, Slice of Life, Sports"
1028,530,Bishoujo Senshi Sailor Moon,"Demons, Magic, Romance, Shoujo"
1114,455,Fantastic Children,"Adventure, Fantasy, Mystery, Romance, Sci-Fi"
1463,1327,Aoki Densetsu Shoot!,"Action, Comedy, Drama, Romance, School, Shoune..."
