### Data Mining and Machine Learning
### Content-based Recommendation Systems
#### Edgar Acuna
#### Dataset Movies: 2500 movies, 862 users and 94875 Tags
#### May 2021

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import math

In [2]:
# Read the ratings, movies and tags tables from their remote CSV files.
def _load_table(url):
    """Return the CSV at `url` as a DataFrame, decoding the text as latin-1."""
    return pd.read_csv(url, encoding='latin-1')


Ratings = _load_table('https://academic.uprm.edu/eacuna/ratings.csv')
Movies = _load_table('https://academic.uprm.edu/eacuna/movies.csv')
Tags = _load_table('https://academic.uprm.edu/eacuna/tags.csv')
# Preview the tag assignments.
Tags.head()

Unnamed: 0,movieId,userId,tag,timestamp
0,3916,12882,sports,1147195545
1,4085,12882,Eddie Murphy,1147195966
2,33660,12882,boxing,1147195514
3,1197,320,must show,1145964801
4,1396,320,must show,1145964810


### Calculating the TF (term frequency) and IDF (inverse document frequency) values and multiplying them together to get the TF-IDF value

In [3]:
# Term frequency (TF): for every (movieId, tag) pair, the number of non-null
# userId entries, i.e. how many times that tag was applied to that movie.
TF = (
    Tags
    .groupby(['movieId', 'tag'], as_index=False, sort=False)[['userId']]
    .count()
    .rename(columns={'userId': 'tag_count_TF'})
)

# Document frequency (DF): for every tag, the number of distinct movies it
# appears on (duplicates of the same tag on the same movie are removed first).
Tag_distinct = Tags[['tag', 'movieId']].drop_duplicates()
DF = (
    Tag_distinct
    .groupby(['tag'], as_index=False, sort=False)[['movieId']]
    .count()
    .rename(columns={'movieId': 'tag_count_DF'})
)

# IDF = log10(number of distinct tagged movies) - log10(DF).
log10_movie_count = math.log10(np.unique(Tags['movieId']).size)
DF['IDF'] = log10_movie_count - np.log10(DF['tag_count_DF'])

# Attach DF/IDF to every (movie, tag) row and form the TF-IDF weight.
TF = TF.merge(DF, on='tag', how='left', sort=False)
TF['TF-IDF'] = TF['tag_count_TF'] * TF['IDF']
TF.tail()

Unnamed: 0,movieId,tag,tag_count_TF,tag_count_DF,IDF,TF-IDF
55101,6942,funny,1,160,1.192951,1.192951
55102,6942,Nudity (Topless - Notable),1,46,1.734313,1.734313
55103,6942,Nudity (Topless),1,123,1.307165,1.307165
55104,6947,death of child,1,2,3.096041,3.096041
55105,33679,Strong Women,1,6,2.618919,2.618919


### Calculating the unit-length vector by dividing each TF-IDF value by the vector length of the corresponding movie.

In [4]:
# Scale each movie's TF-IDF vector to unit (Euclidean) length:
#   TAG_WT = TF-IDF / sqrt(sum of squared TF-IDF values of that movie)
#
# This cell extends TF in place. Drop any columns left by a previous run so
# that re-executing the cell does not merge them a second time (which would
# create '_x'/'_y' suffixed duplicates and break the 'vect_len' lookup below).
# errors='ignore' makes the drop a no-op on the first run.
TF = TF.drop(columns=['TF-IDF-Sq-sum', 'vect_len', 'TAG_WT'], errors='ignore')

# Sum of squared TF-IDF values per movie. `assign` returns a new frame, so no
# column is written into a slice of TF (the source of SettingWithCopyWarning).
Vect_len = (
    TF[['movieId', 'TF-IDF']]
    .assign(**{'TF-IDF-Sq': TF['TF-IDF'] ** 2})
    .groupby('movieId', as_index=False, sort=False)[['TF-IDF-Sq']]
    .sum()
    .rename(columns={'TF-IDF-Sq': 'TF-IDF-Sq-sum'})
)
Vect_len['vect_len'] = np.sqrt(Vect_len['TF-IDF-Sq-sum'])

# Attach the per-movie length to every (movie, tag) row and normalise.
TF = pd.merge(TF, Vect_len, on='movieId', how='left', sort=False)
TF['TAG_WT'] = TF['TF-IDF'] / TF['vect_len']
TF.head()

Unnamed: 0,movieId,tag,tag_count_TF,tag_count_DF,IDF,TF-IDF,TF-IDF-Sq-sum,vect_len,TAG_WT
0,3916,sports,3,54,1.664677,4.99403,357.772839,18.914884,0.264026
1,4085,Eddie Murphy,7,14,2.250943,15.756598,731.448077,27.045297,0.5826
2,33660,boxing,9,18,2.141798,19.276182,1308.849619,36.178027,0.532815
3,1197,must show,1,5,2.698101,2.698101,2755.122656,52.489262,0.051403
4,1396,must show,1,5,2.698101,2.698101,578.758692,24.057404,0.112153


#### Calculating the user profile: the sum of the item-tag vectors of all items the user has 
#### rated positively (>= 3.5 stars). 

In [5]:
# Build each user's tag-preference profile: for every tag, the sum of TAG_WT
# over all movies the user rated positively (rating >= 3.5).
Ratings_filter = Ratings[Ratings['rating'] >= 3.5]

# Users are taken from the positively-rated subset: a user without any rating
# >= 3.5 would only contribute an empty profile. This is also the list the
# scoring step derives from Ratings_filter, so both steps pick the same users.
distinct_users = np.unique(Ratings_filter['userId'])

# NOTE: only the second user is profiled here (slice [1:2]); widen the slice
# to build profiles for more users.
users_to_profile = distinct_users[1:2]

# Collect one frame per user and concatenate once at the end. DataFrame.append
# no longer exists in pandas 2.x, and growing a frame inside a loop is quadratic.
user_profiles = []
for i, user in enumerate(users_to_profile, start=1):
    # enumerate keeps the progress counter in step with the loop.
    if i % 30 == 0:
        print('user: ', i, 'out of: ', len(users_to_profile))
    user_data = Ratings_filter[Ratings_filter['userId'] == user]
    user_data = pd.merge(TF, user_data, on='movieId', how='inner', sort=False)
    # Sum only TAG_WT instead of every merged column.
    user_data1 = (
        user_data
        .groupby('tag', as_index=False, sort=False)[['TAG_WT']]
        .sum()
        .rename(columns={'TAG_WT': 'tag_pref'})
    )
    user_data1['user'] = user
    user_profiles.append(user_data1)

if user_profiles:
    user_tag_pref = pd.concat(user_profiles, ignore_index=True)
else:
    # No user selected: keep the expected columns so later lookups still work.
    user_tag_pref = pd.DataFrame(columns=['tag', 'tag_pref', 'user'])
user_tag_pref.head()

Unnamed: 0,tag,tag_pref,user
0,must show,0.39623,320
1,based on a book,0.733053,320
2,Cary Elwes,0.159748,320
3,classic,0.878186,320
4,fairy tale,0.371488,320


In [12]:
# Score every tagged movie against each user profile with cosine similarity:
#
#   Rating(user, movie) = sum_t TAG_WT[movie, t] * tag_pref[user, t]
#                         / (||TAG_WT[movie]|| * ||tag_pref[user]||)
#
# One merge per user replaces the former merge-per-movie loop. Its progress
# counters were incremented outside their loops, so its progress message could
# never be printed; that loop also relied on DataFrame.append, which no longer
# exists in pandas 2.x.
distinct_users = np.unique(Ratings_filter['userId'])

# Euclidean length of each movie's tag-weight vector, indexed by movieId.
movie_norm = np.sqrt((TF['TAG_WT'] ** 2).groupby(TF['movieId']).sum())

user_scores = []
# NOTE: only the second user is scored here (slice [1:2]); widen the slice to
# score more users (their profiles must exist in user_tag_pref).
for user in distinct_users[1:2]:
    user_tag_pref_all = user_tag_pref[user_tag_pref['user'] == user]
    # Euclidean length of the user's tag-preference vector.
    tag_pref_val = np.sqrt((user_tag_pref_all['tag_pref'] ** 2).sum())

    # An inner join keeps only tags shared by the movie and the profile. Tags
    # missing from the profile add 0 to the dot product, and a movie sharing
    # no tag with the profile gets no row. User ids are not upcast to float,
    # because no NaN rows are introduced.
    tag_merge = pd.merge(
        TF[['movieId', 'tag', 'TAG_WT']],
        user_tag_pref_all,
        on='tag', how='inner', sort=False,
    )
    tag_merge['tag_value'] = tag_merge['TAG_WT'] * tag_merge['tag_pref']

    # Dot product per movie; groupby sorts by movieId.
    tag_merge_final = (
        tag_merge
        .groupby(['user', 'movieId'], as_index=False)[['tag_value']]
        .sum()
        .rename(columns={'tag_value': 'Rating'})
    )
    tag_merge_final['Rating'] = tag_merge_final['Rating'] / (
        tag_merge_final['movieId'].map(movie_norm) * tag_pref_val
    )
    user_scores.append(tag_merge_final)

if user_scores:
    tag_merge_all = pd.concat(user_scores, ignore_index=True)
else:
    # No user selected: keep the expected columns.
    tag_merge_all = pd.DataFrame(columns=['user', 'movieId', 'Rating'])
# reset_index() materialises the row number as an 'index' column.
tag_merge_all = tag_merge_all.sort_index().reset_index()

In [13]:
# Preview the first rows of the scored (user, movieId, Rating) table.
tag_merge_all.head()

Unnamed: 0,index,user,movieId,Rating
0,0,320.0,1,0.12605
1,1,320.0,2,0.062965
2,2,320.0,3,0.00797
3,3,320.0,4,0.026146
4,4,320.0,5,0.014172
