# Making Recommendations Based on Popularity

In [None]:
# import the dataframes and explore the data
# introduce the hybrid metrics

## Preliminary data exploration

In [None]:
import os

import pandas as pd

# Folder holding the CSV files. Defaults to the original local path; set the
# RECSYS_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
DATA_DIR = os.environ.get(
    'RECSYS_DATA_DIR',
    '/Users/G/WBS Bootcamp/8. Recommender Systems/Data',
)

titles = ['links','movies','ratings','tags']


def path_csv(title):
    """Return the path (as a string) of the CSV file named `title` in DATA_DIR."""
    return f'{DATA_DIR}/{title}.csv'


# One dataframe per CSV file, unpacked in the same order as `titles`.
links, movies, ratings, tags = (pd.read_csv(path_csv(title)) for title in titles)

### Dataframes and Features description

* `links.csv`: Identifiers that can be used to link to other sources of movie data. Each line of this file after the header row represents one movie.
    * `imdbId` is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.

    * `tmdbId` is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.

* `ratings.csv`: Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

* `tags.csv`: Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.

* `Timestamps`: represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.



There are no missing values to impute, and no columns need to be converted to a more appropriate datatype.

## Rating

In [None]:
# Per-movie popularity statistics: the mean rating and the number of ratings.
# Both columns come from a single groupby pass via named aggregation, instead
# of grouping the same data twice and copying the second result over.
popularity = ratings.groupby(by='movieId').agg(
    avg_rating=("rating", "mean"),
    rating_count=("rating", "count"),
)




In [None]:
# Five movies with the highest mean rating. This ranking looks at the mean
# only, so it ignores how many ratings stand behind each average.
popularity.sort_values('avg_rating', ascending=False).head(5)

Unnamed: 0_level_0,avg_rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
88448,5.0,1
100556,5.0,1
143031,5.0,1
143511,5.0,1
143559,5.0,1


In [None]:
# Five movies with the largest number of ratings, regardless of how high
# those ratings are.
popularity.sort_values('rating_count', ascending=False).head(5)

Unnamed: 0_level_0,avg_rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,4.164134,329
318,4.429022,317
296,4.197068,307
593,4.16129,279
2571,4.192446,278


* Weighted average
$$ w_i = \frac{ c_i \cdot r_i}{\sum_j c_j} $$
where $w_i$ is the new hybrid measure, and $c_i$ and $r_i$ are the rating count and the average rating of the $i$-th movie; the sum in the denominator runs over all movies.

* Linear combination: we assign different weights to the counts and the ratings and then sum them

$$ \ell_i = a c_i + b r_i$$

In [None]:
def weight_hybrid(n,df):
    """Rank rows by a count-weighted rating and return the top `n`.

    A new frame is built from `df` with an extra `weight` column equal to
    `rating_count * avg_rating` divided by the sum of `rating_count` over
    all rows. `df` itself is not modified.

    Parameters
    ----------
    n : number of rows to return.
    df : DataFrame with `rating_count` and `avg_rating` columns.

    Returns
    -------
    DataFrame with the `n` rows of largest `weight`, sorted in descending
    order of `weight`.
    """
    total_count = df['rating_count'].sum()
    scored = df.assign(weight=df['rating_count'] * df['avg_rating'] / total_count)
    ranked = scored.sort_values(by="weight", ascending=False)
    return ranked.head(n)

# Show the ten movies with the largest weighted score.
weight_hybrid(n=10, df=popularity)

Unnamed: 0_level_0,avg_rating,rating_count,weight
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
318,4.429022,317,0.013924
356,4.164134,329,0.013586
296,4.197068,307,0.012778
2571,4.192446,278,0.011558
593,4.16129,279,0.011514
260,4.231076,251,0.010532
110,4.031646,237,0.009476
2959,4.272936,218,0.009238
527,4.225,220,0.009218
480,3.75,238,0.008851


In [13]:
def linear_hybrid(n, df, weight_counts):
    """Rank rows by a linear blend of scaled rating count and scaled rating.

    Every column of `df` is rescaled with scikit-learn's `MinMaxScaler`
    (default settings). The score
    `weight_counts * rating_count + (1 - weight_counts) * avg_rating`
    is then computed on the scaled values and stored in a column named
    ``f"lin. {weight_counts*100}%"``.

    Parameters
    ----------
    n : number of rows to return.
    df : DataFrame with `rating_count` and `avg_rating` columns; all of its
        columns are passed to the scaler.
    weight_counts : weight given to `rating_count`, in [0, 1]; `avg_rating`
        receives the complementary weight `1 - weight_counts`.

    Returns
    -------
    DataFrame with the `n` highest-scoring rows, sorted in descending order
    of the score. The values are the scaled ones, not the originals.

    Raises
    ------
    ValueError
        If `weight_counts` is not within [0, 1].
    """
    # Stop here on an invalid weight: merely printing a message would let the
    # function go on and return a ranking built from a meaningless blend.
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= weight_counts <= 1:
        raise ValueError("Weight must be in [0, 1]")

    # Kept as a function-local import so this cell stays self-contained.
    from sklearn.preprocessing import MinMaxScaler

    # Put both columns on a comparable scale before mixing them; counts would
    # otherwise dominate the ratings purely because of their magnitude.
    scaler = MinMaxScaler().set_output(transform="pandas")
    scaled = scaler.fit_transform(df)

    col_name = f"lin. {weight_counts*100}%"
    scaled[col_name] = (
        weight_counts * scaled['rating_count']
        + (1 - weight_counts) * scaled['avg_rating']
    )

    return scaled.sort_values(by=col_name, ascending=False).head(n)
# Show the top ten movies with 70% of the weight on the rating count.
linear_hybrid(n=10, df=popularity, weight_counts=0.7)

Unnamed: 0_level_0,avg_rating,rating_count,lin. 70.0%
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,0.814252,1.0,0.944276
318,0.873116,0.963415,0.936325
296,0.821571,0.932927,0.89952
593,0.81362,0.847561,0.837379
2571,0.820544,0.844512,0.837322
260,0.829128,0.762195,0.782275
110,0.78481,0.719512,0.739102
480,0.722222,0.722561,0.722459
527,0.827778,0.667683,0.715711
2959,0.83843,0.661585,0.714639
