## 1 Preliminary data exploration

In [2]:
import os

import pandas as pd


titles = ['links','movies','ratings','tags']

# Folder that holds the four CSV files. It defaults to the original local
# path; set the WBSFLIX_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get(
    'WBSFLIX_DATA_DIR',
    '/Users/G/Desktop/Documents/Formazione in Data Science/WBS/WBS Bootcamp/8. Recommender Systems/Data',
)


def path_csv(title):
    """Return the path of the file ``<title>.csv`` inside ``DATA_DIR``."""
    return f'{DATA_DIR}/{title}.csv'


# One DataFrame per CSV file, in the order listed in `titles`.
links = pd.read_csv(path_csv(titles[0]))
movies = pd.read_csv(path_csv(titles[1]))
ratings = pd.read_csv(path_csv(titles[2]))
tags = pd.read_csv(path_csv(titles[3]))

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the `ratings` columns
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


### 1.1 Dataframes and Features description

* `links.csv`: Identifiers that can be used to link to other sources of movie data. Each line of this file after the header row represents one movie
    * `imdbId` is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.

    * `tmdbId` is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.

* `ratings.csv`: Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

* `tags.csv`: Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.

* `Timestamps`: represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.



There is no missing data to impute, and no column needs to be converted to a more appropriate datatype.

## 2 Making Recommendations Based on Popularity
A popularity-based, non-personalised recommender system that takes as an input the ratings and movies datasets and outputs the “best” movies. How you define “best” is up to you. Those movies will appear as the top row of the WBSFLIX site.

In [4]:
# Per-movie average rating and number of ratings, computed in a single aggregation
popularity = ratings.groupby('movieId').agg(
    avg_rating=("rating", "mean"),
    rating_count=("rating", "count"),
)

In [5]:
# Highest average rating first. The top rows are movies with a single 5.0
# rating, which is why the average alone is a poor popularity measure.
popularity.sort_values('avg_rating', ascending=False).iloc[:2]

Unnamed: 0_level_0,avg_rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
88448,5.0,1
100556,5.0,1


In [6]:
# Most-rated movies first
popularity.sort_values('rating_count', ascending=False).iloc[:2]

Unnamed: 0_level_0,avg_rating,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,4.164134,329
318,4.429022,317


### 2.1 Introducing hybrid metrics

* Weighted average
$$ w_i = \frac{ c_i \cdot r_i}{\sum_j c_j} $$
where $w_i$ is the new hybrid measure, $c_i$ and $r_i$ are the rating count and the average rating of the $i$-th movie, and the sum in the denominator runs over all movies.

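* Linear combination: we assign different weights to the counts and the ratings (both min-max scaled to $[0, 1]$) and then sum them; in the code below $a$ is `weight_counts` and $b = 1 - a$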

$$ \ell_i = a c_i + b r_i$$

In [7]:
def weight_hybrid(n,df):
    """Return the n "heaviest" movies according to the weighted-average metric.

    The weight of a row is ``rating_count * avg_rating`` divided by the sum of
    ``rating_count`` over all rows of ``df``.

    Parameters
    ----------
    n : int
        Number of rows to return.
    df : pandas.DataFrame
        Frame with the columns ``rating_count`` and ``avg_rating``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra ``weight`` column, sorted by ``weight``
        in descending order and truncated to the first ``n`` rows. ``df``
        itself is not modified.
    """
    total_ratings = df['rating_count'].sum()
    weighted = df.assign(weight=(df['rating_count'] * df['avg_rating']) / total_ratings)
    return weighted.sort_values(by="weight", ascending=False).head(n)

#weight_hybrid(10,popularity)

In [8]:
def linear_hybrid(n, df, weight_counts):
    """Rank rows by a linear combination of scaled rating counts and ratings.

    Every column of ``df`` is min-max scaled to [0, 1]; a new column named
    ``f"lin. {weight_counts*100}%"`` is then added, holding

        weight_counts * rating_count + (1 - weight_counts) * avg_rating

    computed on the scaled values.

    Parameters
    ----------
    n : int
        Number of rows to return.
    df : pandas.DataFrame
        Frame with (at least) the columns ``rating_count`` and ``avg_rating``.
    weight_counts : float
        Weight given to the rating count, in [0, 1]; the average rating gets
        ``1 - weight_counts``.

    Returns
    -------
    pandas.DataFrame
        The scaled frame plus the combined-score column, sorted by that column
        in descending order and truncated to the first ``n`` rows.

    Raises
    ------
    ValueError
        If ``weight_counts`` is outside [0, 1].
    KeyError
        If ``df`` lacks ``rating_count`` or ``avg_rating``.
    """
    # Fail fast: previously an invalid weight only printed a message and the
    # function still returned a (meaningless) ranking.
    if not 0 <= weight_counts <= 1:
        raise ValueError("Weight must be in [0, 1]")

    # Check the inputs before scaling so the error names the real problem.
    missing = [col for col in ('rating_count', 'avg_rating') if col not in df.columns]
    if missing:
        raise KeyError(f"df is missing the required column(s): {missing}")

    #Scaling of the data
    from sklearn.preprocessing import MinMaxScaler
    scaled = MinMaxScaler().set_output(transform="pandas").fit_transform(df)

    # The column name embeds the weight as a percentage, e.g. 'lin. 70.0%';
    # callers select the score column by this exact name.
    col_name = f"lin. {weight_counts*100}%"
    scaled[col_name] = weight_counts * scaled['rating_count'] + (1 - weight_counts) * scaled['avg_rating']

    return scaled.sort_values(by=col_name, ascending=False).head(n)
#linear_hybrid(10,popularity, 0.7)

In [9]:
# Calling linear_hybrid() on the raw `ratings` frame fails with
# KeyError: 'rating_count': the function expects the aggregated columns
# (`avg_rating`, `rating_count`), which `ratings` does not have.
linear_hybrid(10,ratings,0.7)

KeyError: 'rating_count'

In [10]:
def fun_popularity(n, df, weight_counts):
    """Return the n most popular movies using the linear-combination metric.

    Unlike ``linear_hybrid()``, this function starts from the raw ratings: it
    first aggregates them per movie, then scales and combines the aggregates.

    Parameters
    ----------
    n : int
        Number of movies to return.
    df : pandas.DataFrame
        Raw ratings with (at least) the columns ``movieId`` and ``rating``.
    weight_counts : float
        Weight given to the rating count, in [0, 1]; the average rating gets
        ``1 - weight_counts``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``movieId`` with the min-max scaled ``avg_rating`` and
        ``rating_count`` columns plus the combined score in a column named
        ``f"lin. {weight_counts*100}%"``; sorted by that score in descending
        order and truncated to the first ``n`` rows.

    Raises
    ------
    ValueError
        If ``weight_counts`` is outside [0, 1].
    """
    # Same contract as linear_hybrid(): a weight outside [0, 1] would yield a
    # negative coefficient for one of the two terms.
    if not 0 <= weight_counts <= 1:
        raise ValueError("Weight must be in [0, 1]")

    # Average rating and rating count per movie, in one pass over the groups
    movie_stats = df.groupby('movieId').agg(
        avg_rating=("rating", "mean"),
        rating_count=("rating", "count"),
    )

    #Scaling of the data
    from sklearn.preprocessing import MinMaxScaler
    scaled = MinMaxScaler().set_output(transform="pandas").fit_transform(movie_stats)

    col_name = f"lin. {weight_counts*100}%"
    scaled[col_name] = weight_counts * scaled['rating_count'] + (1 - weight_counts) * scaled['avg_rating']
    return scaled.sort_values(by=col_name, ascending=False).head(n)


In [11]:
# Top 10 movies with 70% weight on the scaled rating count and 30% on the scaled average rating
fun_popularity(10,ratings,0.7)

Unnamed: 0_level_0,avg_rating,rating_count,lin. 70.0%
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,0.814252,1.0,0.944276
318,0.873116,0.963415,0.936325
296,0.821571,0.932927,0.89952
593,0.81362,0.847561,0.837379
2571,0.820544,0.844512,0.837322
260,0.829128,0.762195,0.782275
110,0.78481,0.719512,0.739102
480,0.722222,0.722561,0.722459
527,0.827778,0.667683,0.715711
2959,0.83843,0.661585,0.714639


## 3. Making Recommendations Based on Correlation

### 3.1 Item-based collaborative filtering

A similarity-based, semi-personalised recommender system that takes a movie as an input – when put into production, it will be a movie that the user has watched recently or rated highly, for now, it’s a manually inputted movie – and then outputs a list of movies that are “similar” to the one inputted based on rating correlations from the user-item matrix. Those movies will appear as the second row of the WBSFLIX site.

* Create a pivot table userId VS movieId for ratings
* Pick up one movieId and calculate the Similarity with the others
* Sort the data

In [None]:
# Users (rows) x movies (columns) matrix of ratings; NaN where a user did not rate a movie
ratings_pivot = ratings.pivot_table(index='userId', columns='movieId', values='rating')

#### 3.1.1 Similarities for a specific movie

Based on the previous analysis (linear method) we know that the most popular movie has `movieId=356` (Forrest Gump (1994)).
We calculate the correlations with the method `.corrwith()`.

In [None]:
# Column of user ratings for movieId 356
ratings_ForrestGump = ratings_pivot.loc[:, 356]
# Correlation of every movie's rating column with that column
similar_to_ForrestGump = ratings_pivot.corrwith(other=ratings_ForrestGump)

We get warnings due to the NaNs.

In [None]:
# Wrap the correlations in a one-column DataFrame
corr_ForrestGump = pd.DataFrame(similar_to_ForrestGump, columns = ['Pearson'])
len0 = len(corr_ForrestGump)
# Drop the movies whose correlation could not be computed (NaN)
corr_ForrestGump = corr_ForrestGump.dropna()
print(f'# of rows before and after dropping NaNs: {len0} -> {len(corr_ForrestGump)}\n\n')
# Random peek at the result (no seed is set, so the rows differ between runs)
corr_ForrestGump.sample(5)

In [None]:
# Build a frame indexed by movie with two columns: the popularity score and the Pearson correlation.
# Note: this uses the previously introduced function linear_hybrid().
mixed_ForrestGump = (
    linear_hybrid(len(popularity), popularity, 0.7)[['lin. 70.0%']]
    .join(corr_ForrestGump['Pearson'], how='left')
    .drop(356)  # drop Forrest Gump itself
)

# The 'lin. 70.0%' column ranges from 0 to ~1.
# Keep only the movies whose popularity score exceeds 0.7, then show the
# 10 most similar to Forrest Gump.
mixed_ForrestGump[mixed_ForrestGump['lin. 70.0%'] > 0.7].sort_values(by='Pearson', ascending=False).head(10)

#### 3.1.2 Similarities for a generic movie

In [None]:
#I first want a list of the movies a user has already seen, ordered by rating
#Then I'll make a suggestion based on this


def favourite_movies(user_id,n,ratings,movies):
    """Return the n movies that ``user_id`` rated highest.

    Parameters
    ----------
    user_id : int
        Value to look up in ``ratings['userId']``.
    n : int
        Maximum number of movies to return.
    ratings : pandas.DataFrame
        Frame with the columns ``userId``, ``movieId`` and ``rating``.
    movies : pandas.DataFrame
        Frame with the columns ``movieId`` and ``title``.

    Returns
    -------
    dict
        ``{movieId: title}``, ordered from the highest to the lowest rating.
        Empty when the user has no ratings.
    """
    # NOTE(review): the second condition tests `userId`, not `rating`, so it
    # only matters for non-positive user ids — confirm the intent.
    by_this_user = (ratings['userId'] == user_id) & (ratings['userId'] >0)
    top_rated = ratings.loc[by_this_user].sort_values(by="rating", ascending=False).head(n)

    # Title of the first row in `movies` that matches each movieId
    return {
        movie_id: movies.loc[movies['movieId'] == movie_id, 'title'].iloc[0]
        for movie_id in top_rated['movieId'].tolist()
    }
# Example: up to 10 of the movies rated highest by user 33
favourite_movies(33,10,ratings,movies)


In [None]:
# Title of the first row in `movies` whose movieId is 15
movies.loc[movies['movieId']==15,['title']].iloc[0,0]

In [None]:
# now we condense all the steps above in a unique function
# arguments: {movie_name: movie name, n:most similar n-movies}

def item_based_collaborative_filtering(movie_name,n):
    """Return the n popular movies whose ratings correlate best with ``movie_name``.

    Relies on the notebook-level ``movies``, ``ratings`` and ``popularity``
    frames and on ``linear_hybrid()``.

    Parameters
    ----------
    movie_name : str
        Exact value of ``movies['title']`` for the reference movie.
    n : int
        Number of similar movies to return.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``popularity``, with the columns ``'lin. 50.0%'``
        (popularity score) and ``'Pearson'`` (correlation with the reference
        movie), sorted by ``'Pearson'`` in descending order and truncated to
        the first ``n`` rows.
    """
    # Map the title to its movieId (first match)
    target_id = movies.loc[movies['title'] == movie_name,'movieId'].values[0]

    # Users x movies rating matrix
    user_movie = ratings.pivot_table(values='rating', index='userId', columns='movieId')

    # Correlation of every movie with the reference one; NaN correlations are discarded
    pearson = pd.DataFrame(
        user_movie.corrwith(user_movie[target_id]), columns = ['Pearson']
    ).dropna()

    # Popularity score (50% counts / 50% ratings) next to the correlation,
    # without the reference movie and without movies lacking a correlation
    candidates = (
        linear_hybrid(len(popularity), popularity, 0.5)[['lin. 50.0%']]
        .join(pearson['Pearson'], how='left')
        .drop(target_id)
        .dropna()
    )

    # Keep only movies whose popularity score exceeds 0.5, then take the n most similar
    popular_enough = candidates[candidates['lin. 50.0%'] > 0.5]
    return popular_enough.sort_values(by='Pearson', ascending=False).head(n)


# Example: up to 10 popular movies most correlated with "Father of the Bride Part II (1995)"
item_based_collaborative_filtering("Father of the Bride Part II (1995)",10)

In [None]:
# Index labels of up to 6 recommendations for "Layer Cake (2004)", as a plain list
item_based_collaborative_filtering("Layer Cake (2004)",6).index.to_list()

### 3.2 User-based collaborative filtering

To create a user-based collaborative recommender we are going to go through a very similar process as we did with the item-based recommender. This time though we’re going to calculate the cosine similarity between users, instead of between movies.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Users x movies rating matrix; movies a user has not rated are stored as 0
user_item = ratings.pivot_table(values='rating', index='userId', columns='movieId').fillna(0)

# Pairwise cosine similarity between users, labelled by userId on both axes
cos_sim = pd.DataFrame(
    data=cosine_similarity(user_item),
    index=user_item.index,
    columns=user_item.index,
)

Let us now focus on one user, `uID=300`.

The goal is to estimate the ratings for the movies that `uID` did not rate. So, first of all, we have to identify the movies with rating = 0 (the NaNs were replaced with zeros above). As a result we get an array of `movieId` which we call missing_movies.

For any movie in missing_movies we calculate the estimated rating $r_{\text{u}_\text{ID}}$ as

$$r_{\text{u}_\text{ID}}= \sum_{i\neq\text{u}_\text{ID}
} w_i r_i$$

where $r_i$ is the true rating given by the $i$-th other user and $w_i$ is the similarity weight, defined as

$$w_i = \frac{c_i}{\sum_{j\neq\text{u}_\text{ID}}c_j} $$

where $c_i$ is the cosine similarity between the $i$-th user and $\text{u}_\text{ID}$, and $w_i$ is its normalised weight.

In [None]:
uID = 300

# Ratings given by every other user to the movies that uID has not rated (stored as 0)
unseen_rating_uID = user_item.loc[user_item.index != uID, user_item.loc[uID] == 0]

# Similarity of each other user to uID, normalised so that the weights sum to 1
weights_uID = cos_sim.loc[cos_sim.index != uID, uID]
weights_uID = weights_uID / sum(weights_uID)

# Predicted rating = similarity-weighted sum of the other users' ratings (dot product)
predicted_uID = (
    pd.DataFrame(unseen_rating_uID.T.dot(weights_uID), columns = ["predicted_rate"])
    .sort_values(by="predicted_rate", ascending=False)
)

In [None]:
# Attach the movie metadata to each prediction, then show the 5 unrated movies with the highest predicted rating
recommendations = pd.merge(predicted_uID, movies, left_index=True, right_on="movieId")
recommendations.sort_values(by="predicted_rate", ascending=False).iloc[:5]

#### 3.2.1 The function

In [None]:
def special_for_you(uID,n):
    """Return the n movies with the highest predicted rating for user ``uID``.

    Relies on the notebook-level ``user_item`` (ratings matrix with 0 for
    "not rated"), ``cos_sim`` (user-user cosine similarities) and ``movies``.

    Parameters
    ----------
    uID : int
        Label of the user in the index of ``user_item`` / ``cos_sim``.
    n : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The columns of ``movies`` plus ``predicted_rate``, sorted by
        ``predicted_rate`` in descending order and truncated to the first
        ``n`` rows.
    """
    # Ratings given by the other users to the movies that uID has not rated
    other_users = user_item.index != uID
    not_rated = user_item.loc[uID, :] == 0
    others_ratings = user_item.loc[other_users, not_rated]

    # Similarities to uID, normalised so that they sum to 1
    similarity = cos_sim.loc[cos_sim.index != uID, uID]
    weights = similarity / sum(similarity)

    # Predicted rating = similarity-weighted sum of the other users' ratings
    predicted = pd.DataFrame(
        others_ratings.T.dot(weights), columns = ["predicted_rate"]
    ).sort_values(by="predicted_rate", ascending=False)

    # Attach the movie metadata, then keep the n best predictions
    recommendations = predicted.merge(movies, left_index=True, right_on="movieId")
    return recommendations.sort_values("predicted_rate", ascending=False).head(n)

In [None]:
# Second column (position 1) of the top-5 recommendations for user 47, as a plain list.
# NOTE(review): the column is selected by position — confirm which column this is
# and prefer selecting it by name.
special_for_you(47,5).iloc[:,1].tolist()

## 4. Function for scraping images

In [None]:
def _figure_from_image_url(url):
    """Download the image at ``url``, resize it to ``image_size`` and draw it on a new figure without axes."""
    response = requests.get(url)
    image = Image.open(BytesIO(response.content)).resize(image_size)

    fig, ax = plt.subplots()
    ax.imshow(image)
    ax.axis('off')
    return fig


def movie_image(movie_id,links):
    """Return a matplotlib figure showing the first TMDB backdrop of a movie.

    NOTE(review): ``requests``, ``Image``, ``BytesIO``, ``plt``, ``api_key``
    and ``image_size`` are not defined in the visible part of the notebook;
    they must exist in the namespace before this function is called.

    Parameters
    ----------
    movie_id : int
        Value to look up in ``links["movieId"]``.
    links : pandas.DataFrame
        Frame with a ``movieId`` column; the id sent to the API is read from
        the third column (position 2) of the first matching row.

    Returns
    -------
    matplotlib.figure.Figure
        The backdrop image, or a "Image Not Found" placeholder when anything
        in the lookup, download or rendering fails.
    """
    try:
        # NOTE(review): position 2 is presumably the `tmdbId` column — confirm.
        tmdb_id = int(links.loc[links["movieId"] == movie_id].iloc[0, 2])

        # Ask the API for the images of the movie
        endpoint = f'https://api.themoviedb.org/3/movie/{tmdb_id}/images?api_key={api_key}'
        data = requests.get(endpoint).json()

        # Path of the first backdrop, appended to the image base URL
        file_path = data['backdrops'][0]['file_path']
        return _figure_from_image_url(f'https://image.tmdb.org/t/p/original{file_path}')

    except Exception:
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit still propagate; any other failure falls back to a placeholder.
        placeholder_url = f'https://via.placeholder.com/{image_size[0]}x{image_size[1]}?text=Image+Not+Found'
        return _figure_from_image_url(placeholder_url)
    

In [None]:
# FAQ dictionary: question -> canned answer
faq_dict = {
    "What is your name?": "I am ChatBot, your personal movie recommender!",
    "Tell me a joke": "Sure, here you go: Why don't scientists trust atoms? Because they make up everything!"
}

def chat_bot():
    """Run one interactive session: answer a FAQ or print movie recommendations.

    Reads the answers from ``input()`` and writes to stdout. If the first
    answer matches a question in ``faq_dict`` (ignoring case and surrounding
    whitespace) the canned answer is printed and the session ends. Otherwise
    the user is asked for a user ID and a number of recommendations, and the
    titles returned by ``recommend_movies`` are printed.

    Returns
    -------
    None
    """
    print("Hi! I'm your personal recommender. How can I assist you today?")
    user_input = input().strip().lower()

    # The input is lower-cased while the keys of faq_dict are capitalised, so
    # the lookup must go through lower-cased keys or it can never match.
    faq_answers = {question.lower(): answer for question, answer in faq_dict.items()}
    if user_input in faq_answers:
        print(faq_answers[user_input])
        return

    print("Tell me your userID.")
    try:
        user_id = int(input())
    except ValueError:
        print("Please enter a valid user ID.")
        return

    print("How many recommendations do you want to get?")
    try:
        n = int(input())
    except ValueError:
        print("Please enter a valid number of recommendations.")
        return

    print("Here are the recommendations:")
    # NOTE(review): `recommend_movies` is not defined in the visible part of the
    # notebook; `special_for_you(uID, n)` has a matching signature and returns a
    # frame with a 'title' column — confirm which function is intended.
    recommendations = recommend_movies(user_id, n)

    if recommendations.empty:
        print("No recommendations found.")
        return

    print(recommendations['title'])

# Start an interactive session (waits on input() for the user's answers)
chat_bot()


In [None]:
# Display the FAQ mapping
faq_dict