## Dataset analysis and pre-processing

In [None]:
import pandas as pd

# Dataset loading
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is where the CSV exists at exactly this location.
file_path = "C:/Users/beltr/OneDrive/Desktop/DDSE_PROJECT/MoviesRecommender/data/filmtv_movies.csv"
# Read the CSV into the DataFrame that every later cell works on.
dataset = pd.read_csv(file_path)

### Dataset analysis

In [None]:
# First dataset rows inspection: the cell displays the first rows returned by
# DataFrame.head() (five rows by default).
dataset.head()

In [None]:
# Columns and data type check: info() prints each column's name, non-null
# count and dtype.
dataset.info()

In [None]:
# Unique values count: number of distinct values per column (axis=0),
# with missing values left out of the count (dropna=True).
dataset.nunique(axis=0, dropna=True)

### Missing values management

In [None]:
# Missing values count: isnull() flags every missing cell and sum() adds the
# flags up column by column.
dataset.isnull().sum()

Since the columns 'genre', 'country' and 'directors' have very few missing values, I drop rows where at least one of these columns has a null value.

In [None]:
# Rows must have a value in each of these columns to be kept.
required_columns = ['genre', 'country', 'directors']
dataset = dataset.dropna(subset=required_columns)

# Re-check the structure and the remaining missing values after the drop.
dataset.info()
dataset.isnull().sum()

### Duplicates check
I searched the dataset for duplicate rows. As shown below, the dataset contains no duplicates.

In [None]:
# Count the rows that are exact copies of an earlier row.
duplicate_row_count = dataset.duplicated().sum()
print(duplicate_row_count)

### 'notes' column drop
Since the 'notes' column does not add relevant information about the movies and has more than 20k missing values, I decided to drop it.

In [None]:
# Remove the 'notes' column, then show the remaining column names.
dataset = dataset.drop(columns='notes')
dataset.columns

### Data type conversion

In [None]:
# Convert the selected columns to pandas' dedicated string dtype.
# `astype(str)` would turn every missing value into the literal text "nan",
# which later `dropna()` / `isnull()` calls cannot detect (so "nan" would be
# counted as an actor name, for example). The nullable "string" dtype keeps
# missing values as real missing values (<NA>).
text_columns = ['title', 'actors', 'directors', 'description', 'country']
for text_column in text_columns:
    dataset[text_column] = dataset[text_column].astype('string')

dataset.info()

### Votes analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Get summary statistics of vote counts (count, mean, std, min, quartiles, max).
# The 75th percentile shown here is the basis for `min_num_of_votes` later on.
dataset['total_votes'].describe()

In [None]:
# Histogram of the vote counts (50 bins) with a KDE curve on top; the count
# axis is logarithmic.
ax = sns.histplot(dataset['total_votes'], bins=50, kde=True)
ax.set_title("Distribution of Number of Votes")
ax.set_xlabel("Number of Votes")
ax.set_ylabel("Frequency")
ax.set_yscale('log')
plt.show()

In [None]:
# Same histogram of the vote counts, without the KDE curve and in blue.
ax = sns.histplot(dataset['total_votes'], bins=50, kde=False, color='blue')
ax.set_title('Distribution of Total Votes')
ax.set_xlabel('Total Votes')
ax.set_ylabel('Number of Movies')
ax.set_yscale('log')  # Log scale on the count axis to keep small bins visible
plt.show()

In [None]:
# Scatter plot of each movie's average rating against its number of votes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=dataset, x='total_votes', y='avg_vote', alpha=0.5, color='blue', ax=ax)
ax.set_title('Number of Votes vs Average Rating')
ax.set_xlabel('Number of Votes')
ax.set_ylabel('Average Rating')
ax.set_xscale('log')  # Log scale on the vote axis to spread out the low counts
ax.grid(True)
plt.show()

In [None]:
# How many movies share each distinct 'total_votes' value, ordered by vote count
votes_count = dataset['total_votes'].value_counts().sort_index()

# Bar chart of those counts, restricted to the 0-100 votes range
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(votes_count.index, votes_count.values, color='blue', width=1.0)
ax.set_title('Number of Movies for Each Number of Votes')
ax.set_xlabel('Number of Votes')
ax.set_ylabel('Number of Movies')
ax.set_xlim(0, 100)  # Only the low vote counts are shown
ax.set_yscale('log')  # Log scale on the count axis
plt.show()

### Movies ratings
For each movie, this dataset provides:
- average of all the votes;
- public votes;
- critics votes;
- number of all the votes received.

Since I want the final rating of a movie to be a single number and 'public_vote' and 'critics_vote' columns have many missing values, I decided to drop these columns and to base movies ratings on:
- the average of all the votes received by the movie;
- the number of all the votes received by the movie.

The 'avg_vote' and 'total_votes' columns have no missing values, which further encouraged me to base the "weighted ratings" on these two columns only.

In [None]:
# Columns holding the separate public / critics votes, which are not used
# for the final rating.
partial_vote_columns = ['public_vote', 'critics_vote']
dataset = dataset.drop(columns=partial_vote_columns)
dataset.columns

As I stated before, the final rating column will represent a weighted rating based on the average and the number of votes received by each movie.

The weighted rating gives more weight to movies that received a high number of votes, so the most reliable ratings are the ones backed by many votes. Instead, a movie with a low number of votes gets a rating closer to the average of all the votes in the dataset than to its own average vote. This means that movies with few votes count less, since their averages are not as reliable.

This rating system computes its final ratings using two parameters/constants:
- `global_avg`: the global average of 'avg_vote' across all the movies in the dataset;
- `min_num_of_votes`: a constant representing the minimum number of votes a movie must have for its 'avg_vote' to be considered "reliable".

With $v$ = 'total_votes', $R$ = 'avg_vote', $m$ = `min_num_of_votes` and $G$ = `global_avg`, the weighted rating is $\frac{v}{v+m} R + \frac{m}{v+m} G$.

I decided to set `min_num_of_votes` to 36 because 75% of the movies in the dataset have 36 or fewer votes (see the "Votes analysis" section and the "Distribution of Weighted Ratings for Different `min_num_of_votes` Values" plot).

In [None]:
# Mean of 'avg_vote' over all movies: the value that ratings of movies with
# few votes are pulled towards.
global_avg = dataset['avg_vote'].mean()
print("Global average: ", global_avg)

# Vote count at which a movie's own average and the global average weigh the
# same in the weighted rating.
min_num_of_votes = 36

# Weighted rating definition
def weighted_rating(x, m=min_num_of_votes, G=global_avg):
    """
    Blend a movie's own average vote with the global average.

    Parameters:
        x: Object indexable by 'total_votes' and 'avg_vote' (e.g. a dataset row).
        m: Vote count at which both averages get the same weight. The default
           is the value `min_num_of_votes` had when this function was defined.
        G: Global average vote. The default is the value `global_avg` had when
           this function was defined.

    Returns:
        v / (v + m) * R + m / (v + m) * G, with v = x['total_votes'] and
        R = x['avg_vote'].
    """
    votes = x['total_votes']
    own_average = x['avg_vote']

    # Both weights share the same denominator and add up to 1.
    total_weight = votes + m
    own_share = votes / total_weight
    global_share = m / total_weight
    return own_share * own_average + global_share * G

# Weighted rating application to the dataset.
# `weighted_rating` only does arithmetic on the 'total_votes' and 'avg_vote'
# entries of its argument, so it is given the whole DataFrame: the rating is
# computed on full columns at once instead of calling the function once per
# row through `apply(axis=1)`.
dataset['weighted_rating'] = weighted_rating(dataset)

In [None]:
# Show the column names; 'weighted_rating' should now be listed.
dataset.columns

In [None]:
# Columns printed for every ranking below
ranking_columns = ['title', 'avg_vote', 'total_votes', 'weighted_rating']

# 10 movies with the highest weighted rating
top_movies_weighted_discending = dataset.sort_values(by='weighted_rating', ascending=False)
print("Discending ratings based on weights:")
print(top_movies_weighted_discending[ranking_columns].head(10))

# 10 movies with the lowest weighted rating
top_movies_weighted_ascending = dataset.sort_values(by='weighted_rating')
print("Ascending ratings based on weights:")
print(top_movies_weighted_ascending[ranking_columns].head(10))

# 10 movies with the highest plain average vote
top_movies_avg_discenging = dataset.sort_values(by='avg_vote', ascending=False)
print("Discending ratings based on avg:")
print(top_movies_avg_discenging[ranking_columns].head(10))

# 10 movies with the lowest plain average vote
top_movies_avg_ascending = dataset.sort_values(by='avg_vote')
print("Ascending ratings based on avg:")
print(top_movies_avg_ascending[ranking_columns].head(10))

In [None]:
import numpy as np

# Test different values of m (e.g., median, 75th percentile, 90th percentile)
percentiles = [25, 50, 75, 90]  # Percentiles to test
m_values = [np.percentile(dataset['total_votes'], p) for p in percentiles]

# One temporary column per candidate m. The names are built once and reused for
# plotting and for the clean-up at the end, so nothing depends on the concrete
# percentile values of the current data.
ratings_columns = [f'weighted_rating_m_{m_value}' for m_value in m_values]
for ratings_column, m_value in zip(ratings_columns, m_values):
    # weighted_rating only does arithmetic on the two vote columns, so it can
    # be evaluated on the whole DataFrame at once.
    dataset[ratings_column] = weighted_rating(dataset, m=m_value)

# Plot the distribution of weighted ratings for different m values
plt.figure(figsize=(12, 6))

# Use a boxplot to compare the distribution of weighted ratings for each m value
# NOTE(review): Matplotlib 3.9 renamed boxplot's `labels` to `tick_labels`;
# `labels` is kept here so the cell still runs on older Matplotlib versions.
plt.boxplot([dataset[col] for col in ratings_columns], labels=[f'm = {m_value}' for m_value in m_values])

plt.title('Distribution of Weighted Ratings for Different min_num_of_votes Values (m)')
plt.ylabel('Weighted Rating')
plt.xlabel('m Value (Percentile)')
plt.show()


# Boxplot of the raw average votes, for comparison with the weighted ratings
plt.figure(figsize=(12, 6))
plt.boxplot(dataset['avg_vote'])

plt.title('Distribution of Average Votes (avg_vote)')
plt.xlabel('Average Vote')
plt.grid(True)
plt.show()


# Remove the temporary comparison columns again (exactly the ones created above)
dataset = dataset.drop(columns=ratings_columns)
dataset.columns

"Distribution of Weighted Ratings for Different `min_num_of_votes` Values" plot comments considering also results :
- `min_num_of_votes = 5`: ratings rely too much on movies with few votes (so the weighted ratings are not very reliable);
- `min_num_of_votes = 12`: a more balanced solution. Weighted ratings rely on movies with a higher number of votes;
- `min_num_of_votes = 36`: another balanced solution. Weighted ratings rely on movies with a higher number of votes and are well spread;
- `min_num_of_votes = 97`: excessive, considering that 75% of the movies have 36 or fewer votes.

### Duration analysis

In [None]:
# How many movies share each distinct 'duration' value, ordered by duration.
# NOTE(review): this reuses (and overwrites) the `votes_count` name from the
# votes analysis, although it now holds duration counts.
votes_count = dataset['duration'].value_counts().sort_index()

# Bar chart of those counts, restricted to durations between 0 and 250
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(votes_count.index, votes_count.values, color='red', width=1.0)
ax.set_title('Number of Movies for Each Duration')
ax.set_xlabel('Duration')
ax.set_ylabel('Number of Movies')
ax.set_xlim(0, 250)  # Only durations up to 250 are shown
ax.set_yscale('log')  # Log scale on the count axis
plt.show()

In [None]:
# Summary statistics of the durations, followed by the 90th percentile
duration_values = dataset['duration']
print(duration_values.describe())

print(duration_values.quantile(0.90))

### Correlation analysis

In [None]:
# Keep only the float64 / int64 columns
numeric_data = dataset.select_dtypes(include=['float64', 'int64'])
numerical_columns = numeric_data.columns

# Pairwise correlation between those columns
correlation_matrix = numeric_data.corr()
print(correlation_matrix)

# Annotated heatmap of the same matrix
ax = sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
ax.set_title("Correlation Matrix")
plt.show()

## Dataset filtering

### Actors and director count (and extraction)
I want to extract from the "actors" and "directors" columns the number of actors and directors in the dataset.
Since these columns contain multiple names for each movie, I have to extract single names.

In [None]:
# 'directors' column without missing entries
directors = dataset['directors'].dropna()

# One entry per individual name: split on commas, flatten, trim whitespace
director_list = (
    directors.str.split(",")
    .explode()
    .str.strip()
)

# Distinct director names and how many there are
unique_directors = director_list.unique()
number_of_directors = len(unique_directors)
print("Number of directors in the dataset: ", number_of_directors)

# Print the list of unique directors
# for index, director in enumerate(unique_directors):
#     print(index, director)

In [None]:
# 'actors' column without missing entries
actors = dataset['actors'].dropna()

# One entry per individual name: split on commas, flatten, trim whitespace
actor_list = (
    actors.str.split(",")
    .explode()
    .str.strip()
)

# Distinct actor names and how many there are
unique_actors = actor_list.unique()
number_of_actors = len(unique_actors)
print("Number of actors in the dataset: ", number_of_actors)

# Print the list of unique actors
# for index, actor in enumerate(unique_actors):
#     print(index, actor)

In [None]:
# 'country' column without missing entries
countries = dataset['country'].dropna()

# One entry per individual country: split on commas, flatten, trim whitespace
country_list = (
    countries.str.split(",")
    .explode()
    .str.strip()
)

# Distinct countries and how many there are
unique_countries = country_list.unique()
number_of_countries = len(unique_countries)
print("Number of countries in the dataset: ", number_of_countries)

# Print the list of unique countries
for index, country in enumerate(unique_countries):
    print(index, country)

### Movies classification with respect to duration

In [None]:
# Durations with missing values removed
durata = dataset['duration'].dropna()

# Number of movies whose duration is below 1800
count = int((durata < 1800).sum())
print(count)

# Bucket the durations into (0, 60], (60, 120], (120, 180] and (180, inf).
# Rows whose duration falls outside every bin (or is missing) get no category.
duration_bins = [0, 60, 120, 180, float('inf')]
duration_labels = ['Short', 'Medium', 'Long', 'Epic']
dataset['duration_category'] = pd.cut(durata, bins=duration_bins, labels=duration_labels)

print(dataset.head())
dataset.columns

### Filtering method

In [None]:
def filter_movies(dataset, genre=None, max_duration=None, actors=None, directors=None, start_year=None, end_year=None):
    """
    Filters the movies dataset based on user inputs.

    Every filter is optional; the ones that are given are combined with AND.
    Text filters (genre, actors, directors) are case-insensitive, literal
    substring matches: characters such as "+", "(" or "?" in the search text
    are matched as-is and never interpreted as a regular expression. Rows
    with a missing value in a filtered text column never match.

    Parameters:
        dataset (DataFrame): The dataset of movies.
        genre (str): Filter movies by genre.
        max_duration (int): Maximum duration of movies (inclusive).
        actors (str): Filter movies by actor name (partial or full match).
        directors (str): Filter movies by director name (partial or full match).
        start_year (int): Start year for filtering movies (inclusive).
        end_year (int): End year for filtering movies (inclusive).

    Returns:
        DataFrame: Filtered copy of the dataset; the input is never modified.
    """
    filtered = dataset.copy()

    # Text filters: an empty string or None means "no filter on this column".
    text_filters = {'genre': genre, 'actors': actors, 'directors': directors}
    for column, search_text in text_filters.items():
        if search_text:
            # regex=False: user input is plain text, not a pattern.
            matches = filtered[column].str.contains(search_text, case=False, na=False, regex=False)
            filtered = filtered[matches]

    # Numeric filters use `is not None` so that 0 is still a valid bound.
    if max_duration is not None:
        filtered = filtered[filtered['duration'] <= max_duration]
    if start_year is not None:
        filtered = filtered[filtered['year'] >= start_year]
    if end_year is not None:
        filtered = filtered[filtered['year'] <= end_year]

    return filtered

In [None]:
# Show the column names available for filtering and sorting.
dataset.columns

In [None]:
# Example query: movies whose 'actors' text contains "Jack Nicholson", with a
# year between 1980 and 2010 and a duration of at most 120.
selection = filter_movies(dataset, actors="Jack Nicholson", start_year=1980, end_year=2010, max_duration=120)
# Display the matches, best weighted rating first (`selection` itself is not reordered).
selection.sort_values(by='weighted_rating', ascending=False)

In [None]:
def search_movie(dataset, title=None):
    """
    Search movies by title.

    The match is a case-insensitive, literal substring match: characters such
    as "?", "(" or "+" in `title` are matched as-is and never interpreted as a
    regular expression. Rows with a missing title never match.

    Parameters:
        dataset (DataFrame): The dataset of movies.
        title (str): Text to look for in the 'title' column. When empty or
            None, no filtering is applied.

    Returns:
        DataFrame: Copy of the dataset restricted to the matching rows.
    """
    filtered = dataset.copy()
    if title:
        # regex=False: the search text is plain text, not a pattern.
        filtered = filtered[filtered['title'].str.contains(title, case=False, na=False, regex=False)]
    return filtered

In [None]:
# Example search: every movie whose title contains "The Shining"
# (case-insensitive); the cell displays the resulting rows.
movie = search_movie(dataset, title="The Shining")
movie

## Pre-processed dataset saving

In [None]:
# Save the dataset in the 'data' folder
# NOTE(review): absolute, machine-specific path, like the input path at the
# top of the notebook.
output_path = "C:/Users/beltr/OneDrive/Desktop/DDSE_PROJECT/MoviesRecommender/data/preprocessed_filmtv_movies.csv"
# The file contains every column currently in `dataset`, including the derived
# 'weighted_rating' and 'duration_category' columns.
dataset.to_csv(output_path, index=False) # index=False: row indexes are not saved in the file