In [44]:
import pandas as pd
import numpy as np

## Data cleaning

In [45]:
def standardize_title_and_year(title):
    """
    Standardize movie title and extract year.
    Input example: "Matrix, The (1999)" or "The Matrix (1999)" or "Matrix"
    Returns: (standardized_title, year)
    """
    # Extract year if present
    year = None
    clean_title = title
    if '(' in title and ')' in title:
        year_part = title[title.rfind('(')+1:title.rfind(')')]
        if year_part.isdigit():
            year = int(year_part)
        clean_title = title.split('(')[0].strip()
    
    # Handle ", The/A/An" format
    lower_title = clean_title.lower()
    if ', the' in lower_title:
        clean_title = 'The ' + clean_title.split(',')[0]
    elif ', a' in lower_title:
        clean_title = 'A ' + clean_title.split(',')[0]
    elif ', an' in lower_title:
        clean_title = 'An ' + clean_title.split(',')[0]
    
    return clean_title.strip(), year

In [46]:
movies = pd.read_csv('archive/ml-latest/movies.csv')
ratings = pd.read_csv('archive/ml-latest/ratings.csv')

In [47]:
# calculate average rating per movie and filter out movies with no ratings
ratings = ratings.drop(['timestamp', 'userId'], axis=1)
avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
movies = movies[movies["movieId"].isin(avg_ratings["movieId"])]
movies.reset_index(drop=True, inplace=True)
movies["rating"] = avg_ratings["rating"]

In [48]:

# Apply the title standardization 
standardized_data = movies['title'].apply(standardize_title_and_year)
movies['standardized_title'] = standardized_data.apply(lambda x: x[0])
movies['year'] = standardized_data.apply(lambda x: x[1])

# Print examples to verify
print("\nExample transformations:")
print(movies[['title', 'standardized_title', 'year']].head(10))

# Print some statistics
print("\nStatistics:")
print(f"Total movies: {len(movies)}")
print(f"Movies with extracted year: {movies['year'].notna().sum()}")
print(f"Movies without year: {movies['year'].isna().sum()}")


Example transformations:
                                title           standardized_title    year
0                    Toy Story (1995)                    Toy Story  1995.0
1                      Jumanji (1995)                      Jumanji  1995.0
2             Grumpier Old Men (1995)             Grumpier Old Men  1995.0
3            Waiting to Exhale (1995)            Waiting to Exhale  1995.0
4  Father of the Bride Part II (1995)  Father of the Bride Part II  1995.0
5                         Heat (1995)                         Heat  1995.0
6                      Sabrina (1995)                      Sabrina  1995.0
7                 Tom and Huck (1995)                 Tom and Huck  1995.0
8                 Sudden Death (1995)                 Sudden Death  1995.0
9                    GoldenEye (1995)                    GoldenEye  1995.0

Statistics:
Total movies: 83239
Movies with extracted year: 82660
Movies without year: 579


In [49]:
# Filter out movies with no genres or no year
movies = movies[~movies["movieId"].isin(movies[(movies["genres"] == "(no genres listed)") | (movies["year"].isna())].movieId)]

In [50]:
# Replace empty strings with NaN in all columns of the movies dataframe
movies = movies.replace(r'^\s*$', pd.NA, regex=True)

In [51]:
# Check for NaN values in standardized_title column
print("Number of NaN values in standardized_title:", movies['standardized_title'].isna().sum())

# Display rows with NaN standardized_title if any exist
nan_titles = movies[movies['standardized_title'].isna()]
print("\nRows with NaN standardized_title:")
nan_titles


Number of NaN values in standardized_title: 14

Rows with NaN standardized_title:


Unnamed: 0,movieId,title,genres,rating,standardized_title,year
13494,69757,(500) Days of Summer (2009),Comedy|Drama|Romance,3.735749,,2009.0
15284,80729,(Untitled) (2009),Comedy|Drama,3.441176,,2009.0
22401,115263,(A)sexual (2011),Documentary,3.025,,2011.0
33723,145733,(The New) War of the Buttons (2011),Adventure|Children,2.933333,,2011.0
34255,147033,(T)ERROR (2015),Documentary|Thriller,3.522727,,2015.0
39377,160010,(Dis)Honesty: The Truth About Lies (2015),Documentary,3.757576,,2015.0
54446,193219,(Girl)Friend (2018),Comedy|Romance,2.5,,2018.0
62103,210479,(2019),Drama|Thriller,2.675676,,2019.0
62710,211946,(UN)Ideal Man (2020),Comedy|Romance|Sci-Fi,2.714286,,2020.0
64259,215643,(OO) (2017),Animation,3.25,,2017.0


In [52]:
# Manually set standardized titles for selected nan entries
movies.loc[movies['movieId'] == 69757, 'standardized_title'] = "500 Days of Summer"
movies.loc[movies['movieId'] == 80729, 'standardized_title'] = "Untitled"
movies.loc[movies['movieId'] == 115263, 'standardized_title'] = "Asexual" 
movies.loc[movies['movieId'] == 145733, 'standardized_title'] = "The New War of the Buttons"
movies.loc[movies['movieId'] == 147033, 'standardized_title'] = "Terror"
movies.loc[movies['movieId'] == 160010, 'standardized_title'] = "Dishonesty The Truth About Lies"
movies.loc[movies['movieId'] == 193219, 'standardized_title'] = "Girlfriend"
movies.loc[movies['movieId'] == 208553, 'standardized_title'] = "Escape"
movies.loc[movies['movieId'] == 230315, 'standardized_title'] = "Nieznajomi"
movies.loc[movies['movieId'] == 211946, 'standardized_title'] = "Unideal Man"
movies.loc[movies['movieId'] == 215643, 'standardized_title'] = "OO"
movies.loc[movies['movieId'] == 234516, 'standardized_title'] = "My Truth: The Rape of 2 Coreys"
movies.loc[movies['movieId'] == 250664, 'standardized_title'] = "Blooper Bunny!"

# Verify the changes
print("Number of remaining NaN values:", movies['standardized_title'].isna().sum())


Number of remaining NaN values: 2


In [53]:
movies[movies["standardized_title"].isna()]

Unnamed: 0,movieId,title,genres,rating,standardized_title,year
62103,210479,(2019),Drama|Thriller,2.675676,,2019.0
79806,278484,() (2003),Documentary,1.0,,2003.0


In [54]:
# remove 2 last nan entries
movies = movies[~movies["standardized_title"].isna()]

In [56]:
movies.to_csv("data/cleaned.csv", index=False)

In [59]:
movies

Unnamed: 0,movieId,title,genres,rating,standardized_title,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893508,Toy Story,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.278179,Jumanji,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.171271,Grumpier Old Men,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.868395,Waiting to Exhale,1995.0
4,5,Father of the Bride Part II (1995),Comedy,3.076957,Father of the Bride Part II,1995.0
...,...,...,...,...,...,...
83234,288967,State of Siege: Temple Attack (2021),Action|Drama,3.500000,State of Siege: Temple Attack,2021.0
83235,288971,Ouija Japan (2021),Action|Horror,0.500000,Ouija Japan,2021.0
83236,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary,4.000000,The Men Who Made the Movies: Howard Hawks,1973.0
83237,288977,Skinford: Death Sentence (2023),Crime|Thriller,3.000000,Skinford: Death Sentence,2023.0


## Calculate genre one-hot matrix

In [60]:
# split genres string into list and create one-hot encoding
genres_list = movies["genres"].str.split("|")
num_movies = movies.shape[0]

unique_genres = set([genre for genres in genres_list for genre in genres])
genre_matrix = np.zeros((num_movies, len(unique_genres)))

for i, genres in enumerate(genres_list):
    for genre in genres:
        genre_matrix[i, list(unique_genres).index(genre)] = 1
        
genre_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(76493, 19))

In [61]:
np.save("genre_matrix_full.npy", genre_matrix)

In [8]:
import requests

# Make request to the recommender API
response = requests.post(
    "http://localhost:8000/recommend",
    json={
        "movie_title": "The Matrix",
        "year": 1999
    }
)

# Print the response
print(response.json())


{'recommendations': [{'title': 'Tad the Lost Explorer and the Curse of the Mummy (2022)', 'genres': 'Adventure|Animation|Children|Comedy|Fantasy', 'rating': 1.1, 'standardized_title': 'Tad the Lost Explorer and the Curse of the Mummy', 'year': 2022.0}, {'title': 'Who Framed Roger Rabbit? (1988)', 'genres': 'Adventure|Animation|Children|Comedy|Crime|Fantasy|Mystery', 'rating': 3.543433357118714, 'standardized_title': 'Who Framed Roger Rabbit?', 'year': 1988.0}, {'title': 'Legends of Valhalla: Thor (2011)', 'genres': 'Adventure|Animation|Children|Comedy|Fantasy', 'rating': 4.5, 'standardized_title': 'Legends of Valhalla: Thor', 'year': 2011.0}], 'query': {'title': 'Matrix, The (1999)', 'rating': 4.160630884770588, 'year': 1999}}
