# All dependencies

In [None]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

## Reading data

In [1]:
import pandas as pd
# Load the MovieLens 25M tables: the movie catalogue and the user ratings.
movies = pd.read_csv('datasets/ml-25m/movies.csv')
ratings = pd.read_csv('datasets/ml-25m/ratings.csv')
# Preview the first five catalogue rows.
print(movies.head(5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


## Preprocessing data

In [2]:
# Preprocessing (Py3.11 compatible)
# Convert pipe-delimited genre strings to lists. Only raw strings are split, so
# re-running this cell leaves already-split lists untouched instead of turning
# them into NaN (which is what .str.split does to non-string values).
movies['genres'] = movies['genres'].map(
    lambda g: g.split('|') if isinstance(g, str) else g
)
# Attach the genre list to every rating exactly once; merging a second time
# would produce genres_x / genres_y columns instead of a single 'genres'.
if 'genres' not in ratings.columns:
    ratings = ratings.merge(movies[['movieId', 'genres']], on='movieId')

# Keep only active users (min 50 ratings). Movies are not filtered here.
min_ratings = 50
# Count ratings per user once and reuse the result for both the mask and the index.
ratings_per_user = ratings['userId'].value_counts()
active_users = ratings_per_user[ratings_per_user >= min_ratings].index
ratings_filtered = ratings[ratings['userId'].isin(active_users)]

## Model Training (Collaborative Filtering)

In [5]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split 

# Wrap the filtered ratings in a Surprise dataset; the reader declares a 0.5-5.0 rating scale.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(
    ratings_filtered.loc[:, ['userId', 'movieId', 'rating']],
    reader,
)

# Hold out 20% of the ratings for evaluation (fixed seed for a repeatable split).
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Fit an SVD model with 100 latent factors for 20 epochs.
model = SVD(
    n_factors=100,
    n_epochs=20,
    random_state=42,
)
model.fit(trainset)

# Score the held-out ratings and report the RMSE.
from surprise import accuracy
predictions = model.test(testset)
accuracy.rmse(predictions)  # Target RMSE < 0.90

RMSE: 0.7651


0.7651408890599997

In [12]:
import pickle
# Save the trained model. The context manager closes the file handle even if
# pickling raises, so the file is flushed and not left open in the kernel.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

## Function for making recommendations

In [10]:
# Survey answers read from disk; hybrid_recommend looks up rows by 'userId'
# and reads the action_rating / comedy_rating / sci_fi_rating columns.
survey_data = pd.read_csv(filepath_or_buffer='datasets/survey.csv')

# Hybrid recommender function
def hybrid_recommend(user_id, n=5):
    """Return the titles of the top-``n`` movies for ``user_id``.

    Each candidate movie is scored as ``0.6 * CF estimate + 0.4 * genre score``.
    The CF estimate comes from ``model.predict``; the genre score is the sum of
    the user's survey weights (Action, Comedy, Sci-Fi) over the movie's genres,
    or 0 when the user has no row in ``survey_data``.

    Candidates are the rows of ``movies`` whose ``movieId`` occurs in
    ``ratings``. Every score vector is built in that row order, so the
    positions selected by ``argsort`` map to the correct titles.

    Relies on the notebook globals ``model``, ``movies``, ``ratings`` and
    ``survey_data``.

    Args:
        user_id: User identifier as used in ``ratings`` / ``survey_data``.
        n: Number of recommendations; values <= 0 yield an empty result.

    Returns:
        pandas Series of movie titles, highest combined score first.
    """
    # A non-positive n must not fall through: scores[-0:] would select everything.
    if n <= 0:
        return movies['title'].iloc[:0]

    # Candidate set: movies that have at least one rating, kept in `movies` row
    # order so predictions, genre scores and titles share the same positions.
    rated_ids = ratings['movieId'].unique()
    candidates = movies[movies['movieId'].isin(rated_ids)]

    # Collaborative Filtering predictions, one per candidate row
    cf_predictions = np.array(
        [model.predict(user_id, mid).est for mid in candidates['movieId']],
        dtype=float,
    )

    # Content-based boosting (if user exists in survey)
    user_rows = survey_data[survey_data['userId'] == user_id]
    if not user_rows.empty:
        user_row = user_rows.iloc[0]
        # Get genre weights (e.g., action_rating, comedy_rating from CSV)
        genre_weights = {
            'Action': user_row['action_rating'],
            'Comedy': user_row['comedy_rating'],
            'Sci-Fi': user_row['sci_fi_rating']
        }
        # Score movies by genre alignment; converted to a plain array so the
        # addition below is positional rather than index-aligned.
        genre_scores = candidates['genres'].apply(
            lambda g: sum(genre_weights.get(genre, 0) for genre in g)
        ).to_numpy(dtype=float)
    else:
        genre_scores = np.zeros(len(candidates))  # Default if no survey data

    # Combine scores (60% CF + 40% genre)
    # NOTE(review): genre scores are raw sums of survey ratings and are not
    # normalised to the CF rating scale - confirm the blend is as intended.
    combined_scores = 0.6 * cf_predictions + 0.4 * genre_scores
    top_indices = np.argsort(combined_scores)[-n:][::-1]
    return candidates.iloc[top_indices]['title']

In [11]:
# app.py
import streamlit as st

st.title("🎬 Movie Recommender (Python 3.11)")
# ratings['userId'].max() is a NumPy integer; cast it so min_value and
# max_value are both plain Python ints when handed to the widget.
max_user_id = int(ratings['userId'].max())
user_id = st.number_input("Enter User ID", min_value=1, max_value=max_user_id)

# Recommendations are computed only when the button is pressed.
if st.button("Recommend"):
    recommendations = hybrid_recommend(int(user_id))
    st.write("Top Recommendations:")
    for movie in recommendations:
        st.write(f"- {movie}")

2025-04-19 11:18:13.091 
  command:

    streamlit run C:\Users\danis\anaconda3\envs\movie_rec\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
