In [1]:
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input mount and print the full path of every file found,
# one path per line, so the available datasets are visible in the output.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

/kaggle/input/alx-movie-recommendation-project-2024/sample_submission.csv
/kaggle/input/alx-movie-recommendation-project-2024/movies.csv
/kaggle/input/alx-movie-recommendation-project-2024/imdb_data.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_scores.csv
/kaggle/input/alx-movie-recommendation-project-2024/train.csv
/kaggle/input/alx-movie-recommendation-project-2024/test.csv
/kaggle/input/alx-movie-recommendation-project-2024/tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/links.csv


In [2]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV

In [3]:
# Load the competition tables. The dataset directory is named once so that
# pointing the notebook at a different copy of the data is a one-line change.
DATA_DIR = '/kaggle/input/alx-movie-recommendation-project-2024'

# Ratings used to fit the recommender, and the (userId, movieId) pairs to score.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Auxiliary tables. NOTE(review): the modelling cells visible in this notebook
# do not read these; confirm they are needed before paying the load cost.
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')
genome_scores = pd.read_csv(f'{DATA_DIR}/genome_scores.csv')
genome_tags = pd.read_csv(f'{DATA_DIR}/genome_tags.csv')


In [4]:
# Wrap the ratings frame in a Surprise dataset. The three columns are passed
# in (user, item, rating) order, and ratings are declared to lie in [0.5, 5.0].
rating_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(df=train.loc[:, rating_columns], reader=reader)


In [5]:
# Hold out 20% of the ratings. The split shuffles the ratings, so the seed is
# pinned to make the same partition come back on every Restart & Run All.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [8]:
# Hyper-parameter search space: three candidate values for each of four
# settings, i.e. 3**4 = 81 combinations in total.
param_grid = dict(
    n_factors=[50, 100, 150],
    n_epochs=[20, 50, 100],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.05, 0.1],
)



In [None]:
# Exhaustive search over param_grid for SVD, scoring each combination by RMSE
# with 3-fold cross-validation; n_jobs=-1 requests parallel execution.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse'],
    cv=3,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
)
gs.fit(data)

In [None]:
# Pull out the winning hyper-parameter combination for the RMSE measure and
# echo it so the chosen settings are recorded in the notebook output.
params_by_measure = gs.best_params
best_params = params_by_measure['rmse']
print("Best parameters:", best_params)


In [None]:
# Final model used for the submission: tuned hyper-parameters, fitted on every
# available rating rather than only the 80% split, with a fixed seed so the
# learned factors (and therefore the submission) are reproducible.
best_model = SVD(**best_params, random_state=42)
best_model.fit(data.build_full_trainset())

# Evaluate the tuned settings with cross-validation on a *separate* estimator.
# cross_validate calls fit on the estimator it is handed for each fold, so
# passing best_model itself would leave it trained on only the last fold's
# training portion by the time predictions are made.
cv_model = SVD(**best_params, random_state=42)
cv_results = cross_validate(cv_model, data, measures=['RMSE'], cv=3, verbose=True)
print(f"Cross-validation results: {cv_results}")

In [None]:
# Score every (userId, movieId) pair in the test frame, in row order.
# The two id columns are iterated directly instead of using
# DataFrame.iterrows(), which builds a Series per row (slow on large frames)
# and can up-cast the ids when the frame holds mixed dtypes.
predictions = [
    best_model.predict(user_id, movie_id).est
    for user_id, movie_id in zip(test['userId'], test['movieId'])
]

In [None]:
# Attach the predicted ratings to the test frame and derive the submission
# key in the form "<userId>_<movieId>".
test['rating'] = predictions
user_part = test['userId'].astype(str)
movie_part = test['movieId'].astype(str)
test['Id'] = user_part + '_' + movie_part

# Keep only the key and the prediction, and write them without the index.
submission = test.loc[:, ['Id', 'rating']]
submission.to_csv('/kaggle/working/submission.csv', index=False)