In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
import pickle


# Data Collection

In [None]:
# Load dataset
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return its contents as a DataFrame.

    Args:
        file_path: Path (or any other source accepted by ``pandas.read_csv``)
            of the CSV file to load.

    Returns:
        pandas.DataFrame: The parsed file, using ``read_csv`` defaults.
    """
    return pd.read_csv(file_path)

# Clean data
def clean_data(data):
    """Return a new DataFrame without the rows that contain missing values.

    The input frame is left untouched so the raw data stays available and the
    calling cell can be re-run safely (an in-place drop would silently modify
    the caller's object).

    Args:
        data: pandas.DataFrame to clean.

    Returns:
        pandas.DataFrame: The rows of ``data`` that have no missing value in
        any column, with their original index labels preserved.
    """
    return data.dropna()

# Split data
def split_data(data, test_size=0.2, random_state=42):
    """Split ``data`` into a training part and a test part.

    Args:
        data: The dataset to split (passed straight to ``train_test_split``).
        test_size: Share of the data held out for testing; forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            repeatable. Defaults to 42.

    Returns:
        tuple: ``(train, test)`` as returned by ``train_test_split``.
    """
    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )
    return train, test

# Example using the MovieLens dataset.
# Load ratings.csv: the Model Training cell selects the 'userId', 'movieId'
# and 'rating' columns, which are the ratings-file columns (movies.csv holds
# movie metadata and has no 'rating' column).
data = load_data('ml-latest-small/ml-latest-small/ratings.csv')
cleaned_data = clean_data(data)
train_data, test_data = split_data(cleaned_data)

# Persist both splits to the working directory, without the index column.
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)


# Model Training

In [None]:
# Load the training split. The Data Collection cell writes 'train_data.csv'
# into the working directory, so read it back from that same location.
train_data = pd.read_csv('train_data.csv')

# Prepare data for the Surprise library. The rating scale is taken from the
# smallest and largest rating observed in the training split.
reader = Reader(rating_scale=(train_data['rating'].min(), train_data['rating'].max()))
data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

# Build and evaluate the model. SVD is seeded (same seed as the data split) so
# the fitted model is repeatable across runs.
# NOTE(review): the 5-fold assignment itself is not seeded here, so the printed
# cross-validation scores may still vary slightly between runs.
algo = SVD(random_state=42)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train the model on the whole dataset
trainset = data.build_full_trainset()
algo.fit(trainset)

# Save the trained model
with open('recommendation_model.pkl', 'wb') as f:
    pickle.dump(algo, f)
