In [1]:
import pandas as pd

## Data Processing 

In [2]:
# Load the ratings data
ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load the movies data
movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None,
                     names=['item_id', 'title'] + [f'genre_{i}' for i in range(19)])

# Load the user data
users = pd.read_csv('ml-100k/u.user', sep='|', header=None, 
                    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

In [3]:
users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [4]:
user_interactions = ratings.groupby('user_id').size()
user_indexes = user_interactions[user_interactions > 200].index 
ratings = ratings[ratings.user_id.isin(user_indexes)]
ratings

Unnamed: 0,user_id,item_id,rating,timestamp
3,244,51,2,880606923
8,305,451,3,886324817
9,6,86,3,883603013
10,62,257,2,879372434
11,286,1014,5,879781125
...,...,...,...,...
99994,378,78,3,880056976
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795


In [5]:
movies = movies[movies.index.get_level_values(0).isin(ratings.item_id)]
movies = movies.dropna(axis=1)
movies

Unnamed: 0,Unnamed: 1,Unnamed: 2,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,"Sunchaser, The (1996)",25-Oct-1996,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1676,"War at Home, The (1996)",01-Jan-1996,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1677,Sweet Nothing (1995),20-Sep-1996,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1681,You So Crazy (1994),01-Jan-1994,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


## Results

In [6]:
from sklearn.model_selection import train_test_split
import random 

test_user = random.choice(ratings.user_id.unique())

def get_data_for_user(ratings, movies, user_id):    
    temp_ratings = ratings[ratings.user_id == user_id]
    temp_movies = movies[movies.index.get_level_values(0).isin(temp_ratings.item_id)]
    temp_data = temp_movies.merge(temp_ratings, left_on=temp_movies.index.get_level_values(0), right_on='item_id').drop(['item_id', 'user_id', 'timestamp'], axis=1) 
    X = temp_data.drop('rating', axis=1)
    y = temp_data['rating']
    return train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = get_data_for_user(ratings, movies, test_user)

param_grid = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 50, 100],
            'max_depth': [None, 10, 20]
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=1000, random_state=42),
        'params': {
            'hidden_layer_sizes': [(100,), (100, 50), (150,), (150, 75)],
            'activation': ['relu'],
            'solver': ['adam'],
            'alpha': [0.0001, 0.001, 0.01]
        }
    }
}

# Perform grid search for each model
best_models = {}
for model_name, model_info in param_grid.items():
    grid_search = GridSearchCV(model_info['model'], model_info['params'], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

best_models

{'LogisticRegression': LogisticRegression(C=0.1),
 'RandomForest': RandomForestClassifier(n_estimators=10),
 'DecisionTree': DecisionTreeClassifier(min_samples_split=10),
 'MLPClassifier': MLPClassifier(hidden_layer_sizes=(150, 75), max_iter=1000, random_state=42)}

In [8]:
results = {}

for model_name, model in best_models.items():
    results[model_name] = []

for user in ratings.user_id.unique():
    X_train, X_test, y_train, y_test = get_data_for_user(ratings, movies, user)

    for model_name, model in best_models.items():
        model.fit(X_train, y_train)
        model.score(X_test, y_test)
        results[model_name].append(model.score(X_test, y_test))


In [9]:
results

{'LogisticRegression': [0.375,
  0.417910447761194,
  0.46875,
  0.38571428571428573,
  0.27586206896551724,
  0.4461538461538462,
  0.2945205479452055,
  0.3695652173913043,
  0.4943820224719101,
  0.4583333333333333,
  0.44047619047619047,
  0.4666666666666667,
  0.2857142857142857,
  0.4307692307692308,
  0.4578313253012048,
  0.648854961832061,
  0.3333333333333333,
  0.4297520661157025,
  0.3448275862068966,
  0.4878048780487805,
  0.43478260869565216,
  0.359375,
  0.28272251308900526,
  0.6507936507936508,
  0.4444444444444444,
  0.38461538461538464,
  0.28205128205128205,
  0.3511450381679389,
  0.28421052631578947,
  0.45555555555555555,
  0.4166666666666667,
  0.375,
  0.3880597014925373,
  0.38144329896907214,
  0.39080459770114945,
  0.47692307692307695,
  0.32323232323232326,
  0.34523809523809523,
  0.3490566037735849,
  0.38095238095238093,
  0.3170731707317073,
  0.463768115942029,
  0.3655913978494624,
  0.4605263157894737,
  0.5645161290322581,
  0.36619718309859156,


In [10]:
results_df = pd.DataFrame(results)
results_df.mean()

LogisticRegression    0.402178
RandomForest          0.367282
DecisionTree          0.360081
MLPClassifier         0.370833
dtype: float64