# Recommender system

## Enviroment config

### Load mods

In [38]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

pd.set_option('display.float_format', lambda x: '%.2f' % x)

### Relevant functions

In [39]:
def get_dim(param_grid):
    return np.prod([x for x in map(len, param_grid.values())])

## Data preparation

### Load data

In [40]:
pth = '/Users/gblasd/Documents/DataScience/02_SupervisedLearning/data/movies_2/'
movies = pd.read_csv(pth+"movies.csv")
links = pd.read_csv(pth+"links.csv")
ratings = pd.read_csv(pth+"ratings.csv")
tags = pd.read_csv(pth+"tags.csv")

### EDA

In [41]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
for dataset in [movies, links, ratings, tags]:
    print(dataset.shape)
    display(dataset.sample(5))

(9742, 3)


Unnamed: 0,movieId,title,genres
6889,63222,JCVD (2008),Action|Drama|Thriller
6070,40962,"Yours, Mine and Ours (2005)",Comedy|Romance
8885,134248,Hot Girls Wanted (2015),Documentary
9071,142488,Spotlight (2015),Thriller
4251,6201,Lady Jane (1986),Drama|Romance


(45843, 3)


Unnamed: 0,movieId,imdbId,tmdbId
37661,154616,3022458,305638.0
26307,121171,3264102,256876.0
33790,143655,2521700,297018.0
43953,171631,6264596,455601.0
4782,4877,236019,27444.0


(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
53799,354,48516,4.0,1200870273
96588,603,2074,4.0,953926937
20168,132,5791,3.5,1157924181
83134,526,3448,5.0,1502133084
20999,139,44665,2.5,1453924519


(3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
152,62,37729,helena bonham carter,1530310534
2138,474,6407,Olympics,1137374882
1540,474,1996,ghosts,1137373825
2448,474,33660,boxing,1137201602
231,62,87430,cheesy,1525555170


### Feature Engineering

#### Rating global

In [43]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [44]:
aux = ratings[["userId", "rating"]].groupby("userId").agg(["min", "max", "mean"])

In [45]:
aux.head()

Unnamed: 0_level_0,rating,rating,rating
Unnamed: 0_level_1,min,max,mean
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,1.0,5.0,4.37
2,2.0,5.0,3.95
3,0.5,5.0,2.44
4,1.0,5.0,3.56
5,1.0,5.0,3.64


In [46]:
aux.columns = ["_".join(x) for x in aux.columns]

In [47]:
aux = aux.reset_index()

In [48]:
aux.head()

Unnamed: 0,userId,rating_min,rating_max,rating_mean
0,1,1.0,5.0,4.37
1,2,2.0,5.0,3.95
2,3,0.5,5.0,2.44
3,4,1.0,5.0,3.56
4,5,1.0,5.0,3.64


In [49]:
ratings = ratings.merge(aux, on="userId", how = "left")

In [50]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_min,rating_max,rating_mean
0,1,1,4.0,964982703,1.0,5.0,4.37
1,1,3,4.0,964981247,1.0,5.0,4.37
2,1,6,4.0,964982224,1.0,5.0,4.37
3,1,47,5.0,964983815,1.0,5.0,4.37
4,1,50,5.0,964982931,1.0,5.0,4.37


#### Tags

In [51]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [52]:
tags = tags[["movieId", "tag"]].groupby("movieId").agg([lambda x:" ".join(x)])

In [53]:
tags.head(5)

Unnamed: 0_level_0,tag
Unnamed: 0_level_1,<lambda>
movieId,Unnamed: 1_level_2
1,pixar pixar fun
2,fantasy magic board game Robin Williams game
3,moldy old
5,pregnancy remake
7,remake


In [54]:
tags.columns = ["tags"]

In [55]:
vect = CountVectorizer(min_df=1, lowercase=True, max_features=30)

In [56]:
vect.fit(tags["tags"])

In [57]:
tags = tags.drop(columns=["tags"]).reset_index().join(pd.DataFrame(data = vect.transform(tags["tags"]).todense(), columns=vect.get_feature_names_out()))

In [58]:
tags.head()

Unnamed: 0,movieId,action,and,atmospheric,bad,black,comedy,crime,dark,disney,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Movies (genres)

In [59]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [60]:
movies["ls_genres"] = movies["genres"].str.split("|")

In [61]:
movies["n_genres"] = movies["ls_genres"].str.len()

In [62]:
movies.head()

Unnamed: 0,movieId,title,genres,ls_genres,n_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",5
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",3
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",3
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],1


In [63]:
movies["n_genres"].value_counts()

n_genres
2     3218
1     2851
3     2338
4      987
5      271
6       63
7       12
8        1
10       1
Name: count, dtype: int64

In [64]:
for i in range(movies["n_genres"].max()):
    movies[f"genre_{i}"] = movies.apply(lambda x:x["ls_genres"][i] if x["n_genres"] >= i+1 else np.nan, axis = 1)
    aux = movies[["movieId", f"genre_{i}"]].rename(columns={f"genre_{i}": "genre"})
    if i == 0:
        movies_genre = aux.copy()
    else:
        movies_genre = pd.concat([movies_genre,aux])

In [65]:
movies_genre[movies_genre['movieId']==1]

Unnamed: 0,movieId,genre
0,1,Adventure
0,1,Animation
0,1,Children
0,1,Comedy
0,1,Fantasy
0,1,
0,1,
0,1,
0,1,
0,1,


In [66]:
movies_genre.sort_values(by = "movieId").dropna()

Unnamed: 0,movieId,genre
0,1,Adventure
0,1,Fantasy
0,1,Comedy
0,1,Animation
0,1,Children
...,...,...
9738,193583,Fantasy
9739,193585,Drama
9740,193587,Animation
9740,193587,Action


In [67]:
movies_genre.head(2)

Unnamed: 0,movieId,genre
0,1,Adventure
1,2,Adventure


In [68]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,rating_min,rating_max,rating_mean
0,1,1,4.0,964982703,1.0,5.0,4.37
1,1,3,4.0,964981247,1.0,5.0,4.37


In [69]:
mgr = movies_genre.dropna().merge(ratings, on="movieId", how="right")

In [70]:
mgr.head(15)

Unnamed: 0,movieId,genre,userId,rating,timestamp,rating_min,rating_max,rating_mean
0,1,Adventure,1,4.0,964982703,1.0,5.0,4.37
1,1,Animation,1,4.0,964982703,1.0,5.0,4.37
2,1,Children,1,4.0,964982703,1.0,5.0,4.37
3,1,Comedy,1,4.0,964982703,1.0,5.0,4.37
4,1,Fantasy,1,4.0,964982703,1.0,5.0,4.37
5,3,Comedy,1,4.0,964981247,1.0,5.0,4.37
6,3,Romance,1,4.0,964981247,1.0,5.0,4.37
7,6,Action,1,4.0,964982224,1.0,5.0,4.37
8,6,Crime,1,4.0,964982224,1.0,5.0,4.37
9,6,Thriller,1,4.0,964982224,1.0,5.0,4.37


In [71]:
mgr = mgr[["userId", "genre", "rating"]].pivot_table(index="userId", columns="genre", aggfunc=["min", "max", "mean"])

In [72]:
mgr.head()

Unnamed: 0_level_0,min,min,min,min,min,min,min,min,min,min,...,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,...,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
genre,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
1,,2.0,2.0,3.0,3.0,2.0,2.0,,1.0,2.0,...,5.0,3.47,,4.68,4.17,4.31,4.22,4.15,4.5,4.29
2,,3.0,3.5,,,3.0,2.0,3.0,2.0,,...,,3.0,3.75,,4.0,4.5,3.88,3.7,4.5,3.5
3,,0.5,0.5,0.5,0.5,0.5,0.5,,0.5,0.5,...,,4.69,,0.5,5.0,0.5,4.2,4.14,0.5,
4,,1.0,1.0,3.0,1.0,1.0,1.0,3.0,1.0,1.0,...,4.0,4.25,3.0,4.0,3.48,3.38,2.83,3.55,3.57,3.8
5,,2.0,2.0,3.0,3.0,2.0,3.0,,1.0,3.0,...,,3.0,3.67,4.4,4.0,3.09,2.5,3.56,3.33,3.0


In [73]:
mgr.columns = ["_".join(x) for x in mgr.columns]

In [74]:
mgr.head(5)

Unnamed: 0_level_0,min_rating_(no genres listed),min_rating_Action,min_rating_Adventure,min_rating_Animation,min_rating_Children,min_rating_Comedy,min_rating_Crime,min_rating_Documentary,min_rating_Drama,min_rating_Fantasy,...,mean_rating_Film-Noir,mean_rating_Horror,mean_rating_IMAX,mean_rating_Musical,mean_rating_Mystery,mean_rating_Romance,mean_rating_Sci-Fi,mean_rating_Thriller,mean_rating_War,mean_rating_Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,2.0,2.0,3.0,3.0,2.0,2.0,,1.0,2.0,...,5.0,3.47,,4.68,4.17,4.31,4.22,4.15,4.5,4.29
2,,3.0,3.5,,,3.0,2.0,3.0,2.0,,...,,3.0,3.75,,4.0,4.5,3.88,3.7,4.5,3.5
3,,0.5,0.5,0.5,0.5,0.5,0.5,,0.5,0.5,...,,4.69,,0.5,5.0,0.5,4.2,4.14,0.5,
4,,1.0,1.0,3.0,1.0,1.0,1.0,3.0,1.0,1.0,...,4.0,4.25,3.0,4.0,3.48,3.38,2.83,3.55,3.57,3.8
5,,2.0,2.0,3.0,3.0,2.0,3.0,,1.0,3.0,...,,3.0,3.67,4.4,4.0,3.09,2.5,3.56,3.33,3.0


In [75]:
aux = mgr.isnull().mean().sort_values().reset_index()

In [76]:
aux.head()

Unnamed: 0,index,0
0,mean_rating_Drama,0.0
1,max_rating_Drama,0.0
2,min_rating_Drama,0.0
3,min_rating_Thriller,0.0
4,mean_rating_Thriller,0.0


In [77]:
ls_keep = aux[aux[0] <= 0.35]["index"].tolist()

In [78]:
ls_keep

['mean_rating_Drama',
 'max_rating_Drama',
 'min_rating_Drama',
 'min_rating_Thriller',
 'mean_rating_Thriller',
 'max_rating_Thriller',
 'max_rating_Comedy',
 'mean_rating_Comedy',
 'min_rating_Comedy',
 'mean_rating_Action',
 'max_rating_Action',
 'min_rating_Action',
 'max_rating_Adventure',
 'mean_rating_Romance',
 'min_rating_Adventure',
 'min_rating_Romance',
 'mean_rating_Adventure',
 'max_rating_Romance',
 'min_rating_Sci-Fi',
 'max_rating_Sci-Fi',
 'mean_rating_Sci-Fi',
 'mean_rating_Crime',
 'max_rating_Crime',
 'min_rating_Crime',
 'mean_rating_Fantasy',
 'max_rating_Fantasy',
 'min_rating_Fantasy',
 'mean_rating_Mystery',
 'max_rating_Mystery',
 'min_rating_Mystery',
 'mean_rating_Children',
 'min_rating_Children',
 'max_rating_Children',
 'min_rating_War',
 'mean_rating_War',
 'max_rating_War',
 'max_rating_Horror',
 'mean_rating_Horror',
 'min_rating_Horror',
 'min_rating_Animation',
 'mean_rating_Animation',
 'max_rating_Animation',
 'max_rating_Musical',
 'mean_rating_M

In [79]:
mgr = mgr[ls_keep]

In [80]:
mgr.head()

Unnamed: 0_level_0,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,mean_rating_Thriller,max_rating_Thriller,max_rating_Comedy,mean_rating_Comedy,min_rating_Comedy,mean_rating_Action,...,max_rating_Animation,max_rating_Musical,mean_rating_Musical,min_rating_Musical,max_rating_IMAX,mean_rating_IMAX,min_rating_IMAX,mean_rating_Western,max_rating_Western,min_rating_Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.53,5.0,1.0,1.0,4.15,5.0,5.0,4.28,2.0,4.32,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0
2,3.88,5.0,2.0,2.0,3.7,5.0,5.0,4.0,3.0,3.95,...,,,,,4.5,3.75,3.0,3.5,3.5,3.5
3,0.75,4.5,0.5,0.5,4.14,5.0,5.0,1.0,0.5,3.57,...,0.5,0.5,0.5,0.5,,,,,,
4,3.48,5.0,1.0,1.0,3.55,5.0,5.0,3.51,1.0,3.32,...,5.0,5.0,4.0,1.0,3.0,3.0,3.0,3.8,5.0,2.0
5,3.8,5.0,1.0,2.0,3.56,5.0,5.0,3.47,2.0,3.11,...,5.0,5.0,4.4,3.0,5.0,3.67,3.0,3.0,5.0,1.0


In [81]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_min,rating_max,rating_mean
0,1,1,4.0,964982703,1.0,5.0,4.37
1,1,3,4.0,964981247,1.0,5.0,4.37
2,1,6,4.0,964982224,1.0,5.0,4.37
3,1,47,5.0,964983815,1.0,5.0,4.37
4,1,50,5.0,964982931,1.0,5.0,4.37


In [82]:
ratings = ratings.merge(mgr.reset_index(), on="userId", how = "left")

In [83]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,...,max_rating_Animation,max_rating_Musical,mean_rating_Musical,min_rating_Musical,max_rating_IMAX,mean_rating_IMAX,min_rating_IMAX,mean_rating_Western,max_rating_Western,min_rating_Western
0,1,1,4.0,964982703,1.0,5.0,4.37,4.53,5.0,1.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0
1,1,3,4.0,964981247,1.0,5.0,4.37,4.53,5.0,1.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0
2,1,6,4.0,964982224,1.0,5.0,4.37,4.53,5.0,1.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0
3,1,47,5.0,964983815,1.0,5.0,4.37,4.53,5.0,1.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0
4,1,50,5.0,964982931,1.0,5.0,4.37,4.53,5.0,1.0,...,5.0,5.0,4.68,3.0,,,,4.29,5.0,3.0


#### TAD

In [84]:
tags.head()

Unnamed: 0,movieId,action,and,atmospheric,bad,black,comedy,crime,dark,disney,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
df = ratings.drop(columns=["timestamp"]).merge(tags, on = "movieId", how ="left")
del movies, links, ratings, tags

In [86]:
df.head()

Unnamed: 0,userId,movieId,rating,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
0,1,1,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,,,,,,,,,,
3,1,47,5.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,50,5.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [87]:
X = df[[x for x in df.columns if x not in ["userId", "movieId", "rating"]]]
y = df[["rating"]]

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)

#### Escalamiento de datos

In [None]:
#SimpleImputer?

[0;31mInit signature:[0m
[0mSimpleImputer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmissing_values[0m[0;34m=[0m[0mnan[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstrategy[0m[0;34m=[0m[0;34m'mean'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfill_value[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0madd_indicator[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeep_empty_features[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Univariate imputer for completing missing values with simple strategies.

Replace missing values using a descriptive statistic (e.g. mean, median, or
most frequent) along each column, or using a constant value.

Read more in the :ref:`User Guide <impute>`.

.. versionadded:: 0.20
   `Sim

In [90]:
im_x = SimpleImputer(strategy="constant", fill_value=-1)

In [91]:
sc_x = MinMaxScaler()
sc_y = MinMaxScaler()

In [92]:
X_train = im_x.fit_transform(X_train)

In [93]:
Xs = sc_x.fit_transform(X_train)
ys = sc_y.fit_transform(y_train)

## Model

### k-nearest neighbors algorithm

#### Cross validation

In [94]:
#KNeighborsRegressor?

In [95]:
knn = KNeighborsRegressor(n_neighbors=100)# se probó con 5 también
knn.fit(Xs, ys)

In [96]:
ls_scores = cross_val_score(X=Xs, y=ys, cv=4, n_jobs=-1, estimator=knn, scoring="r2")

In [97]:
np.mean(ls_scores), np.std(ls_scores)

(np.float64(0.21519314619757499), np.float64(0.006684736400878095))

#### Hyperparametrización

In [98]:
param_grid = {"n_neighbors": range(5, 105, 5),
              "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
              "metric": ["euclidean", "manhattan", "chebyshev", "minkowski", "seuclidean", "mahalanobis", ]}

In [99]:
dim = get_dim(param_grid)
dim

np.int64(480)

In [100]:
rs = RandomizedSearchCV(cv=4, error_score=-1000, estimator=knn, n_jobs=-1, scoring="r2", param_distributions=param_grid, verbose=5, n_iter=int(dim*0.1))

In [101]:
rs.fit(Xs, ys)

Fitting 4 folds for each of 48 candidates, totalling 192 fits
[CV 1/4] END algorithm=brute, metric=euclidean, n_neighbors=85;, score=0.213 total time=   3.2s
[CV 1/4] END algorithm=kd_tree, metric=seuclidean, n_neighbors=85;, score=-1000.000 total time=   0.0s
[CV 2/4] END algorithm=kd_tree, metric=seuclidean, n_neighbors=85;, score=-1000.000 total time=   0.0s
[CV 3/4] END algorithm=kd_tree, metric=seuclidean, n_neighbors=85;, score=-1000.000 total time=   0.0s
[CV 4/4] END algorithm=kd_tree, metric=seuclidean, n_neighbors=85;, score=-1000.000 total time=   0.0s
[CV 2/4] END algorithm=brute, metric=euclidean, n_neighbors=85;, score=0.218 total time=   3.8s
[CV 4/4] END algorithm=brute, metric=euclidean, n_neighbors=85;, score=0.223 total time=   3.7s
[CV 3/4] END algorithm=brute, metric=euclidean, n_neighbors=85;, score=0.207 total time=   3.7s
[CV 2/4] END algorithm=brute, metric=manhattan, n_neighbors=85;, score=0.223 total time=  47.7s
[CV 4/4] END algorithm=brute, metric=manhattan

Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *arg

[CV 2/4] END algorithm=kd_tree, metric=seuclidean, n_neighbors=75;, score=-1000.000 total time=   0.0s
[CV 3/4] END algorithm=kd_tree, metric=seuclidean, n_neighbors=75;, score=-1000.000 total time=   0.0s
[CV 4/4] END algorithm=kd_tree, metric=seuclidean, n_neighbors=75;, score=-1000.000 total time=   0.0s
[CV 1/4] END algorithm=auto, metric=seuclidean, n_neighbors=80;, score=-1000.000 total time=   0.0s
[CV 2/4] END algorithm=auto, metric=seuclidean, n_neighbors=80;, score=-1000.000 total time=   0.0s
[CV 3/4] END algorithm=auto, metric=seuclidean, n_neighbors=80;, score=-1000.000 total time=   0.0s
[CV 4/4] END algorithm=auto, metric=seuclidean, n_neighbors=80;, score=-1000.000 total time=   0.0s


Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *arg

[CV 3/4] END algorithm=brute, metric=euclidean, n_neighbors=50;, score=0.207 total time=   3.9s
[CV 2/4] END algorithm=brute, metric=euclidean, n_neighbors=50;, score=0.216 total time=   4.6s
[CV 4/4] END algorithm=brute, metric=euclidean, n_neighbors=50;, score=0.220 total time=   3.4s
[CV 2/4] END algorithm=auto, metric=manhattan, n_neighbors=60;, score=0.221 total time=  50.1s
[CV 1/4] END algorithm=auto, metric=manhattan, n_neighbors=60;, score=0.214 total time=  50.7s
[CV 3/4] END algorithm=auto, metric=manhattan, n_neighbors=60;, score=0.210 total time=  50.5s
[CV 4/4] END algorithm=auto, metric=manhattan, n_neighbors=60;, score=0.225 total time=  51.1s
[CV 1/4] END algorithm=kd_tree, metric=chebyshev, n_neighbors=5;, score=0.039 total time=  39.9s
[CV 1/4] END algorithm=kd_tree, metric=mahalanobis, n_neighbors=10;, score=-1000.000 total time=   0.0s
[CV 2/4] END algorithm=kd_tree, metric=mahalanobis, n_neighbors=10;, score=-1000.000 total time=   0.0s
[CV 3/4] END algorithm=kd_t

Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *arg

[CV 2/4] END algorithm=auto, metric=mahalanobis, n_neighbors=85;, score=-1000.000 total time=   0.0s
[CV 3/4] END algorithm=auto, metric=mahalanobis, n_neighbors=85;, score=-1000.000 total time=   0.0s
[CV 4/4] END algorithm=auto, metric=mahalanobis, n_neighbors=85;, score=-1000.000 total time=   0.0s


Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *arg

[CV 3/4] END algorithm=ball_tree, metric=minkowski, n_neighbors=35;, score=0.203 total time= 3.7min
[CV 2/4] END algorithm=ball_tree, metric=minkowski, n_neighbors=35;, score=0.214 total time= 3.7min
[CV 1/4] END algorithm=ball_tree, metric=minkowski, n_neighbors=35;, score=0.205 total time= 3.7min
[CV 2/4] END algorithm=brute, metric=manhattan, n_neighbors=95;, score=0.224 total time=  54.8s
[CV 1/4] END algorithm=brute, metric=manhattan, n_neighbors=95;, score=0.217 total time=  55.4s
[CV 4/4] END algorithm=brute, metric=manhattan, n_neighbors=95;, score=0.227 total time=  55.9s
[CV 3/4] END algorithm=brute, metric=manhattan, n_neighbors=95;, score=0.212 total time=  56.6s
[CV 2/4] END algorithm=kd_tree, metric=euclidean, n_neighbors=90;, score=0.219 total time= 1.6min
[CV 1/4] END algorithm=kd_tree, metric=euclidean, n_neighbors=90;, score=0.213 total time= 1.6min
[CV 4/4] END algorithm=kd_tree, metric=euclidean, n_neighbors=90;, score=0.224 total time= 1.6min
[CV 3/4] END algorithm

Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *arg

[CV 1/4] END algorithm=brute, metric=seuclidean, n_neighbors=95;, score=-1000.000 total time=   0.0s
[CV 2/4] END algorithm=brute, metric=seuclidean, n_neighbors=95;, score=-1000.000 total time=   0.0s
[CV 3/4] END algorithm=brute, metric=seuclidean, n_neighbors=95;, score=-1000.000 total time=   0.0s
[CV 4/4] END algorithm=brute, metric=seuclidean, n_neighbors=95;, score=-1000.000 total time=   0.0s
[CV 2/4] END algorithm=auto, metric=euclidean, n_neighbors=55;, score=0.217 total time=   3.9s
[CV 3/4] END algorithm=auto, metric=euclidean, n_neighbors=55;, score=0.207 total time=   4.5s
[CV 4/4] END algorithm=auto, metric=euclidean, n_neighbors=55;, score=0.221 total time=   3.8s
[CV 1/4] END algorithm=kd_tree, metric=euclidean, n_neighbors=40;, score=0.208 total time= 1.1min
[CV 2/4] END algorithm=kd_tree, metric=euclidean, n_neighbors=40;, score=0.214 total time= 1.1min
[CV 3/4] END algorithm=kd_tree, metric=euclidean, n_neighbors=40;, score=0.204 total time= 1.2min
[CV 4/4] END algo

Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *arg

[CV 2/4] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.100 total time=  54.9s
[CV 3/4] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.076 total time=  55.3s
[CV 4/4] END algorithm=auto, metric=manhattan, n_neighbors=5;, score=0.091 total time=  54.2s
[CV 1/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=10;, score=0.156 total time=  38.7s
[CV 2/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=10;, score=0.170 total time=  38.2s
[CV 3/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=10;, score=0.153 total time=  38.1s
[CV 4/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=10;, score=0.160 total time=  38.4s
[CV 1/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=5;, score=0.045 total time=  32.6s
[CV 2/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=5;, score=0.103 total time=  32.6s
[CV 3/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=5;, score=0.075 total time=  32.3s
[CV 4/4] END algorithm=kd_tree, met

Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, *arg

[CV 4/4] END algorithm=ball_tree, metric=chebyshev, n_neighbors=75;, score=0.191 total time= 5.1min
[CV 3/4] END algorithm=ball_tree, metric=chebyshev, n_neighbors=75;, score=0.175 total time= 5.1min
[CV 1/4] END algorithm=brute, metric=chebyshev, n_neighbors=35;, score=0.192 total time= 1.2min
[CV 3/4] END algorithm=brute, metric=chebyshev, n_neighbors=35;, score=0.190 total time= 1.2min
[CV 2/4] END algorithm=brute, metric=chebyshev, n_neighbors=35;, score=0.200 total time= 1.2min
[CV 4/4] END algorithm=brute, metric=chebyshev, n_neighbors=35;, score=0.200 total time= 1.2min
[CV 2/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=40;, score=0.214 total time= 1.1min
[CV 1/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=40;, score=0.208 total time= 1.1min
[CV 3/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=40;, score=0.204 total time= 1.1min
[CV 4/4] END algorithm=kd_tree, metric=minkowski, n_neighbors=40;, score=0.218 total time= 1.1min
[CV 1/4] END algorithm=a

28 fits failed out of a total of 192.
The score on these train-test partitions for these parameters will be set to -1000.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/gblasd/Documents/DataScience/.venv/lib/python3.13/site-packages/sklearn/neighbors/_regression.py", line 222, in fit
    return self._fit(X, y)
        

In [102]:
rs.best_score_

np.float64(0.22004557699825147)

In [103]:
rs.best_estimator_

### Naïve Bayes

#### Cross validation

In [104]:
yc = (y_train >= 5)*1
yc_test = (y_test >= 5)*1

In [105]:
y_train.head()

Unnamed: 0,rating
57886,5.0
53239,3.0
1822,4.5
89587,3.5
18925,5.0


In [106]:
yc.sample(5)

Unnamed: 0,rating
36083,0
14282,0
51788,0
91614,1
48642,0


In [107]:
#MultinomialNB, GaussianNB, BernoulliNB
nb = GaussianNB()
nb.fit(Xs, yc.values.ravel())

In [108]:
ls_scores = cross_val_score(X=Xs, y=yc, cv=4, n_jobs=-1, estimator=nb, scoring="roc_auc")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [109]:
np.mean(ls_scores), np.std(ls_scores)

(np.float64(0.7276800889163066), np.float64(0.0056137400893893725))

In [110]:
df.head()

Unnamed: 0,userId,movieId,rating,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
0,1,1,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,,,,,,,,,,
3,1,47,5.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,50,5.0,1.0,5.0,4.37,4.53,5.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Evaluation

In [111]:
df[['movieId','userId','rating']].sample(5)

Unnamed: 0,movieId,userId,rating
82952,48780,525,4.0
90961,3703,590,3.0
38242,39,263,4.0
41438,96821,280,5.0
25379,6539,177,5.0


In [112]:
df[(df["movieId"] == 119145) & (df["userId"] == 525)]

Unnamed: 0,userId,movieId,rating,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
83082,525,119145,4.5,0.5,4.5,3.54,3.68,4.5,1.5,1.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
# The imputation is missing
X.iloc[[83082]].fillna(-1)

Unnamed: 0,rating_min,rating_max,rating_mean,mean_rating_Drama,max_rating_Drama,min_rating_Drama,min_rating_Thriller,mean_rating_Thriller,max_rating_Thriller,max_rating_Comedy,...,sci,space,superhero,surreal,suspense,thought,travel,twist,visually,war
83082,0.5,4.5,3.54,3.68,4.5,1.5,1.5,3.62,4.5,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
nb.predict(sc_x.transform(X.iloc[[83082]].fillna(-1)))



array([0])

In [116]:
nb.predict(sc_x.transform(X.iloc[[83082]].fillna(-1)))



array([0])