In [164]:
import pandas as pd

# Raw input tables. The ratings timestamp is dropped at load time.
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv").drop(columns=["timestamp"])
links_df = pd.read_csv("links.csv")
tags_df = pd.read_csv("tags.csv")

# Copy AFTER selecting the columns: assigning into a bare column slice is what
# raises SettingWithCopyWarning, regardless of whether the parent was a copy.
movies_feat = movies_df[["movieId", "genres"]].copy()
# "|" is a literal separator here, so say so explicitly instead of relying on
# the pandas-version-dependent default of `regex`.
movies_feat["genres"] = (
    movies_feat["genres"]
    .fillna("")
    .str.replace("|", " ", regex=False)
    .str.lower()
)

tags_feat = tags_df[["movieId", "tag"]].copy()
tags_feat["tag"] = tags_feat["tag"].fillna("").str.lower()
# One row per movie holding its distinct tags. Sorting makes the joined text
# reproducible; plain set iteration order for strings changes between runs.
tags_feat = (
    tags_feat.groupby("movieId")["tag"]
    .apply(lambda tags: " ".join(sorted(set(tags))))
    .reset_index()
)

# Movies without any tag get an empty string, so the concatenation below never
# sees NaN.
movies_tags_feat = movies_feat.merge(tags_feat, on="movieId", how="left").fillna("")
movies_tags_feat["genres_tag"] = (
    movies_tags_feat["genres"] + " " + movies_tags_feat["tag"]
).str.strip()

print(movies_tags_feat.columns)

Index(['movieId', 'genres', 'tag', 'genres_tag'], dtype='object')


In [165]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# Fit TF-IDF on the combined genre/tag text, one document per row of
# movies_tags_feat.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(movies_tags_feat["genres_tag"])

# Dense frame with one column per vocabulary term, indexed by movieId so it can
# be joined onto other tables by movie.
x_tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    index=movies_tags_feat["movieId"].values,
    columns=tfidf_vectorizer.get_feature_names_out(),
)
print(x_tfidf_df.index)

Index([     1,      2,      3,      4,      5,      6,      7,      8,      9,
           10,
       ...
       193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585, 193587,
       193609],
      dtype='int64', length=9742)


In [166]:
def feat_aggregator(df, id_col, feat_col):
    """Summarise ``feat_col`` within each ``id_col`` group.

    Args:
        df: DataFrame containing both ``id_col`` and ``feat_col``.
        id_col: Column to group by; also used as the prefix of every
            output column name.
        feat_col: Column whose values are aggregated.

    Returns:
        DataFrame with one row per distinct ``id_col`` value. Columns are
        ``id_col`` followed by ``<id_col>_avg``, ``_var``, ``_median``,
        ``_count``, ``_min``, ``_max``, ``_std`` and ``_sum``, in that order.
    """
    # (output suffix, pandas aggregation name), listed in output column order.
    stats = (
        ("avg", "mean"),
        ("var", "var"),
        ("median", "median"),
        ("count", "count"),
        ("min", "min"),
        ("max", "max"),
        ("std", "std"),
        ("sum", "sum"),
    )
    named_aggs = {
        f"{id_col}_{suffix}": (feat_col, func) for suffix, func in stats
    }

    grouped = df.groupby(id_col)
    return grouped.agg(**named_aggs).reset_index()

In [167]:
# Per-user and per-movie rating statistics. Sample var/std are NaN for a group
# with a single rating, hence the fillna(0).
# NOTE(review): these statistics are computed from every row of ratings_df,
# including each row's own target rating and the rows that the later
# train/test split assigns to the test set. Held-out metrics built on these
# columns are therefore optimistic; consider deriving them from training rows
# only.
users_feat = feat_aggregator(ratings_df, "userId", "rating").fillna(0)
movies_feat = feat_aggregator(ratings_df, "movieId", "rating").fillna(0)

# One row per rating, in ratings_df order. Every lookup table must have unique
# keys so that the left merges neither add nor reorder rows; `validate` turns a
# violation into an explicit MergeError instead of silent row duplication.
# Movies missing from the TF-IDF table get all-zero text features.
common_feat_df = (
    ratings_df[["userId", "movieId"]]
    .merge(users_feat, on="userId", how="left", validate="many_to_one")
    .merge(movies_feat, on="movieId", how="left", validate="many_to_one")
    .merge(
        x_tfidf_df,
        left_on="movieId",
        right_index=True,
        how="left",
        validate="many_to_one",
    )
    .fillna(0)
)

# Attach the target positionally rather than re-joining ratings_df on
# (userId, movieId): the rows are still aligned one-to-one with ratings_df, and
# a self-join would multiply rows if a (userId, movieId) pair were ever
# duplicated. `rating` stays the last column.
full_df = common_feat_df.reset_index(drop=True)
full_df["rating"] = ratings_df["rating"].to_numpy()
print(full_df.head())

   userId  movieId  userId_avg  userId_var  userId_median  userId_count  \
0       1        1    4.366379    0.640077            5.0           232   
1       1        3    4.366379    0.640077            5.0           232   
2       1        6    4.366379    0.640077            5.0           232   
3       1       47    4.366379    0.640077            5.0           232   
4       1       50    4.366379    0.640077            5.0           232   

   userId_min  userId_max  userId_std  userId_sum  ...  you  younger  your  \
0         1.0         5.0    0.800048      1013.0  ...  0.0      0.0   0.0   
1         1.0         5.0    0.800048      1013.0  ...  0.0      0.0   0.0   
2         1.0         5.0    0.800048      1013.0  ...  0.0      0.0   0.0   
3         1.0         5.0    0.800048      1013.0  ...  0.0      0.0   0.0   
4         1.0         5.0    0.800048      1013.0  ...  0.0      0.0   0.0   

   zellweger  zither  zoe  zombie  zombies  zooey  rating  
0        0.0     0.0

In [168]:
# Every column except the two join keys and the target is a model input.
feat_cols = [
    col for col in full_df.columns if col not in ["userId", "movieId", "rating"]
]
X = full_df[feat_cols]
y = full_df["rating"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# No model is fitted in this cell: the regressors are trained on the scaled
# matrices in the following cells, so a fit on the unscaled X_train here would
# only be thrown away.

# Learn the scaling parameters from the training rows only and reuse them for
# the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [169]:
# Ordinary least squares on the standardised training features, followed by
# predictions for the standardised test features.
model_lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model_lr.predict(X_test_scaled)

In [170]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Root-mean-squared error of whatever predictions y_pred currently holds
# against the held-out targets.
rmse_lr = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(rmse_lr)

0.8184285808248377


In [171]:
from xgboost import XGBRegressor

# Gradient-boosted trees trained on the same scaled train matrix.
xgb_model = XGBRegressor(
    max_depth=6,
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train_scaled, y_train)

# Note: this rebinds y_pred, replacing any predictions stored under that name.
y_pred = xgb_model.predict(X_test_scaled)

rmse_xg = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(rmse_xg)

0.7864967120208497
