In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

In [44]:
# Você pode baixar o conjunto de dados MovieLens 100k em https://grouplens.org/datasets/movielens/
# Carregue os arquivos 'movies.csv' e 'ratings.csv'

movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')
ratings = pd.read_csv('../data/reduced/ratings_m10.csv')

In [45]:
movies = movies[['title', 'genres']]

In [46]:
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
117529,Jurassic World,"['Action', 'Adventure', 'Drama', 'Sci-Fi', 'Th..."
122882,Mad Max: Fury Road,"['Action', 'Adventure', 'Sci-Fi', 'Thriller']"
122886,Star Wars: Episode VII - The Force Awakens,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi', '..."
139385,The Revenant,"['Adventure', 'Drama']"
134130,The Martian,"['Adventure', 'Drama', 'Sci-Fi']"
...,...,...
2116,"Lord of the Rings, The","['Adventure', 'Animation', 'Children', 'Fantasy']"
2138,Watership Down,"['Adventure', 'Animation', 'Children', 'Drama'..."
2779,Heaven Can Wait,['Comedy']
52435,How the Grinch Stole Christmas!,"['Animation', 'Comedy', 'Fantasy', 'Musical']"


In [47]:
# Junte os conjuntos de dados 'movies' e 'ratings' usando a coluna 'movieId'
data = pd.merge(ratings, movies, on='movieId')

In [48]:
# Criar uma matriz de recursos TF-IDF para a descrição do filme
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])

# Adicionar a matriz TF-IDF ao conjunto de dados
movies_tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=movies.index)

# Concatenar o conjunto de dados original com a matriz TF-IDF
data = pd.concat([data, movies_tfidf], axis=1).fillna(0)


In [49]:
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,0,1,2,3,...,11,12,13,14,15,16,17,18,19,20
0,1.0,1.0,4.0,9.649827e+08,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,5.0,1.0,4.0,8.474350e+08,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.000000,0.369385,0.564013,0.495978,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,7.0,1.0,4.5,1.106636e+09,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.000000,0.474450,0.000000,0.637051,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,15.0,1.0,2.5,1.510578e+09,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.819299,0.000000,0.0,0.0,0.0
4,17.0,1.0,4.5,1.305696e+09,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102123,0.0,0.0,0.0,0.000000e+00,0,0,0.766598,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
104241,0.0,0.0,0.0,0.000000e+00,0,0,0.546390,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
107348,0.0,0.0,0.0,0.000000e+00,0,0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
103772,0.0,0.0,0.0,0.000000e+00,0,0,0.362496,0.407530,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.463801,0.0,0.0,0.0


In [50]:
#data.columns = data.columns.astype(str)

In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 75250 entries, 0 to 102481
Data columns (total 27 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     75250 non-null  float64
 1   movieId    75250 non-null  float64
 2   rating     75250 non-null  float64
 3   timestamp  75250 non-null  float64
 4   title      75250 non-null  object 
 5   genres     75250 non-null  object 
 6   0          75250 non-null  float64
 7   1          75250 non-null  float64
 8   2          75250 non-null  float64
 9   3          75250 non-null  float64
 10  4          75250 non-null  float64
 11  5          75250 non-null  float64
 12  6          75250 non-null  float64
 13  7          75250 non-null  float64
 14  8          75250 non-null  float64
 15  9          75250 non-null  float64
 16  10         75250 non-null  float64
 17  11         75250 non-null  float64
 18  12         75250 non-null  float64
 19  13         75250 non-null  float64
 20  14        

In [52]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'], axis=1), data['rating'], test_size=0.2, random_state=42)

In [53]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
43081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
45496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
model = make_pipeline(LinearRegression())
model.fit(X_train, y_train)


In [55]:
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1.0595187527979109


In [56]:
# Substitua user_id e movie_id pelos valores desejados
user_id = 1
movie_id = 1

user_movie_data = data[(data['userId'] == user_id) & (data['movieId'] == movie_id)].drop(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres'], axis=1)
prediction = model.predict(user_movie_data)

print(f'Previsão de nota para o filme: {prediction[0]}')

Previsão de nota para o filme: 3.543473909225884
