In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load data
# Both CSV files are read relative to the current working directory.
ratings_df = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies.csv')

print("Load Dataset - Done")

# Merge data
# pd.merge defaults to an inner join, so only rows whose movieId is present in
# both frames are kept.
df = pd.merge(ratings_df, movies_df, on='movieId')

# Take the first 1000 records
# Truncating BEFORE the encoding step avoids computing 19 indicator columns
# over the whole merged frame only to discard everything past row 1000.
# .copy() detaches the slice so the column assignments below write to an
# independent frame rather than to a possible view of the merged data.
df = df.head(1000).copy()

# Create one-hot encoding of movie genres
genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
for genre in genres:
    # Literal substring test against the raw `genres` string (regex=False keeps
    # names such as 'Sci-Fi' from being interpreted as patterns). This is the
    # same match rule as `genre in x`, but vectorized. na=False encodes a
    # missing `genres` value as 0 instead of raising.
    df[genre] = df['genres'].str.contains(genre, regex=False, na=False).astype(int)

# Split data into train and test sets
# 80% train / 20% test. The fixed random_state makes the shuffle (and therefore
# every metric reported further down) reproducible across kernel restarts;
# without it each run draws a different split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Splitting Dataset - Done")

# Features are the genre indicator columns; the target is binary:
# 1 when the rating is at least 3, otherwise 0.
train_features = train_df[genres]
train_labels = (train_df['rating'] >= 3).astype(int)

# Define and train the logistic regression model (library defaults)
logreg = LogisticRegression()
logreg.fit(train_features, train_labels)

print("Train Logistic Regression model - Done")

# Make predictions on the held-out rows using the same feature columns
test_features = test_df[genres]
preds = logreg.predict(test_features)

print("Making Predictions - Done")

# Evaluate model
# Binarize the ground truth once (1 = rating >= 3, else 0) and reuse it for
# every metric, so the threshold cannot drift between the four calls.
y_test = (test_df['rating'] >= 3).astype(int)

# zero_division=0 makes the degenerate cases explicit (e.g. no positive
# predictions, or no positive samples in the test split): the metric is
# reported as 0 without emitting an UndefinedMetricWarning.
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds, zero_division=0)
recall = recall_score(y_test, preds, zero_division=0)
f1 = f1_score(y_test, preds, zero_division=0)

# NOTE(review): accuracy alone can be misleading if most ratings are >= 3;
# compare it against the positive-class share of y_test when reading results.
print("Logistic Regression model - Evaluated")
print('Accuracy: ', accuracy)
print('Precision: ', precision)
print('Recall: ', recall)
print('F1 Score: ', f1)


C:\Users\rithe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\rithe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


Load Dataset - Done
Splitting Dataset - Done
Train Logistic Regression model - Done
Making Predictions - Done
Logistic Regression model - Evaluated
Accuracy:  0.92
Precision:  0.92
Recall:  1.0
F1 Score:  0.9583333333333334
