In [2]:
import pandas as pd
import configparser
import pymysql
from sqlalchemy import create_engine
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import warnings; warnings.simplefilter('ignore')

In [3]:
parser = configparser.ConfigParser()
parser.read("config\mysql.conf")
hostname = parser.get("mysql_config", "hostname")
username = parser.get("mysql_config", "username")
password = parser.get("mysql_config", "password")
dbname = parser.get("mysql_config", "database")
port = parser.get("mysql_config", "port")

conn_str = 'mysql+pymysql://' + username + ':' + password + '@' + hostname + '/' + dbname
db_connection = create_engine(conn_str)

Wyczytanie danych z MySQL

In [4]:
ratings = pd.read_sql('SELECT * FROM movies_dwh.v_ml_ratings', con = db_connection)
ratings.drop(['rating_date', 'rating_time'], axis=1, inplace=True)

Macierz ocen

In [5]:
ratings_matrix = ratings.pivot(index='movie_id', columns='user_id', values='rating')
ratings_matrix = ratings_matrix.fillna(0)
#X = ratings_matrix.values

In [6]:
def normalize_row(row):
    return row - row.mean()
normalized_ratings = ratings_matrix.apply(normalize_row, axis=1)
X = normalized_ratings.values

In [7]:
pearson = normalized_ratings.T.corr(method='pearson')

In [8]:
def get_top_5_second_to_sixth_with_labels(row):
    largest = row.nlargest(6).iloc[1:6]
    return [(movie_id, similarity) for movie_id, similarity in zip(largest.index, largest.values)]
new_df = pearson.apply(get_top_5_second_to_sixth_with_labels, axis=1, result_type='expand')
result_pearson = new_df.stack().reset_index()
result_pearson.columns = ['movie_id', 'similar_movies', 'similarity']
result_pearson[['similar_movies', 'similarity']] = result_pearson['similarity'].apply(lambda x: pd.Series(x))
result_pearson
#result_pearson.drop('level_1', axis=1, inplace=True)
#result_pearson

Unnamed: 0,movie_id,similar_movies,similarity
0,1,3114.0,0.474141
1,1,2355.0,0.393799
2,1,1265.0,0.372371
3,1,4886.0,0.366277
4,1,780.0,0.356876
...,...,...,...
45325,163949,2056.0,1.000000
45326,163949,3121.0,1.000000
45327,163949,3292.0,1.000000
45328,163949,3377.0,1.000000


In [9]:
#ratings[ratings['user_id'] == 1].sort_values(by='rating', ascending=False)
idx = ratings.groupby('user_id')['rating'].idxmax()
ratings_max = ratings.loc[idx]
final_recom = pd.DataFrame(columns=['user_id', 'movie_id', 'rating', 'similar_movies', 'similarity'])
for user_id in ratings['user_id'].unique():
    recom = pd.merge(ratings_max, result_pearson, how='left', on='movie_id')
    final_recom = pd.concat([final_recom, recom], axis=0)
final_recom

Unnamed: 0,user_id,movie_id,rating,similar_movies,similarity
0,1,1172,4.0,2065.0,0.548839
1,1,1172,4.0,2973.0,0.410986
2,1,1172,4.0,3307.0,0.405256
3,1,1172,4.0,2352.0,0.402959
4,1,1172,4.0,3504.0,0.401994
...,...,...,...,...,...
3350,671,1,5.0,3114.0,0.474141
3351,671,1,5.0,2355.0,0.393799
3352,671,1,5.0,1265.0,0.372371
3353,671,1,5.0,4886.0,0.366277


In [10]:
final_recom.tail(25)

Unnamed: 0,user_id,movie_id,rating,similar_movies,similarity
3330,667,32,5.0,47.0,0.37676
3331,667,32,5.0,6.0,0.367041
3332,667,32,5.0,25.0,0.353743
3333,667,32,5.0,780.0,0.350436
3334,667,32,5.0,648.0,0.328075
3335,668,296,5.0,47.0,0.499364
3336,668,296,5.0,593.0,0.460709
3337,668,296,5.0,50.0,0.449285
3338,668,296,5.0,1089.0,0.438222
3339,668,296,5.0,1213.0,0.431997


In [9]:
def predict_ratings(similarity_df, ratings_df, common_movies):
    predictions = []

    for user_id in common_movies.keys():
        similar_users = similarity_df[similarity_df['user_id'] == user_id]['similar_users'].unique()
        user_ratings = ratings_df[ratings_df['user_id'].isin(similar_users)]

        for movie_id in common_movies[user_id]:
            similar_users_ratings = user_ratings[user_ratings['movie_id'] == movie_id]
            if not similar_users_ratings.empty:
                predicted_rating = similar_users_ratings['rating'].mean()
                predictions.append({'user_id': user_id, 'movie_id': movie_id, 'predicted_rating': predicted_rating})

    return pd.DataFrame(predictions)

predicted_ratings_df = predict_ratings(result_pearson, ratings, common_movies)
print("Predicted Ratings DataFrame:")
print(predicted_ratings_df.head(15))


NameError: name 'common_movies' is not defined

In [11]:
final_recom.drop_duplicates(inplace=True)

In [12]:
ratings_to_insert = final_recom.values.tolist()
ratings_to_insert

[[1, 1172, 4.0, 2065.0, 0.5488387167438654],
 [1, 1172, 4.0, 2973.0, 0.410985654851515],
 [1, 1172, 4.0, 3307.0, 0.405255825807544],
 [1, 1172, 4.0, 2352.0, 0.4029594187372175],
 [1, 1172, 4.0, 3504.0, 0.4019939257893162],
 [2, 17, 5.0, 1357.0, 0.44053075683982335],
 [2, 17, 5.0, 515.0, 0.39764909775860335],
 [2, 17, 5.0, 265.0, 0.38393684323949745],
 [2, 17, 5.0, 11.0, 0.36455560132789433],
 [2, 17, 5.0, 222.0, 0.36316353229060344],
 [3, 318, 5.0, 296.0, 0.3984590731800518],
 [3, 318, 5.0, 593.0, 0.388599342471766],
 [3, 318, 5.0, 50.0, 0.3779979383251654],
 [3, 318, 5.0, 356.0, 0.35509204420389284],
 [3, 318, 5.0, 1704.0, 0.34077401018389525],
 [4, 34, 5.0, 595.0, 0.41253823738249923],
 [4, 34, 5.0, 588.0, 0.4021425195591779],
 [4, 34, 5.0, 590.0, 0.3690169005603854],
 [4, 34, 5.0, 364.0, 0.3629181623649794],
 [4, 34, 5.0, 39.0, 0.3618153628548001],
 [5, 597, 5.0, 587.0, 0.6430233018850273],
 [5, 597, 5.0, 539.0, 0.633271076006994],
 [5, 597, 5.0, 500.0, 0.5722073915977209],
 [5, 597

In [13]:
conn = pymysql.connect(
    host=hostname,
    user=username,
    password=password,
    db=dbname,
    port=int(port)
)

if conn is None:
    print("Error. Connection to MySQL cannot be established.")
else:
    print("Successfully connected to MySQL.")

cur = conn.cursor()

Successfully connected to MySQL.


In [14]:
create_table = """
CREATE TABLE IF NOT EXISTS movies_dwh.ml_result_movie_movie (
    user_id INT,
    movie_id INT,
    predicted_rating FLOAT,
    similar_movies INT,
    similarity FLOAT
);
"""

insert_query = """
INSERT INTO movies_dwh.ml_result_movie_movie (user_id, movie_id, predicted_rating, similar_movies, similarity)
VALUES (%s, %s, %s, %s, %s);
"""

In [15]:
cur.execute(create_table)

0

In [16]:
# Wstawienie danych do tabeli
cur.executemany(insert_query, ratings_to_insert)

# Zatwierdzenie zmian w bazie danych
conn.commit()

# Zamknięcie kursora i połączenia
cur.close()
conn.close()

Walidacja

strona 213

In [41]:
val = ratings
pearson_val = pearson
def predict_ratings(user_id, movie_id):
    if movie_id not in ratings_matrix.columns or movie_id not in ratings_matrix.index:
        return np.nan

    # Znajdź podobnych użytkowników
    similar_movies = pearson_val[movie_id].sort_values(ascending=False).index[1:6]

    # Sprawdź, czy użytkownik ocenił podobne filmy
    if not ratings_matrix.loc[user_id, similar_movies].any():
        return np.nan

    # Wyznacz średnią ocenę od podobnych użytkowników
    similar_ratings = ratings_matrix.loc[user_id, similar_movies]
    similar_ratings = similar_ratings[similar_ratings > 0]  # Usuń brakujące oceny

    if similar_ratings.empty:
        return np.nan

    return similar_ratings.mean()

# Przewidywanie ocen dla testowych danych
val['predicted_rating'] = val.apply(lambda row: predict_ratings(row['user_id'], row['movie_id']), axis=1)

# Usuń wiersze z brakującymi przewidywaniami
val = val.dropna(subset=['predicted_rating'])
val

KeyError: '[1357] not in index'

In [None]:
    if movie_id not in ratings_matrix.columns or movie_id not in ratings_matrix.index:
        return np.nan

    # Znajdź podobnych użytkowników
    similar_movies = pearson_val[movie_id].sort_values(ascending=False).index[1:6]

    # Sprawdź, czy użytkownik ocenił podobne filmy
    if not ratings_matrix.loc[user_id, similar_movies].any():
        return np.nan

    # Wyznacz średnią ocenę od podobnych użytkowników
    similar_ratings = ratings_matrix.loc[user_id, similar_movies]
    similar_ratings = similar_ratings[similar_ratings > 0]  # Usuń brakujące oceny

    if similar_ratings.empty:
        return np.nan

    return similar_ratings.mean()

In [45]:
 similar_ratings = ratings_matrix.loc[user_id, similar_movies]
similar_ratings

movie_id
500    0.0
317    0.0
364    0.0
158    0.0
367    0.0
Name: 671, dtype: float64

In [36]:
predicted_ratings_df = val[['user_id', 'movie_id', 'predicted_rating']]
#predicted_ratings_df.to_csv('predicted_ratings.csv', index=False)


KeyError: "['predicted_rating'] not in index"

In [93]:
# Rzeczywiste i przewidywane oceny
y_true = val['rating']
y_pred = val['predicted_rating']

# Obliczanie RMSE
rmse = sqrt(mean_squared_error(y_true, y_pred))
print(f'RMSE: {rmse}')

# Obliczanie MAE
mae = mean_absolute_error(y_true, y_pred)
print(f'MAE: {mae}')


RMSE: 1.057754192515523
MAE: 0.8013428690459659
