<a href="https://colab.research.google.com/github/ekshustova/ekshustova/blob/main/Recsys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from typing import Callable, List

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as scs
from numpy.linalg import norm

In [None]:
# Скачиваем архив с данными MovieLens 100k
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip -O ml-100k.zip

# Распаковываем архив в папку data
!unzip -uo ml-100k.zip -d data

# Приводим файл с рейтингами к формату CSV
!awk 'BEGIN { print "user,item,values,timestamp" } { gsub("\\t", ",", $0); print $0 }' data/ml-100k/u.data > data/ml-100k/ratings.csv

# Приводим файл с фильмами к формату CSV
!awk 'BEGIN { print "item,title,genres" } { gsub("\\|", ",", $0); print $0 }' data/ml-100k/u.item > data/ml-100k/movies.csv


--2025-01-18 10:24:39--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2025-01-18 10:24:40 (9.92 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: data/ml-100k/
  inflating: data/ml-100k/allbut.pl  
  inflating: data/ml-100k/mku.sh     
  inflating: data/ml-100k/README     
  inflating: data/ml-100k/u.data     
  inflating: data/ml-100k/u.genre    
  inflating: data/ml-100k/u.info     
  inflating: data/ml-100k/u.item     
  inflating: data/ml-100k/u.occupation  
  inflating: data/ml-100k/u.user     
  inflating: data/ml-100k/u1.base    
  inflating: data/ml-100k/u1.test    
  inflating: data/ml-100k/u2.base    
  inflating: data/ml-100k/u2.test    
  inflating: data/ml-1

In [None]:
ratings = pd.read_csv("data/ml-100k/ratings.csv")
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")
ratings.head()

Unnamed: 0,user,item,values,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


_____________________________________________________________________________


In [None]:
import numpy as np
from scipy.sparse import csr_matrix, eye
from scipy.sparse.linalg import spsolve
import pandas as pd
from scipy.sparse import coo_matrix

class iALSRecommender:
    def __init__(self, R, num_factors=10, lamb=0.1, iterations=10):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.num_factors = num_factors
        self.lamb = lamb
        self.iterations = iterations


        self.P = np.random.rand(self.num_users, self.num_factors)
        self.Q = np.random.rand(self.num_items, self.num_factors)

    def fit(self):
        """Fit the model using ALS."""
        for iteration in range(self.iterations):
            print(f"Iteration {iteration + 1}/{self.iterations}")

            # Фиксируем Q
            QTQ = self.Q.T.dot(self.Q)
            lambdaI = eye(self.num_factors) * self.lamb

            for u in range(self.num_users):
                non_zero_items = self.R[u, :].indices
                Q_u = self.Q[non_zero_items, :]
                R_u = self.R[u, non_zero_items].toarray()

                A = QTQ + Q_u.T.dot(Q_u) + lambdaI
                b = Q_u.T.dot(R_u.T).flatten()

                self.P[u, :] = spsolve(A, b)

            # Фиксируем P
            PTP = self.P.T.dot(self.P)
            lambdaI = eye(self.num_factors) * self.lamb

            for i in range(self.num_items):
                non_zero_users = self.R[:, i].indices
                P_i = self.P[non_zero_users, :]
                R_i = self.R[non_zero_users, i].toarray()

                A = PTP + P_i.T.dot(P_i) + lambdaI
                b = P_i.T.dot(R_i).flatten()

                self.Q[i, :] = spsolve(A, b)

    def predict(self):
        """Make predictions by computing the dot product of P and Q."""
        return self.P.dot(self.Q.T)

    def predict_user(self, uid):
        """Make predictions by computing the dot product of P and Q."""
        pred_ratings = self.P[uid] @ self.Q.T
        return np.argsort(pred_ratings)[::-1]

    def compute_rmse(self):
        """Compute the Root Mean Square Error (RMSE) on the known ratings."""
        predicted = self.predict()
        known_ratings = self.R.data
        predicted_ratings = predicted[self.R.nonzero()]
        mse = np.mean((known_ratings - predicted_ratings) ** 2)
        return np.sqrt(mse)



In [None]:
def train_test_split(ratings):
    train_data = []
    test_data = []

    for user, group in ratings.groupby('user'):
        group = group.sort_values('timestamp')
        train_data.append(group.iloc[:-1])
        test_data.append(group.iloc[-1])

    train_data = pd.concat(train_data)
    test_data = pd.DataFrame(test_data)

    return train_data, test_data

In [None]:
train_ratings, test_ratings = train_test_split(ratings)

In [None]:
R_train = coo_matrix((train_ratings['values'], (train_ratings['user'] - 1, train_ratings['item'] - 1))).tocsr()

In [None]:
model = iALSRecommender(R_train, num_factors=10, lamb=0.1, iterations=10)
model.fit()

predictions = model.predict()

hits = 0
for _, row in test_ratings.iterrows():
    user_id = row['user'] - 1
    true_item = row['item'] - 1

    user_predictions = predictions[user_id, :]
    top_10_items = np.argsort(user_predictions)[-10:][::-1]  # Топ-10

    if true_item in top_10_items:
        hits += 1

hit_rate = hits / len(test_ratings)
print(hit_rate)


Iteration 1/10


  self.P[u, :] = spsolve(A, b)
  self.Q[i, :] = spsolve(A, b)


Iteration 2/10
Iteration 3/10
Iteration 4/10
Iteration 5/10
Iteration 6/10
Iteration 7/10
Iteration 8/10
Iteration 9/10
Iteration 10/10
0.03817603393425239


In [None]:
from scipy.sparse.linalg import svds

In [None]:
ratings

Unnamed: 0,user,item,values,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16
...,...,...,...,...
99995,880,476,3,1997-11-22 05:10:44
99996,716,204,5,1997-11-17 19:39:03
99997,276,1090,1,1997-09-20 22:49:55
99998,13,225,2,1997-12-17 22:52:36


SVD

In [None]:
mtrx_df = ratings.pivot(index = 'user', columns ='item', values = 'values').fillna(0)

In [None]:
mtrx = mtrx_df.to_numpy()

In [None]:
U, sigma, Vt = svds(mtrx, k = 50)

In [None]:
sigma = np.diag(sigma)

In [None]:
all_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [None]:
hits = 0
for _, row in test_ratings.iterrows():
    user_id = row['user'] - 1
    true_item = row['item'] - 1

    user_predictions = all_predicted_ratings[user_id, :]
    top_10_items = np.argsort(user_predictions)[-10:][::-1]  # Топ-10

    if true_item in top_10_items:
        hits += 1

hit_rate = hits / len(test_ratings)
print(hit_rate)

0.09225874867444327
