<a href="https://colab.research.google.com/github/chandan9t8/UnivProjects/blob/main/Matrix%20Factorization/MF_GradDesc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Print the runtime's current working directory via a shell escape.
!pwd

/content


In [None]:
# Mount Google Drive at /content/drive so the data files read by later cells
# are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the ratings table from the mounted Drive. Later cells read its
# 'user-id', 'movie-id' and 'recommendation-score' columns.
dataset_path = '/content/drive/MyDrive/Data/ratings.csv'
ratings_data = pd.read_csv(dataset_path)

In [None]:
# Hold out 20% of the rating rows, with a fixed seed so the split is repeatable.
# NOTE(review): no later cell visible in this notebook references train_data or
# validation_data; the factorization is fit on a matrix built from the full
# ratings_data. Confirm whether the split was meant to be used for evaluation.
train_data, validation_data = train_test_split(ratings_data, test_size=0.2, random_state=42)

In [None]:
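# Distribution of rating values (currently disabled; the output kept below
# presumably comes from the last time this line was run):
# ratings_data['recommendation-score'].value_counts()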

4.0    19427
3.0    12899
5.0     9631
3.5     7209
4.5     5237
2.0     4732
2.5     2921
1.0     1984
1.5     1098
0.5      579
Name: recommendation-score, dtype: int64

In [None]:
# Reshape the long ratings table into a wide matrix: one row per user-id, one
# column per movie-id, cell value = recommendation-score. User/movie pairs
# without a rating row come out as NaN.
user_item_matrix = ratings_data.pivot_table(
    index='user-id',
    columns='movie-id',
    values='recommendation-score',
)

In [None]:
def matrix_factorization_gradient_descent(ratings_data, num_factors, learning_rate, epochs, regularization_rate, plot=True, random_state=None):
  """Factorize a user x item rating matrix into two low-rank factor matrices.

  Runs stochastic gradient descent over the observed ratings, minimizing
  ``(r_ij - U[i].V[j])**2 + regularization_rate * (|U[i]|**2 + |V[j]|**2)``
  for every observed cell ``(i, j)``.

  Parameters
  ----------
  ratings_data : 2-D array-like (e.g. DataFrame or ndarray)
      Rows are users, columns are items. Only cells with a value > 0 are
      treated as observed; zeros and NaNs are skipped.
  num_factors : int
      Number of latent factors per user / item.
  learning_rate : float
      Step size of each gradient update.
  epochs : int
      Number of full passes over the observed ratings.
  regularization_rate : float
      L2 penalty weight applied to the factor vectors.
  plot : bool, default True
      When True, draw the cost-per-epoch curve with matplotlib after training.
  random_state : int or None, default None
      Seed for the factor initialization. ``None`` draws from NumPy's global
      random state, as before.

  Returns
  -------
  U : ndarray of shape (num_users, num_factors)
  V : ndarray of shape (num_items, num_factors)
  cost_values : list
      One accumulated cost per epoch. Each observation contributes its squared
      error (measured before its update) plus the L2 penalty of the two factor
      vectors it touched (measured after the update).

  Raises
  ------
  ValueError
      If ``ratings_data`` is not two-dimensional.
  """
  # Convert once: per-cell ``DataFrame.iloc`` lookups inside the training loop
  # are far slower than plain ndarray indexing.
  ratings = np.asarray(ratings_data, dtype=float)
  if ratings.ndim != 2:
    raise ValueError("ratings_data must be a 2-D user x item matrix")
  num_users, num_items = ratings.shape

  rng = np.random if random_state is None else np.random.RandomState(random_state)
  U = 0.01 * rng.rand(num_users, num_factors)
  V = 0.01 * rng.rand(num_items, num_factors)

  # Coordinates of the observed ratings, in row-major order (user by user,
  # then item by item). Computed once instead of rescanning every cell in
  # every epoch.
  observed_users, observed_items = np.nonzero(ratings > 0)
  observed_ratings = ratings[observed_users, observed_items]

  cost_values = []

  for epoch in range(epochs):

    total_error = 0

    for i, j, rating in zip(observed_users, observed_items, observed_ratings):

      # Snapshot the user vector so both updates use the gradient at the
      # same point; V must not see the freshly updated U.
      user_vec = U[i].copy()
      error = rating - user_vec @ V[j]

      U[i] += 2 * learning_rate * (error * V[j] - regularization_rate * user_vec)
      V[j] += 2 * learning_rate * (error * user_vec - regularization_rate * V[j])

      # The penalty covers every latent factor, not only the last one.
      total_error += error**2 + regularization_rate * (U[i] @ U[i] + V[j] @ V[j])

    cost_values.append(total_error)

    if epoch % 10 == 0:
      print(f"Epoch {epoch}/{epochs}, Cost: {total_error}")

  if plot:
    # Plot convergence
    plt.plot(cost_values)
    plt.xlabel('Epoch')
    plt.ylabel('Cost')
    plt.title('Convergence of Cost Function')
    plt.show()

  return U, V, cost_values

In [None]:
# Hyper-parameters passed to matrix_factorization_gradient_descent
num_factors = 10             # latent factors per user / movie
learning_rate = 0.01         # gradient step size
epochs = 500                 # passes over the observed ratings
regularization_rate = 0.05   # L2 penalty weight on the factor vectors

In [None]:
# Inspect the user x movie matrix built by the pivot above.
user_item_matrix

movie-id,0,1,2,3,4,5,6,7,8,9,...,2946,2947,2948,2949,2950,2951,2952,2953,2954,2955
user-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.5,,,,,2.5,2.0,,,3.0,...,,4.5,,4.0,,3.5,2.5,,4.5,
1,4.0,4.0,3.0,,3.0,1.0,3.0,,3.0,3.0,...,,,,,,,,,,
2,,3.0,3.0,,3.0,,,,4.0,,...,1.5,4.0,,,2.0,,,,,
3,2.0,2.0,,,4.5,4.0,,,3.0,2.5,...,3.0,3.5,,1.0,3.0,2.5,3.0,3.5,3.5,3.0
4,5.0,2.5,,,,4.5,,,3.0,,...,,4.5,,,,4.0,3.5,,3.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,,,,,,,,,,,...,,,,,,,,,,
302,4.0,,,,,,,,,,...,,,,,,,,,,
303,5.0,3.5,,,5.0,,,,,,...,,,,,,,,,,
304,3.0,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Fit the factor matrices on the full rating matrix. Missing ratings are filled
# with 0 because the training routine only visits cells whose value is > 0.
U, V, cost_values = matrix_factorization_gradient_descent(
    ratings_data=user_item_matrix.fillna(0),
    num_factors=num_factors,
    learning_rate=learning_rate,
    epochs=epochs,
    regularization_rate=regularization_rate,
)

Epoch 0/500, Cost: 303952.8143761662
Epoch 10/500, Cost: 50071.22002094089
Epoch 20/500, Cost: 40760.0564487793
Epoch 30/500, Cost: 32787.29069103156
Epoch 40/500, Cost: 29695.024922280187
Epoch 50/500, Cost: 28390.159200428483
Epoch 60/500, Cost: 27680.94930880234
Epoch 70/500, Cost: 27234.545016848475
Epoch 80/500, Cost: 26927.14170493117
Epoch 90/500, Cost: 26702.624686634226
Epoch 100/500, Cost: 26531.801166835285
Epoch 110/500, Cost: 26397.859019377986
Epoch 120/500, Cost: 26290.336181683888
Epoch 130/500, Cost: 26202.33167496022
Epoch 140/500, Cost: 26129.098056818326
Epoch 150/500, Cost: 26067.269055382007
Epoch 160/500, Cost: 26014.401650887678
Epoch 170/500, Cost: 25968.68794369347
Epoch 180/500, Cost: 25928.766512561633
Epoch 190/500, Cost: 25893.59557969807
Epoch 200/500, Cost: 25862.365983269752
Epoch 210/500, Cost: 25834.4404126509
Epoch 220/500, Cost: 25809.310318194013
Epoch 230/500, Cost: 25786.564937181327
Epoch 240/500, Cost: 25765.868763730585
Epoch 250/500, Cost: 25

In [None]:
# Load the rows to score; the prediction cell reads their 'user-id' and
# 'movie-id' columns.
test_data = pd.read_csv('/content/drive/MyDrive/Data/test.csv')

In [None]:
# prediction
#
# Score every (user-id, movie-id) row of test_data as the dot product of the
# matching factor vectors, snapped to the nearest half point and limited to the
# 0.5-5.0 range.
#
# NOTE(review): the ids are used directly as row positions in U and V. That is
# only correct if position k of user_item_matrix's index/columns is id k, and
# every test id was seen in training; confirm against the data.
user_ids = test_data['user-id'].astype(int).to_numpy()
movie_ids = test_data['movie-id'].astype(int).to_numpy()

# Row-wise dot products for all test rows at once (replaces the iterrows loop).
raw_scores = np.sum(U[user_ids] * V[movie_ids], axis=1)

# Round to the nearest 0.5, then clip. Using float bounds keeps every entry a
# float, so a clipped maximum is written out as "5.0" rather than "5".
half_step_scores = np.round(raw_scores * 2) / 2
predicted_scores = np.clip(half_step_scores, 0.5, 5.0).tolist()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(19, 635)
(62, 2004)
(21, 1274)
(46, 32)
(32, 105)
(11, 2269)
(62, 91)
(102, 2424)
(48, 416)
(4, 2932)
(202, 804)
(244, 822)
(3, 2321)
(110, 405)
(226, 2049)
(110, 879)
(32, 1329)
(160, 890)
(35, 1463)
(14, 706)
(82, 49)
(223, 5)
(87, 680)
(106, 2004)
(14, 32)
(53, 1438)
(136, 2762)
(140, 230)
(114, 946)
(91, 2112)
(183, 1770)
(150, 668)
(188, 84)
(40, 2062)
(22, 678)
(13, 781)
(7, 2347)
(16, 1687)
(165, 708)
(12, 754)
(122, 1529)
(22, 1671)
(193, 585)
(206, 2149)
(46, 91)
(177, 293)
(240, 213)
(9, 1307)
(221, 232)
(45, 2379)
(299, 2366)
(4, 1763)
(24, 2132)
(206, 596)
(179, 2901)
(60, 594)
(59, 129)
(40, 821)
(76, 1437)
(18, 967)
(175, 2349)
(60, 1867)
(112, 458)
(243, 1482)
(171, 164)
(73, 822)
(63, 1830)
(77, 403)
(3, 1925)
(4, 2753)
(18, 816)
(14, 1466)
(142, 821)
(138, 1809)
(98, 628)
(93, 2257)
(31, 367)
(51, 363)
(156, 1517)
(8, 319)
(103, 2064)
(8, 261)
(1, 1229)
(226, 2638)
(14, 2474)
(94, 1784)
(7, 2808)
(5, 239

In [None]:
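# Tally how often each predicted score occurs (currently disabled). The counts
# kept below include 5.5, which the clipping in the prediction cell rules out,
# so they presumably come from an earlier version of that cell.
# from collections import Counter

# item_counts = Counter(predicted_scores)

# for item, count in item_counts.items():
#     print(f"{item}: {count} times")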


4.0: 1811 times
3.5: 2301 times
3.0: 1583 times
4.5: 573 times
2.5: 713 times
2.0: 213 times
5.0: 64 times
1.5: 39 times
1.0: 3 times
5.5: 2 times


In [None]:
# Write one predicted score per line to the output file on Drive.
#
# This writes the file directly instead of going through `%%capture c`: the
# capture variable is only bound after the captured cell finishes, so reading
# `c.stdout` inside that same cell fails on a fresh kernel (after the file has
# already been truncated) or silently writes the previous run's output.
with open('/content/drive/MyDrive/Data/graddesc.txt', 'w') as f:
  f.writelines(f"{score}\n" for score in predicted_scores)