# **Matrix Factorization**

## Loading The Data

In [1]:
# Column labels handed to pd.read_csv when the ratings file is loaded.
names = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]

In [9]:
# Read the tab-separated ratings file, labelling its columns with `names`,
# then preview the first rows.
df = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=names,
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [10]:
# Count the distinct user ids and item ids present in the ratings table.
n_users = len(df['user_id'].unique())
n_items = len(df['item_id'].unique())

print(f'Number of:\n -Users: {n_users}\n -Items: {n_items}')

Number of:
 -Users: 943
 -Items: 1682


## Feature Engineering

Creating the user-item matrix

In [11]:
# Dense user-item matrix (one row per user, one column per item), initialised to zeros.
M = np.zeros(shape=(n_users, n_items))

We subtract 1 from the user and item IDs because they start at 1, whereas NumPy array indices start at 0, so the shift is needed to match each ID with its array index.

In [12]:
# Columns are read by position: 0 = user id, 1 = item id, 2 = rating.
for record in df.itertuples(index=False):
    user_idx = record[0] - 1  # ids start at 1, array indices start at 0
    item_idx = record[1] - 1
    M[user_idx, item_idx] = record[2]

In [13]:
# Display the populated user-item matrix (NumPy abbreviates the printout of large arrays).
M

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [14]:
# Percentage of matrix cells holding a non-zero entry.
# Note: this is the fill rate of the matrix, reported under the label "Sparsity".
n_filled = len(M.nonzero()[0])
n_cells = M.shape[0] * M.shape[1]
sparsity = n_filled / n_cells * 100

print(f'Sparsity: {sparsity:.2f}%')

Sparsity: 6.30%


## Creating The Train And Test Datasets

In [16]:
# The hold-out ratings are drawn with np.random, so fix the global seed first:
# without it every kernel restart produces a different train/test split and
# the downstream error figures cannot be reproduced.
np.random.seed(42)
train, test = train_test_split(M)

## **Functions**

In [17]:
def get_mean_squared_error(pred, actual):
    """Return the mean squared error over the entries where `actual` is non-zero.

    Positions where `actual` is zero are left out of the comparison; only
    the non-zero positions of `actual` are scored.

    Parameters
    ----------
    pred : array of predicted values, indexed at the non-zero positions of `actual`.
    actual : array of observed values.

    Returns
    -------
    The result of `mean_squared_error` on the selected entries.
    """
    # Select the same positions (the non-zero entries of `actual`) from both arrays.
    observed = actual.nonzero()
    pred_observed = pred[observed].flatten()
    actual_observed = actual[observed].flatten()

    return mean_squared_error(pred_observed, actual_observed)
    

In [15]:
def train_test_split(M, test_size=10, seed=None):
    """Split a user-item matrix into train and test matrices of the same shape.

    For every row (user), `test_size` of the non-zero entries are chosen at
    random without replacement. Those entries are set to 0 in the train
    matrix and copied into the otherwise all-zero test matrix, so the two
    matrices never hold a value at the same position.

    Parameters
    ----------
    M : 2-D numpy array; non-zero entries are the ones eligible for hold-out.
        It is not modified.
    test_size : int, default 10
        Number of non-zero entries moved to the test matrix for each row.
    seed : int or None, default None
        When None the global `np.random` state is used, so a prior
        `np.random.seed(...)` call still controls the split. When given, a
        private `np.random.RandomState(seed)` is used instead.

    Returns
    -------
    (train, test) : tuple of 2-D numpy arrays with the same shape as `M`.

    Raises
    ------
    ValueError
        If any row has fewer than `test_size` non-zero entries.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    train = M.copy()
    test = np.zeros(M.shape)

    for user in range(M.shape[0]):
        rated_items = M[user, :].nonzero()[0]
        if rated_items.size < test_size:
            raise ValueError(
                f'User row {user} has only {rated_items.size} ratings; '
                f'cannot hold out {test_size}.'
            )

        test_ratings = rng.choice(rated_items, size=test_size, replace=False)

        train[user, test_ratings] = 0
        test[user, test_ratings] = M[user, test_ratings]

    # Checked once, after ALL users are processed: train and test must be disjoint.
    # (The return used to sit inside the loop, so only the first user was split.)
    assert np.all((train * test) == 0)

    return train, test

## Imports 

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
import zipfile

In [6]:
from sklearn.metrics import mean_squared_error

In [7]:
import matplotlib.pyplot as plt

In [8]:
import seaborn as sns