# Average over users for each item

Idea: impute each missing rating with the average rating of that item, i.e. the mean over all users who rated the item.

In [2]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

## Load training data

In [3]:
# Dimensions of the rating matrix X: one row per user, one column per item.
USER_COUNT = 10_000
ITEM_COUNT = 1_000

In [6]:
# Read the training CSV into a list of (row, col, rating) triples, then
# scatter them into a dense USER_COUNT x ITEM_COUNT matrix.
ratings = []
with open('../data/data_train.csv') as file:
    file.readline()  # discard the header line
    for line in file:
        position, score = line.split(",")
        rating = int(score)
        user_part, item_part = position.split("_")
        # Each half of the key has a one-character prefix before its number;
        # the number is shifted down by one to obtain the matrix index.
        user_index = int(user_part[1:]) - 1
        item_index = int(item_part[1:]) - 1
        ratings.append((user_index, item_index, rating))

# Entries that never receive a rating keep their initial value of 0.
X = np.zeros((USER_COUNT, ITEM_COUNT))
for user_index, item_index, rating in ratings:
    X[user_index, item_index] = rating

print(X.shape)
print(X[:20, :20])

(10000, 1000)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 5. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 3. 0. 5. 0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 3. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 5. 0. 3. 0. 0. 0. 0. 0. 5. 2. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 0. 0. 0. 0. 4. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 3. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 3. 0. 0. 0. 5. 3. 0. 0. 0. 5. 0.]
 [0. 0. 0. 1. 0. 5. 0. 5. 0. 0. 0. 0. 0. 3. 0. 0. 0. 4. 0. 3.]
 [0. 0. 0. 0. 0. 0. 0. 5. 0. 4. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 3. 0. 0. 0. 0. 0. 0.]
 [0. 0. 4. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 5. 0. 3. 0. 0. 0. 0.]
 [0. 1. 0. 0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 3. 0. 0. 4. 0. 0. 0. 0. 3. 0. 5. 0. 5. 1. 0. 3. 0.]
 [0. 0. 3. 0. 0. 5. 4. 0. 0. 0. 0. 0. 0. 

## Impute missing entries (average rating per item)

In [15]:
# Impute every missing entry of X (encoded as 0) with the mean of the observed
# ratings of the same item (column). Observed ratings are left unchanged.
y = X.sum(axis=0)  # per-item sum of ratings; zeros (missing) contribute nothing
rating_counts = np.count_nonzero(X, axis=0)  # per-item number of observed ratings

# An item without any rating would give 0/0 = NaN for its whole column.
# Fall back to the mean over all observed ratings for such items
# (or 0.0 when the matrix holds no ratings at all).
total_count = rating_counts.sum()
global_average = y.sum() / total_count if total_count > 0 else 0.0

# Start from the fallback value and overwrite it wherever a per-item mean
# is defined; positions with `where == False` keep the fallback.
average_per_item = np.full(X.shape[1], global_average, dtype=float)
np.divide(y, rating_counts, out=average_per_item, where=rating_counts > 0)

# average_per_item has one value per column, so it broadcasts across all rows.
X_pred = np.where(X == 0, average_per_item, X)
print(X_pred[0:20, 0:20])

[[3.37941176 3.50094162 3.48358586 3.93647282 3.55913113 4.68276973
  3.44661922 3.87749706 2.90196078 5.         3.27668659 3.2
  3.47954545 4.4244857  3.14275668 3.43146603 2.65807068 2.40201729
  3.49333333 2.91961415]
 [3.37941176 3.50094162 3.48358586 3.         3.55913113 5.
  3.44661922 4.         2.90196078 3.6460717  3.27668659 3.2
  3.47954545 4.4244857  3.14275668 3.43146603 2.65807068 2.40201729
  3.49333333 2.91961415]
 [3.37941176 3.50094162 3.48358586 2.         3.55913113 4.68276973
  3.44661922 3.87749706 2.90196078 3.6460717  3.27668659 3.2
  3.47954545 4.4244857  3.14275668 3.43146603 2.65807068 2.40201729
  3.49333333 2.91961415]
 [3.37941176 3.50094162 3.48358586 3.93647282 3.55913113 4.68276973
  3.44661922 3.87749706 2.90196078 3.6460717  3.27668659 3.2
  3.47954545 3.         3.14275668 3.43146603 2.65807068 2.40201729
  3.49333333 2.91961415]
 [3.37941176 2.         3.48358586 3.93647282 3.55913113 5.
  3.44661922 3.         2.90196078 3.6460717  3.27668659 3.2