# Average over users for each item

Idea: impute each item's missing ratings with the average of the ratings that users have given to that item.

In [1]:
import numpy as np

from data_handler import load_train_data, write_submission, get_prediction_ratings_from_matrix

## Load training data

In [2]:
# Load the ratings matrix; the cells below treat each column as one item and
# an entry of 0 as "no rating available".
X = load_train_data()

# Preview the top-left 10x10 corner as a quick sanity check.
print(X[:10, :10])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 5.]
 [0. 0. 0. 3. 0. 5. 0. 4. 0. 0.]
 [0. 0. 0. 2. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 2. 0. 0. 0. 5. 0. 3. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 5. 0. 0. 0. 3.]
 [0. 0. 0. 1. 0. 5. 0. 5. 0. 0.]]


## For each item, predict its missing entries by the average of that item's available ratings

In [3]:
# y is a vector containing the sum of each item's available ratings
# (missing ratings are stored as 0, so they do not contribute to the sum).
y = X.sum(axis=0)
# Number of available (non-zero) ratings per item.
rating_counts = np.count_nonzero(X, axis=0)

# Mean of all available ratings; used as the fallback prediction for items
# that have no ratings at all, where a per-item average is undefined.
total_ratings = rating_counts.sum()
global_average = y.sum(dtype=np.float64) / total_ratings if total_ratings > 0 else 0.0

# Per-item average. The division is only performed where an item has at least
# one rating, so unrated items keep the fallback instead of becoming NaN
# (0/0), which would otherwise be written into every entry of that column.
average_per_item = np.full(X.shape[1], global_average, dtype=np.float64)
np.divide(y, rating_counts, out=average_per_item, where=rating_counts > 0)

# Fill every missing entry with its item's average in one masked assignment;
# available ratings are left untouched and X_pred keeps the dtype of X.
X_pred = np.copy(X)
no_rating = X == 0
X_pred[no_rating] = np.broadcast_to(average_per_item, X.shape)[no_rating]
print(X_pred[0:10, 0:10])

[[3.3794117 3.5009415 3.4835858 3.936473  3.5591311 4.68277   3.4466193
  3.877497  2.9019608 5.       ]
 [3.3794117 3.5009415 3.4835858 3.        3.5591311 5.        3.4466193
  4.        2.9019608 3.6460717]
 [3.3794117 3.5009415 3.4835858 2.        3.5591311 4.68277   3.4466193
  3.877497  2.9019608 3.6460717]
 [3.3794117 3.5009415 3.4835858 3.936473  3.5591311 4.68277   3.4466193
  3.877497  2.9019608 3.6460717]
 [3.3794117 2.        3.4835858 3.936473  3.5591311 5.        3.4466193
  3.        2.9019608 3.6460717]
 [3.3794117 3.5009415 3.4835858 3.936473  3.5591311 5.        3.4466193
  3.877497  2.9019608 3.6460717]
 [3.3794117 3.5009415 3.4835858 3.936473  3.5591311 4.68277   3.4466193
  3.877497  2.9019608 3.6460717]
 [3.3794117 3.5009415 3.4835858 3.936473  3.5591311 4.68277   3.4466193
  3.877497  2.9019608 3.6460717]
 [3.3794117 3.5009415 3.4835858 3.936473  3.5591311 5.        3.4466193
  3.877497  2.9019608 3.       ]
 [3.3794117 3.5009415 3.4835858 1.        3.5591311 5. 

## Output submission file 

In [4]:
# Convert the imputed matrix into the rating list expected by the submission
# writer, then write it to disk.
submission_path = 'submission_average_over_users_0.csv'
ratings = get_prediction_ratings_from_matrix(X_pred)
write_submission(ratings, submission_path)