In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

# There's a lot of columns in the DF. 
# Therefore, we add this option so that we can see more columns
pd.options.display.max_columns = 100

## Load the data

In [2]:
df = pd.read_csv('../data/data_train.csv')

In [3]:
df.head()

Unnamed: 0,Id,Prediction
0,r44_c1,4
1,r61_c1,3
2,r67_c1,4
3,r72_c1,3
4,r86_c1,5


In [4]:
df['people'] = df['Id'].apply(lambda x: int(x.split('_')[0][1:]))
df['movie'] = df['Id'].apply(lambda x: int(x.split('_')[1][1:]))

In [5]:
df.head()

Unnamed: 0,Id,Prediction,people,movie
0,r44_c1,4,44,1
1,r61_c1,3,61,1
2,r67_c1,4,67,1
3,r72_c1,3,72,1
4,r86_c1,5,86,1


In [6]:
df = df.drop('Id', axis=1)

In [7]:
peoples = df['people'].unique()
peoples.sort()

In [8]:
print(peoples)

[    1     2     3 ...,  9998  9999 10000]


In [9]:
movies = df['movie'].unique()
movies.sort()

In [10]:
print(movies)

[   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15
   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30
   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45
   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60
   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75
   76   77   78   79   80   81   82   83   84   85   86   87   88   89   90
   91   92   93   94   95   96   97   98   99  100  101  102  103  104  105
  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120
  121  122  123  124  125  126  127  128  129  130  131  132  133  134  135
  136  137  138  139  140  141  142  143  144  145  146  147  148  149  150
  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165
  166  167  168  169  170  171  172  173  174  175  176  177  178  179  180
  181  182  183  184  185  186  187  188  189  190  191  192  193  194  195
  196  197  

We see here that we have 10000 persons and 1000 movies. Let's create the big matrix! =)

In [11]:
predictions = -1*np.ones((len(movies), len(peoples)))
predictions.shape

(1000, 10000)

In [12]:
for i in range(len(peoples)):
    if (i+1)%1000 == 0:
        print("%i/%i"%(i+1, len(peoples)))
    df_people = df[df['people'] == i+1]['Prediction']
    df_people.index = df[df['people'] == i+1]['movie']-1
    predictions[df_people.index, i] = df_people

1000/10000
2000/10000
3000/10000
4000/10000
5000/10000
6000/10000
7000/10000
8000/10000
9000/10000
10000/10000


In [13]:
df_mat = pd.DataFrame(predictions)

In [14]:
df_mat.shape

(1000, 10000)

In [15]:
df_mat[df_mat == -1] = None

In [16]:
df_mat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,9950,9951,9952,9953,9954,9955,9956,9957,9958,9959,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,...,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,
1,,,,,2.0,,,,,,,,,1.0,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,3.0,,,4.0,,,,,,,...,,,,,,,,,,,,,,,,,,3.0,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,4.0,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,3.0,,,,...,5.0,,4.0,,,,,,,,,,3.0,4.0,4.0,,,,,3.0,,,5.0,,,3.0,,2.0,5.0,,,,,,,,,,,,,,4.0,,,,,,,
3,,3.0,2.0,,,,,,,1.0,,,,,3.0,,,5.0,4.0,3.0,,,3.0,,,,5.0,5.0,,5.0,,,,,,5.0,5.0,,,,,,,5.0,,5.0,4.0,,,3.0,...,3.0,5.0,,,3.0,,,,,3.0,,5.0,5.0,5.0,2.0,4.0,,,,,5.0,5.0,,,,2.0,,2.0,,2.0,4.0,,,,,,5.0,5.0,,4.0,2.0,,2.0,,3.0,,,,5.0,
4,,,,,,,,,,,,,,4.0,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,3.0,,,5.0,,,3.0,,,,...,,,,,,,,,,,3.0,,,,,,,,,,,,,,,3.0,,3.0,,,,,,,2.0,,,,,,,,,,,,,,,


In [17]:
mean = np.zeros(df_mat.shape[0])

for i in range(df_mat.shape[0]):
    mean[i] = df_mat.loc[i,:].mean()

In [18]:
df_mat['mean'] = mean

In [19]:
df_mat['mean'].describe()

count    1000.000000
mean        3.603684
std         0.453853
min         2.022222
25%         3.312477
50%         3.621116
75%         3.915129
max         4.726254
Name: mean, dtype: float64

In [20]:
df_mat['mean'][2]

3.4835858585858586

In [42]:
mean_users = np.zeros(df_mat.shape[1])
for i in range(df_mat.shape[1]):
    if i==10000:
        mean_users[i] = df_mat.loc[:,'mean'].mean() 
    else:
        mean_users[i] = df_mat.loc[:,i].mean()

In [45]:
df_mat.loc['mean', :] = mean_users

In [46]:
df_mat.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,9951,9952,9953,9954,9955,9956,9957,9958,9959,9960,9961,9962,9963,9964,9965,9966,9967,9968,9969,9970,9971,9972,9973,9974,9975,9976,9977,9978,9979,9980,9981,9982,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999,mean
996,,,,,,,,,,4.0,,,,4.0,3.0,,,,,,,3.0,,,,,,,,,,,,,,,3.0,2.0,,,,,,,,,,,,,...,,,,,,,,,4.0,,,,3.0,,,,2.0,,,,,2.0,4.0,,,,,,,,,,,,,,,,,,,3.0,3.0,,4.0,3.0,,2.0,3.0,3.435374
997,,5.0,,,,,,,,,,,,2.0,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,3.0,1.0,,,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,3.239407
998,,3.0,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,...,,,,,,,,,3.0,,,,,,4.0,,,,5.0,,,2.0,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,3.353982
999,,3.0,,,,1.0,,,,,3.0,,,4.0,,,,,,,,4.0,,,,,,,,,,,,,,,,5.0,,,3.0,,,,,,4.0,,,4.0,...,,,,,,3.0,4.0,,3.0,4.0,,,3.0,,,,5.0,,5.0,,,,,,5.0,,,,,,,,,,,,,,4.0,,5.0,,3.0,,,4.0,,,3.0,3.682306
mean,4.043478,3.771812,3.523256,3.8,3.985075,4.366864,4.129032,4.115385,3.97191,3.82243,3.666667,4.0875,3.671053,3.911392,3.783505,4.027397,3.926316,3.837209,3.565217,3.717391,3.851064,3.924528,3.697917,3.56338,3.433962,3.764706,3.87234,4.227273,3.954545,3.789474,4.317881,3.822581,3.9,3.93,4.22449,3.930233,3.757732,2.824074,3.973154,3.60101,3.788571,4.171429,4.155556,3.893701,4.0,3.575,3.963636,3.413043,3.9,3.740741,...,3.938272,3.597222,3.458065,4.146341,3.674419,3.427273,3.96,3.826446,4.0199,3.883871,3.851064,3.967391,4.01992,3.924051,3.568627,4.589041,3.800948,3.745455,4.04918,3.896996,4.127907,3.617647,3.806452,3.470588,4.036145,3.881356,3.263158,3.634409,3.892857,3.803571,3.324324,4.163934,4.048387,3.550459,4.081081,3.657895,4.059406,3.946429,3.873239,3.495495,4.030303,2.848837,3.936047,3.454545,4.081967,3.48062,3.560976,3.837209,3.973684,3.603684


## Load the sample submission
!
First submission will be with the mean of the movies as predictions

In [21]:
df_sub = pd.read_csv('../data/sampleSubmission.csv')

In [22]:
df_sub['movie'] = df_sub['Id'].apply(lambda x: int(x.split('_')[1][1:]))

In [23]:
df_sub.head()

Unnamed: 0,Id,Prediction,movie
0,r37_c1,3,1
1,r73_c1,3,1
2,r156_c1,3,1
3,r160_c1,3,1
4,r248_c1,3,1


Put the mean of the movies rating as the prediction.. =)

In [24]:
mean_movies = df_mat['mean'][df_sub['movie']-1]
mean_movies.index = np.arange(df_sub.shape[0])

In [25]:
df_sub['Prediction'] = mean_movies

In [26]:
df_sub = df_sub.drop('movie', axis=1)

In [27]:
df_sub.to_csv('mean_pred_movies.csv', index=False)

## Reload the sample submission

Second submission will be with the mean of the users as predictions

In [28]:
df_sub = pd.read_csv('../data/sampleSubmission.csv')

In [29]:
df_sub['user'] = df_sub['Id'].apply(lambda x: int(x.split('_')[0][1:]))

In [49]:
df_sub.head()

Unnamed: 0,Id,Prediction,user
0,r37_c1,3,37
1,r73_c1,3,73
2,r156_c1,3,156
3,r160_c1,3,160
4,r248_c1,3,248


Put the mean of the users rating as the prediction.. =)

In [53]:
mean_users = df_mat.loc['mean',:][df_sub['user']-1]
mean_users.index = np.arange(df_sub.shape[0])

In [54]:
df_sub['Prediction'] = mean_users

In [56]:
df_sub = df_sub.drop('user', axis=1)

In [57]:
df_sub.to_csv('mean_pred_user.csv', index=False)