# Part 1 - Build the model

# Read in the data

In [1]:
import pandas as pd

ratings = pd.read_csv('ratings.csv', index_col=0)

# Inspect the data

In [2]:
ratings.fillna('')

Unnamed: 0_level_0,Bob,Jane,Alice,Lesley,Frank,Kate
Game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arkham Horror,5.0,8.0,,9.1,8.4,5.0
Carcassonne,10.0,8.0,7.0,7.6,,6.0
Cards Against Humanity,,7.0,5.0,,7.0,4.0
Pandemic Legacy: Season 1,,,,9.1,8.2,
Dominion,3.0,8.0,9.0,,4.3,8.0


In [3]:
ratings

Unnamed: 0_level_0,Bob,Jane,Alice,Lesley,Frank,Kate
Game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arkham Horror,5.0,8.0,,9.1,8.4,5.0
Carcassonne,10.0,8.0,7.0,7.6,,6.0
Cards Against Humanity,,7.0,5.0,,7.0,4.0
Pandemic Legacy: Season 1,,,,9.1,8.2,
Dominion,3.0,8.0,9.0,,4.3,8.0


# Calculating item similarities

## How we'll do this

### Similarity == Angle between ratings vectors
![](angle.png)
### Calculting the angle (cosine similarity)
![](similarity.png)
### Helpful to precalculate these
![](precalculate.png)

## Normalize the ratings

In [4]:
means = ratings.mean()
means

Bob       6.000
Jane      7.750
Alice     7.000
Lesley    8.600
Frank     6.975
Kate      5.750
dtype: float64

In [5]:
normalized = ratings - means
normalized

Unnamed: 0_level_0,Bob,Jane,Alice,Lesley,Frank,Kate
Game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arkham Horror,-1.0,0.25,,0.5,1.425,-0.75
Carcassonne,4.0,0.25,0.0,-1.0,,0.25
Cards Against Humanity,,-0.75,-2.0,,0.025,-1.75
Pandemic Legacy: Season 1,,,,0.5,1.225,
Dominion,-3.0,0.25,2.0,,-2.675,2.25


## Precompute item square root of sum of squares
![](precalculate.png)

In [6]:
normalized.pow(2)

Unnamed: 0_level_0,Bob,Jane,Alice,Lesley,Frank,Kate
Game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arkham Horror,1.0,0.0625,,0.25,2.030625,0.5625
Carcassonne,16.0,0.0625,0.0,1.0,,0.0625
Cards Against Humanity,,0.5625,4.0,,0.000625,3.0625
Pandemic Legacy: Season 1,,,,0.25,1.500625,
Dominion,9.0,0.0625,4.0,,7.155625,5.0625


In [7]:
normalized.pow(2).sum(axis=1) # sum columns instead of rows

Game
Arkham Horror                 3.905625
Carcassonne                  17.125000
Cards Against Humanity        7.625625
Pandemic Legacy: Season 1     1.750625
Dominion                     25.280625
dtype: float64

In [8]:
normalized.pow(2).sum(axis=1).pow(0.5)

Game
Arkham Horror                1.976265
Carcassonne                  4.138236
Cards Against Humanity       2.761453
Pandemic Legacy: Season 1    1.323112
Dominion                     5.027984
dtype: float64

In [9]:
sqrt_sum_squares = normalized.pow(2).sum(axis=1).pow(0.5)
rendering = normalized.copy()
rendering['Sqrt Sum Squares'] = sqrt_sum_squares
rendering

Unnamed: 0_level_0,Bob,Jane,Alice,Lesley,Frank,Kate,Sqrt Sum Squares
Game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Arkham Horror,-1.0,0.25,,0.5,1.425,-0.75,1.976265
Carcassonne,4.0,0.25,0.0,-1.0,,0.25,4.138236
Cards Against Humanity,,-0.75,-2.0,,0.025,-1.75,2.761453
Pandemic Legacy: Season 1,,,,0.5,1.225,,1.323112
Dominion,-3.0,0.25,2.0,,-2.675,2.25,5.027984


## Calculate similarities

In [10]:
similarity = pd.DataFrame(index = ratings.index.values, columns = ratings.index.values) # blank matrix
similarity

Unnamed: 0,Arkham Horror,Carcassonne,Cards Against Humanity,Pandemic Legacy: Season 1,Dominion
Arkham Horror,,,,,
Carcassonne,,,,,
Cards Against Humanity,,,,,
Pandemic Legacy: Season 1,,,,,
Dominion,,,,,


![](similarity.png)

In [24]:
for i in similarity.index.values:
    for j in similarity.index.values:
        numerator = (normalized.loc[i] * normalized.loc[j]).sum()
        denominator = sqrt_sum_squares.loc[i] * sqrt_sum_squares.loc[j]
        similarity.loc[i,j] = numerator / denominator
        
similarity

Unnamed: 0,Arkham Horror,Carcassonne,Cards Against Humanity,Pandemic Legacy: Season 1,Dominion
Arkham Horror,1.0,-0.565524,0.212671,0.763198,-0.245242
Carcassonne,-0.565524,1.0,-0.0546924,-0.0913184,-0.546691
Cards Against Humanity,0.212671,-0.0546924,1.0,0.00838189,-0.59
Pandemic Legacy: Season 1,0.763198,-0.0913184,0.00838189,1.0,-0.492572
Dominion,-0.245242,-0.546691,-0.59,-0.492572,1.0


# Part 2 - Apply the model


# Recommend similar items

In [12]:
similarity.sort_values('Arkham Horror', ascending=False)

Unnamed: 0,Arkham Horror,Carcassonne,Cards Against Humanity,Pandemic Legacy: Season 1,Dominion
Arkham Horror,1.0,-0.565524,0.212671,0.763198,-0.245242
Pandemic Legacy: Season 1,0.763198,-0.0913184,0.00838189,1.0,-0.492572
Cards Against Humanity,0.212671,-0.0546924,1.0,0.00838189,-0.59
Dominion,-0.245242,-0.546691,-0.59,-0.492572,1.0
Carcassonne,-0.565524,1.0,-0.0546924,-0.0913184,-0.546691


In [13]:
def most_similar_to(item):
    ranked = similarity[item].sort_values(ascending=False)
    return (ranked.index[1], ranked[1])
    
most_similar_to('Arkham Horror')   

('Pandemic Legacy: Season 1', 0.76319778593577769)

# Predict user ratings
![](predict.png)
i.e. predict the rating for item j based on the ratings given to other items (i) and their ratings.

We'll first calculate, by hand, Bob's rating for Cards Against Humanity.

### Calculate the numerator
![](numerator.png)

In [14]:
normalized

Unnamed: 0_level_0,Bob,Jane,Alice,Lesley,Frank,Kate
Game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arkham Horror,-1.0,0.25,,0.5,1.425,-0.75
Carcassonne,4.0,0.25,0.0,-1.0,,0.25
Cards Against Humanity,,-0.75,-2.0,,0.025,-1.75
Pandemic Legacy: Season 1,,,,0.5,1.225,
Dominion,-3.0,0.25,2.0,,-2.675,2.25


In [15]:
normalized['Bob'] * similarity.loc['Cards Against Humanity']

Game
Arkham Horror               -0.212671
Carcassonne                  -0.21877
Cards Against Humanity            NaN
Pandemic Legacy: Season 1         NaN
Dominion                         1.77
dtype: object

### Calculate the denominator
![](denominator.png)

In [16]:
normalized['Bob'].abs()

Game
Arkham Horror                1.0
Carcassonne                  4.0
Cards Against Humanity       NaN
Pandemic Legacy: Season 1    NaN
Dominion                     3.0
Name: Bob, dtype: float64

### Calculate the predicted normalized rating

In [17]:
numerator =(normalized['Bob'] * similarity.loc['Cards Against Humanity']).sum()
denominator = normalized['Bob'].abs().sum()

numerator / denominator

0.16731972021500008

### Calculated the predicted rating

In [18]:
(numerator / denominator) + means['Bob']

6.1673197202149996

### Predict for any user + item

In [19]:
def predict_rating_for(user, item):
    return (ratings[user] * similarity.loc[item]).sum() / ratings[user].abs().sum() + means[user]
    
predict_rating_for('Bob', 'Cards Against Humanity')

5.9303574377218036

# Congratulations! That's all the theory you need to get started!