In [41]:
#imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity
from scipy import sparse

This notebook builds a small worked example (sample ratings, a movie-by-user pivot table, and a cosine-distance matrix) to be used in my web app / executive summary (README).

In [88]:
# Toy ratings data: every user rates the same four films, in the same order.
titles = ['Mulan', 'Frozen', 'The Shining', 'Batman Returns']
ratings_by_user = {
    'user_1': [5, 5, 0, 3],
    'user_2': [0, 1, 5, 4],
    'user_3': [4, 5, 1, 1],
}

# Flatten to long format: one (movie, user, rating) triple per row, grouped by user.
movies = titles * len(ratings_by_user)
users = [user for user in ratings_by_user for _ in titles]
ratings = [score for scores in ratings_by_user.values() for score in scores]

test_df = pd.DataFrame({'Movie': movies, 'User': users, 'Rating': ratings})
test_df

Unnamed: 0,Movie,User,Rating
0,Mulan,user_1,5
1,Frozen,user_1,5
2,The Shining,user_1,0
3,Batman Returns,user_1,3
4,Mulan,user_2,0
5,Frozen,user_2,1
6,The Shining,user_2,5
7,Batman Returns,user_2,4
8,Mulan,user_3,4
9,Frozen,user_3,5


In [89]:
# Reshape to one row per movie and one column per user, so each row is that
# movie's rating vector. pivot_table aggregates with the mean by default.
pivot = test_df.pivot_table(values='Rating', index='Movie', columns='User')
pivot

User,user_1,user_2,user_3
Movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Batman Returns,3,4,1
Frozen,5,1,5
Mulan,5,0,4
The Shining,0,5,1


The movie "vectors" (one rating per user, in the order user_1, user_2, user_3) are:  
- Batman Returns [3,4,1]
- Frozen [5,1,5]
- Mulan [5,0,4]
- The Shining [0,5,1]

In [102]:
# Cosine distance between every pair of movie rating vectors (the rows of `pivot`).
dists = pairwise_distances(pivot, metric='cosine')

# Take the labels from the pivot's own index instead of a hand-typed list, so they
# always match the row order `dists` was computed in, even if the sample data changes.
# A plain list (rather than the Index object) keeps both axes unnamed, as before.
movie_labels = pivot.index.tolist()
recommender_df = pd.DataFrame(data=dists, index=movie_labels, columns=movie_labels)

# Show the matrix to 2 decimals; `recommender_df` itself keeps full precision here.
recommender_df.round(2)

Unnamed: 0,Batman Returns,Frozen,Mulan,The Shining
Batman Returns,0.0,0.34,0.42,0.19
Frozen,0.34,0.0,0.02,0.73
Mulan,0.42,0.02,0.0,0.88
The Shining,0.19,0.73,0.88,0.0


The greater the cosine distance between two items, the more dissimilar the recommender finds them to be (with non-negative ratings like these, the max distance is 1)

- As we would expect, each film has a 0 distance from itself!
- Frozen and Mulan are very similar (0.02)
- People who liked The Shining generally liked Batman Returns (0.19)
- The two most different films according to our recommender are Mulan and The Shining (0.88)

In [103]:
# Rank every movie by its distance to Frozen, closest (most similar) first.
recommender_df['Frozen'].sort_values().round(2)

Frozen            0.00
Mulan             0.02
Batman Returns    0.34
The Shining       0.73
Name: Frozen, dtype: float64

In [104]:
#printing tables to markdown for use in readme

In [105]:
# Render the movie-by-user ratings pivot as a Markdown table for the README.
pivot_md = pivot.to_markdown()
print(pivot_md)

| Movie          |   user_1 |   user_2 |   user_3 |
|:---------------|---------:|---------:|---------:|
| Batman Returns |        3 |        4 |        1 |
| Frozen         |        5 |        1 |        5 |
| Mulan          |        5 |        0 |        4 |
| The Shining    |        0 |        5 |        1 |


In [108]:
# Render the distance matrix, rounded to 2 decimals, as a Markdown table for the README.
rec_md = recommender_df.round(2).to_markdown()
print(rec_md)

|                |   Batman Returns |   Frozen |   Mulan |   The Shining |
|:---------------|-----------------:|---------:|--------:|--------------:|
| Batman Returns |             0    |     0.34 |    0.42 |          0.19 |
| Frozen         |             0.34 |     0    |    0.02 |          0.73 |
| Mulan          |             0.42 |     0.02 |    0    |          0.88 |
| The Shining    |             0.19 |     0.73 |    0.88 |          0    |


In [110]:
# Render the Frozen ranking (closest movie first, 2 decimals) as a Markdown table for the README.
frozen_md = recommender_df['Frozen'].sort_values().round(2).to_markdown()
print(frozen_md)

|                |   Frozen |
|:---------------|---------:|
| Frozen         |     0    |
| Mulan          |     0.02 |
| Batman Returns |     0.34 |
| The Shining    |     0.73 |


Perfect! I'll use these tables + a visualization I created from geogebra.org in my ReadMe, project, and hopefully web app as well!

In [167]:
# Save the sample ratings pivot under ./compressed for the web app to load.
# NOTE(review): protocol is pinned to 3, presumably to match the Python version
# of the deployment environment; confirm before changing.
sample_pivot_path = './compressed/sample_pivot.pkl'
pivot.to_pickle(sample_pivot_path, protocol=3)

In [169]:
# Keep only 2 decimals from here on; the cells below pickle and display this rounded frame.
recommender_df = recommender_df.round(2)

In [171]:
# Save the distance matrix under ./compressed for the web app to load.
# NOTE(review): protocol is pinned to 3, presumably to match the Python version
# of the deployment environment; confirm before changing.
sample_rec_path = './compressed/sample_rec.pkl'
recommender_df.to_pickle(sample_rec_path, protocol=3)

In [170]:
# Display the distance matrix after it has been rounded to 2 decimal places.
recommender_df

Unnamed: 0,Batman Returns,Frozen,Mulan,The Shining
Batman Returns,0.0,0.34,0.42,0.19
Frozen,0.34,0.0,0.02,0.73
Mulan,0.42,0.02,0.0,0.88
The Shining,0.19,0.73,0.88,0.0


In [173]:
# Distances from Frozen to every movie, sorted closest first and rounded to 2 decimals.
fro_df = recommender_df['Frozen'].sort_values().round(2)

In [174]:
# Save the Frozen ranking under ./compressed for the web app to load.
# NOTE(review): protocol is pinned to 3, presumably to match the Python version
# of the deployment environment; confirm before changing.
fro_path = './compressed/fro_df.pkl'
fro_df.to_pickle(fro_path, protocol=3)

Alright - that's the markdown I needed for my ReadMe and the pickled dataframes I'll use in the online web app. All set!