# Recommendation System

In [16]:
from collections import Counter

import pandas as pd
import numpy as np

import pickle as pk

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from jupyterthemes import jtplot
# Apply the jupyterthemes plot styling; figsize=(15, 9) is passed as the
# figure size (presumably the default for later plots — confirm in jtplot docs).
jtplot.style(figsize=(15, 9))

## Data

### Movies

In [2]:
# Read the cleaned movie metadata; the `id` column becomes the row index.
movies = pd.read_csv(
    '../data/popular_10000_movies/movies_dataset_CLEAN.csv',
    index_col='id',
)
# Preview the first rows.
movies.head()

Unnamed: 0_level_0,genre_ids,overview,popularity,release_date,title,vote_average,vote_count,genres,cast,crew,keywords
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
385687,"[28, 80, 53]",Over many missions and against impossible odds...,4654.279,2023-05-17,Fast X,7.3,2093,"Action, Crime, Thriller","[{'cast_id': 12835, 'name': 'Vin Diesel', 'cha...","[{'crew_id': 1302, 'name': 'Susie Figgis', 'de...","[{'id': 9663, 'name': 'sequel'}, {'id': 9748, ..."
697843,"[28, 53]",Tasked with extracting a family who is at the ...,2494.01,2023-06-09,Extraction 2,7.7,910,"Action, Thriller","[{'cast_id': 74568, 'name': 'Chris Hemsworth',...","[{'crew_id': 950, 'name': 'Pietro Scalia', 'de...","[{'id': 3070, 'name': 'mercenary'}, {'id': 966..."
603692,"[28, 53, 80]","With the price on his head ever increasing, Jo...",1920.127,2023-03-22,John Wick: Chapter 4,7.9,3344,"Action, Thriller, Crime","[{'cast_id': 6384, 'name': 'Keanu Reeves', 'ch...","[{'crew_id': 3615, 'name': 'Manfred Banach', '...","[{'id': 242, 'name': 'new york city'}, {'id': ..."
569094,"[28, 12, 16, 878]","After reuniting with Gwen Stacy, Brooklyn’s fu...",2013.795,2023-05-31,Spider-Man: Across the Spider-Verse,8.6,1796,"Action, Adventure, Animation, Science Fiction","[{'cast_id': 587506, 'name': 'Shameik Moore', ...","[{'crew_id': 7624, 'name': 'Stan Lee', 'depart...","[{'id': 2858, 'name': 'sacrifice'}, {'id': 328..."
502356,"[16, 10751, 12, 14, 35]","While working underground to fix a water main,...",1539.037,2023-04-05,The Super Mario Bros. Movie,7.8,5165,"Animation, Family, Adventure, Fantasy, Comedy","[{'cast_id': 73457, 'name': 'Chris Pratt', 'ch...","[{'crew_id': 70851, 'name': 'Jack Black', 'dep...","[{'id': 282, 'name': 'video game'}, {'id': 690..."


In [3]:
# Narrow `movies` to the scalar fields used below; the remaining columns of the
# CSV (genre_ids, overview, genres, cast, crew, keywords) are dropped from this
# frame.
movies = movies.loc[
    :,
    ['popularity', 'release_date', 'title', 'vote_average', 'vote_count'],
]
movies.head()

Unnamed: 0_level_0,popularity,release_date,title,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
385687,4654.279,2023-05-17,Fast X,7.3,2093
697843,2494.01,2023-06-09,Extraction 2,7.7,910
603692,1920.127,2023-03-22,John Wick: Chapter 4,7.9,3344
569094,2013.795,2023-05-31,Spider-Man: Across the Spider-Verse,8.6,1796
502356,1539.037,2023-04-05,The Super Mario Bros. Movie,7.8,5165


In [4]:
# Inspect column dtypes. Per the recorded output, release_date is `object`
# (i.e. not parsed as a datetime by read_csv).
movies.dtypes

popularity      float64
release_date     object
title            object
vote_average    float64
vote_count        int64
dtype: object

### Genres: cosine similarity and one-hot encoding

In [5]:
# Genre-by-genre similarity table (cosine similarity, going by the file name);
# the first CSV column supplies the row labels.
genres_cs = pd.read_csv(
    'data/genres_cosine_similarity.csv',
    index_col=0,
)
genres_cs

Unnamed: 0,TV Movie,Thriller,Action,Mystery,Crime,Documentary,Adventure,Science Fiction,Drama,Animation,Music,Fantasy,Family,Romance,Western,War,History,Horror,Comedy
TV Movie,1.0,0.394733,0.513236,0.39673,0.364897,0.273702,0.592527,0.53523,0.511371,0.599006,0.580721,0.61596,0.676505,0.495483,0.354297,0.326719,0.332524,0.410583,0.677208
Thriller,0.394733,1.0,0.717784,0.803086,0.803562,0.252943,0.478655,0.632004,0.747659,0.276127,0.26195,0.419476,0.280956,0.495854,0.468874,0.527351,0.504018,0.717057,0.440748
Action,0.513236,0.717784,1.0,0.554684,0.650536,0.291962,0.797106,0.781199,0.623967,0.589238,0.364484,0.625936,0.571553,0.473524,0.555539,0.573059,0.479258,0.612385,0.641312
Mystery,0.39673,0.803086,0.554684,1.0,0.702164,0.263122,0.456686,0.554591,0.662259,0.365163,0.314863,0.449878,0.36879,0.471931,0.387748,0.379926,0.384377,0.646352,0.491783
Crime,0.364897,0.803562,0.650536,0.702164,1.0,0.289151,0.422055,0.450467,0.70113,0.281047,0.310511,0.34936,0.297321,0.481107,0.460968,0.420063,0.462907,0.507797,0.503363
Documentary,0.273702,0.252943,0.291962,0.263122,0.289151,1.0,0.247916,0.216827,0.337446,0.169529,0.257017,0.192158,0.198267,0.221919,0.214639,0.240606,0.312933,0.220365,0.274301
Adventure,0.592527,0.478655,0.797106,0.456686,0.422055,0.247916,1.0,0.719617,0.537524,0.801408,0.530574,0.794288,0.831916,0.530338,0.53405,0.451217,0.386237,0.464983,0.784823
Science Fiction,0.53523,0.632004,0.781199,0.554591,0.450467,0.216827,0.719617,1.0,0.512783,0.598825,0.354121,0.613756,0.575048,0.428716,0.385528,0.382743,0.295121,0.641385,0.601664
Drama,0.511371,0.747659,0.623967,0.662259,0.70113,0.337446,0.537524,0.512783,1.0,0.351065,0.497333,0.496137,0.432878,0.725442,0.603654,0.706889,0.748685,0.516213,0.542533
Animation,0.599006,0.276127,0.589238,0.365163,0.281047,0.169529,0.801408,0.598825,0.351065,1.0,0.590689,0.775147,0.926371,0.421453,0.339857,0.239284,0.19883,0.339585,0.789651


In [6]:
# One-hot genre indicators per movie, indexed by movie `id`
# (one 0/1 column per genre, as shown in the preview).
genres_oh = pd.read_csv(
    'data/one-hot_genres.csv',
    index_col='id',
)
genres_oh.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
697843,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
603692,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
569094,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
502356,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [7]:
# Look up the one-hot genre rows for every id in `movies` (raises KeyError if an
# id is missing from genres_oh).
# NOTE(review): this runs before the de-duplication cell further down, so ids
# that are repeated in genres_oh would yield extra rows here — confirm intent.
genres_oh.loc[movies.index]

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
697843,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
603692,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
569094,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
502356,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15017,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
79509,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
13370,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
480623,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0


In [8]:
# Drop repeated ids from the one-hot table, retaining the first row seen for
# each id.
genres_oh = genres_oh.loc[
    ~genres_oh.index.duplicated(keep='first')
]

In [9]:
# Preview the one-hot genre table after removing duplicated ids.
genres_oh.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
385687,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
697843,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
603692,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
569094,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
502356,0,1,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [10]:
# Compare row/column counts of the two tables. Per the recorded output,
# genres_oh (9939 rows) covers more ids than movies (9144 rows).
genres_oh.shape, movies.shape

((9939, 19), (9144, 5))

### Keywords

In [12]:
# Per-movie keyword lists, indexed by movie `id`.
# NOTE(review): coming from CSV, each `keywords_list` value is presumably a
# string that merely looks like a Python list — confirm before treating it as one.
keywords = pd.read_csv(
    'data/keywords_list.csv',
    index_col='id',
)
keywords.head()

Unnamed: 0_level_0,keywords_list
id,Unnamed: 1_level_1
385687,"['sequel', 'revenge', 'racing', 'family', 'cars']"
697843,"['mercenary', 'sequel', 'rescue mission', 'lon..."
603692,"['new york city', 'martial arts', 'hitman', 's..."
569094,"['sacrifice', 'villain', 'comic book', 'sequel..."
502356,"['video game', 'gorilla', 'plumber', 'magic mu..."


In [13]:
# Row/column count of the keywords table (one `keywords_list` column).
keywords.shape

(9144, 1)

In [43]:
# Load the pre-computed keyword counter object from disk.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only use
# this with a pickle produced by this project, never with an untrusted file.
with open('data/keywords_counter.pickle', 'rb') as f:
    keywords_counter = pk.load(f)

In [45]:
# Number of top-level entries in the unpickled object
# (presumably one per distinct keyword — TODO confirm).
len(keywords_counter)

14239

### Cast

In [14]:
# Per-movie cast-name lists, indexed by movie `id`.
cast = pd.read_csv(
    'data/cast_list.csv',
    index_col='id',
)
cast.head()

Unnamed: 0_level_0,cast_list
id,Unnamed: 1_level_1
385687,"['Vin Diesel', 'Michelle Rodriguez', 'Tyrese G..."
697843,"['Chris Hemsworth', 'Golshifteh Farahani', 'Ad..."
603692,"['Keanu Reeves', 'Donnie Yen', 'Bill Skarsgård..."
569094,"['Shameik Moore', 'Hailee Steinfeld', 'Brian T..."
502356,"['Chris Pratt', 'Anya Taylor-Joy', 'Charlie Da..."


In [15]:
# Row/column count of the cast table.
# NOTE(review): the recorded output shows 9133 rows here versus 9144 rows in
# `movies`/`keywords`, so some movie ids appear to have no cast entry —
# confirm that later lookups by id tolerate the missing rows.
cast.shape

(9133, 1)

## Hey, beauty, what do you want to watch tonight?

In [10]:
# Re-read the complete movie table (all columns). `movies` was narrowed to a
# five-column subset earlier, so the full record is loaded again from the same
# CSV under a separate name.
movies_full = pd.read_csv(
    '../data/popular_10000_movies/movies_dataset_CLEAN.csv',
    index_col='id',
)

In [11]:
# Seed movies for the recommendation: select rows whose title exactly matches
# one of the three picks (any other movie sharing one of these titles would be
# selected as well).
something_like = movies_full.loc[
    movies_full['title'].isin(
        ['Starship Troopers', 'Aliens', 'Pitch Black']
    )
]
something_like

Unnamed: 0_level_0,genre_ids,overview,popularity,release_date,title,vote_average,vote_count,genres,cast,crew,keywords
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
679,"[28, 53, 878]",When Ripley's lifepod is found by a salvage cr...,59.005,1986-07-18,Aliens,7.9,8542,"Action, Thriller, Science Fiction","[{'cast_id': 10205, 'name': 'Sigourney Weaver'...","[{'crew_id': 8380, 'name': 'Michael Lamont', '...","[{'id': 803, 'name': 'android'}, {'id': 1603, ..."
563,"[12, 28, 53, 878]","Set in the future, the story follows a young s...",28.171,1997-11-07,Starship Troopers,7.0,4268,"Adventure, Action, Thriller, Science Fiction","[{'cast_id': 27763, 'name': 'Casper Van Dien',...","[{'crew_id': 3686, 'name': 'Johanna Ray', 'dep...","[{'id': 305, 'name': 'moon'}, {'id': 818, 'nam..."
2787,"[53, 878, 28]",When their ship crash-lands on a remote planet...,14.899,2000-02-18,Pitch Black,6.8,3974,"Thriller, Science Fiction, Action","[{'cast_id': 12835, 'name': 'Vin Diesel', 'cha...","[{'crew_id': 2214, 'name': 'Paul Haslinger', '...","[{'id': 3762, 'name': 'darkness'}, {'id': 4565..."


In [39]:
# Show the keywords stored for "Aliens" (id 679 in the seed selection).
# Read the cell by row label and column name rather than converting the row to
# a list and taking position 0, so the lookup does not depend on column order.
print('Aliens:')
print(keywords.loc[679, 'keywords_list'])

Aliens:
['android', 'extraterrestrial technology', 'space marine', 'spaceman', 'cryogenics', 'vacuum', 'space colony', 'warrior woman', 'settler', 'space travel', 'colony', 'space', 'alien', 'creature', 'desolate', 'xenomorph', 'desolate planet']


In [40]:
# Show the keywords stored for "Starship Troopers" (id 563 in the seed selection).
# Read the cell by row label and column name rather than converting the row to
# a list and taking position 0, so the lookup does not depend on column order.
print('Starship Troopers:')
print(keywords.loc[563, 'keywords_list'])

Starship Troopers:
['moon', 'based on novel or book', 'asteroid', 'spacecraft', 'space marine', 'intelligence', 'buenos aires, argentina', 'space battle', 'dystopia', 'army', 'giant insect', 'satire', 'creature', 'soldier', 'drill instructor', 'military', 'bugs']


In [41]:
# Show the keywords stored for "Pitch Black" (id 2787 in the seed selection).
# Read the cell by row label and column name rather than converting the row to
# a list and taking position 0, so the lookup does not depend on column order.
print('Pitch Black:')
print(keywords.loc[2787, 'keywords_list'])

Pitch Black:
['darkness', 'dystopia', 'comet', 'alien life-form', 'survival', 'creature', 'eclipse', 'flask', 'spaceship crash']


In [77]:
# Largest key count among the counter entries for 'android', 'moon' and
# 'darkness' — the first keyword listed for each of the three seed movies.
max(
    len(keywords_counter[kw].keys())
    for kw in ('android', 'moon', 'darkness')
)

304

In [15]:
# Collect the comma-separated genre string of each seed movie into a plain
# Python list, in the row order of `something_like`.
genres_choice = something_like.loc[:, 'genres'].tolist()
genres_choice

['Action, Thriller, Science Fiction',
 'Adventure, Action, Thriller, Science Fiction',
 'Thriller, Science Fiction, Action']