# Imports and Data Loading

In [1]:
import pandas as pd
import numpy as np

# Load the six MovieLens 25M tables in one pass; the order of the paths below
# matches the order of the names on the left-hand side.
tags, movies, user_tags, tag_scores, ratings, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-25m/genome-tags.csv',
        'ml-25m/movies.csv',
        'ml-25m/tags.csv',
        'ml-25m/genome-scores.csv',
        'ml-25m/ratings.csv',
        'ml-25m/links.csv',
    )
)

In [2]:
# Joined views used downstream for convenient filtering.

# One row per (movie, rating) pair, carrying the movie's title and genres.
movies_and_ratings = movies.merge(ratings, on='movieId')

# One row per (movie, tag) pair, carrying the tag name and its relevance score.
tags_and_tag_scores = tags.merge(tag_scores, on='tagId')

# Wide movie x tag relevance matrix; (movie, tag) cells with no score become 0.0.
tats_ = pd.pivot_table(
    tags_and_tag_scores,
    index='movieId',
    columns='tag',
    values='relevance',
    fill_value=0.0,
)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,3.5,1141415820
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4,3.0,1573944252
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517
...,...,...,...,...,...,...
25000090,209157,We (2018),Drama,119571,1.5,1574280748
25000091,209159,Window of the Soul (2001),Documentary,115835,3.0,1574280985
25000092,209163,Bad Poems (2018),Comedy|Drama,6964,4.5,1574284913
25000093,209169,A Girl Thing (2001),(no genres listed),119571,3.0,1574291826


In [5]:
# Feature matrix for all movies: the tag-relevance columns plus each movie's title.
# The ratings join repeats every title once per rating, so collapse it to one
# (title, movieId) row per movie before attaching it to the tag matrix.
title_lookup = movies_and_ratings[['title', 'movieId']].drop_duplicates()
tats = tats_.merge(title_lookup, on='movieId')
tats

Unnamed: 0,movieId,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,...,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies,title
0,1,0.02875,0.02375,0.06250,0.07575,0.14075,0.14675,0.06350,0.20375,0.20200,...,0.01425,0.03050,0.03500,0.14125,0.05775,0.03900,0.02975,0.08475,0.02200,Toy Story (1995)
1,2,0.04125,0.04050,0.06275,0.08275,0.09100,0.06125,0.06925,0.09600,0.07650,...,0.01575,0.01250,0.02000,0.12225,0.03275,0.02100,0.01100,0.10525,0.01975,Jumanji (1995)
2,3,0.04675,0.05550,0.02925,0.08700,0.04750,0.04775,0.04600,0.14275,0.02850,...,0.01950,0.02225,0.02300,0.12200,0.03475,0.01700,0.01800,0.09100,0.01775,Grumpier Old Men (1995)
3,4,0.03425,0.03800,0.04050,0.03100,0.06500,0.03575,0.02900,0.08650,0.03200,...,0.02800,0.01675,0.03875,0.18200,0.07050,0.01625,0.01425,0.08850,0.01500,Waiting to Exhale (1995)
4,5,0.04300,0.05325,0.03800,0.04100,0.05400,0.06725,0.02775,0.07650,0.02150,...,0.02050,0.01425,0.02550,0.19225,0.02675,0.01625,0.01300,0.08700,0.01600,Father of the Bride Part II (1995)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13811,205072,0.02050,0.01775,0.11400,0.03650,0.31225,0.03675,0.10700,0.37925,0.01725,...,0.03650,0.03025,0.12900,0.13975,0.42425,0.03400,0.02350,0.41725,0.09100,Zombieland: Double Tap (2019)
13812,205076,0.03825,0.03150,0.03200,0.05325,0.20850,0.07050,0.06625,0.27825,0.00950,...,0.03225,0.04675,0.03175,0.23025,0.06300,0.04175,0.04125,0.07275,0.02350,Downton Abbey (2019)
13813,205383,0.04100,0.04025,0.02750,0.07850,0.19750,0.17825,0.17125,0.30475,0.16825,...,0.03250,0.02400,0.03575,0.20400,0.08525,0.04600,0.02900,0.11725,0.03925,El Camino: A Breaking Bad Movie (2019)
13814,205425,0.04525,0.04125,0.04250,0.07425,0.11550,0.10500,0.08275,0.13575,0.16125,...,0.04550,0.01425,0.03925,0.21700,0.06000,0.07250,0.01500,0.11050,0.02850,Dave Chappelle: Sticks & Stones (2019)


## Content-based Filtering

In [68]:
# A plain dot product was dominated by "feature-packed" movies (titles tagged as
# comedy/drama/horror/thriller/... all at once), so cosine similarity is used:
# it compares the direction of the tag vectors and ignores their magnitude.

# Cosine Similarity

# Every column except the id and the title is a tag-relevance feature.
features = [col for col in tats.columns if col not in ('movieId', 'title')]
ids = list(tats['movieId'])
titles = list(tats['title'])

# Compute cosine similarity with numpy only, so the cell does not depend on a
# `cosine_similarity` import that is not present in the notebook's import cell:
# L2-normalise every row, then a single matrix product gives all pairwise values.
feature_matrix = tats[features].to_numpy(dtype=float)
row_norms = np.linalg.norm(feature_matrix, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0  # all-zero rows stay zero instead of dividing by 0
unit_rows = feature_matrix / row_norms
sims_matrix = unit_rows @ unit_rows.T

# Rank every row once, most similar first. Negating turns argsort's ascending
# order into a descending one, and the stable kind keeps tied movies in their
# original order (same tie-breaking as a stable reverse sort).
# The diagonal key is forced to -inf so each film is always its own first entry,
# which is what the 'input_film' column below relies on, even when float rounding
# or an identical tag vector would otherwise let another film sort ahead of it.
sort_keys = -sims_matrix
np.fill_diagonal(sort_keys, -np.inf)
order = np.argsort(sort_keys, axis=1, kind='stable')
del sort_keys  # same size as sims_matrix; free it before building the outputs

# Row i holds, in ranked order, the similarities / movie ids / titles for film i.
rows_sims = np.take_along_axis(sims_matrix, order, axis=1)
rows_ids = np.asarray(ids)[order]
rows_recs = np.asarray(titles, dtype=object)[order]

# create dfs of sorted item-item matrix for easy query and for export
# First column is the film itself; the rest are its neighbours ranked 1..N-1,
# with N taken from the data rather than hard-coded.
columns = ['input_film'] + [str(rank) for rank in range(1, len(ids))]

ids_df = pd.DataFrame(rows_ids, columns=columns)
sims_df = pd.DataFrame(rows_sims, columns=columns)
recs_df = pd.DataFrame(rows_recs, columns=columns)
display(ids_df)
display(sims_df)
display(recs_df)

In [72]:
# Write the three ranked item-item tables to CSV in the working directory.
# note: files are 1-5 GB
csv_exports = {
    'ids.csv': ids_df,
    'sims.csv': sims_df,
    'recs.csv': recs_df,
}
for csv_name, ranked_frame in csv_exports.items():
    ranked_frame.to_csv(csv_name)

In [84]:
# Show the ranked recommendations for every film whose title contains 'Bug'.
title_has_bug = recs_df['input_film'].str.contains('Bug')
recs_df[title_has_bug]

Unnamed: 0,input_film,1,2,3,4,5,6,7,8,9,...,13806,13807,13808,13809,13810,13811,13812,13813,13814,13815
913,"Love Bug, The (1969)",Herbie Goes to Monte Carlo (1977),Herbie Goes Bananas (1980),Herbie Rides Again (1974),"Computer Wore Tennis Shoes, The (1969)","Shaggy D.A., The (1976)","Million Dollar Duck, The (a.k.a. $1,000,000 Du...","Absent-Minded Professor, The (1961)",Herbie: Fully Loaded (2005),"Apple Dumpling Gang, The (1975)",...,My Mother (Ma mère) (2004),Mother and Son (Mat i syn) (1997),"Sandglass, The (Sanatorium pod klepsydra) (1973)",Eureka (Yurîka) (2000),"Secret of the Grain, The (La graine et le mule...","Children Are Watching Us, The (Bambini ci guar...",Satan's Tango (Sátántangó) (1994),The Cruel Sea (1953),Blackboards (Takhté Siah) (2000),Q (2011)
2094,"Bug's Life, A (1998)",Toy Story (1995),Ice Age (2002),"Monsters, Inc. (2001)",Finding Nemo (2003),Ratatouille (2007),Toy Story 2 (1999),Antz (1998),Cars (2006),Despicable Me (2010),...,"Secret of the Grain, The (La graine et le mule...","Horror Hotel (a.k.a. City of the Dead, The) (1...","Rocking Horse Winner, The (1950)","Vampyros Lesbos (Vampiras, Las) (1971)",I Spit on Your Grave (Day of the Woman) (1978),Blackboards (Takhté Siah) (2000),"House of Exorcism, The (a.k.a. Lisa and the De...","Last Mistress, The (vieille maîtresse, Une) (2...",My Mother (Ma mère) (2004),Q (2011)
5156,"Looney, Looney, Looney Bugs Bunny Movie, The (...",Winnie the Pooh and the Blustery Day (1968),For the Birds (2000),Rabbit Seasoning (1952),Winnie the Pooh and Tigger Too (1974),Robin Hood (1973),Partly Cloudy (2009),Paddington 2 (2017),Winnie the Pooh and the Honey Tree (1966),"Tigger Movie, The (2000)",...,Uncle Boonmee Who Can Recall His Past Lives (L...,Paris Was a Woman (1995),Satan's Tango (Sátántangó) (1994),Take Care of My Cat (Goyangileul butaghae) (2001),"Vampyros Lesbos (Vampiras, Las) (1971)",Blackboards (Takhté Siah) (2000),Valerie and Her Week of Wonders (Valerie a týd...,"Last Mistress, The (vieille maîtresse, Une) (2...",My Mother (Ma mère) (2004),Q (2011)
5967,Bugsy (1991),Hoffa (1992),Donnie Brasco (1997),Casino (1995),Kill the Irishman (2011),Little Caesar (1931),Black Mass (2015),"Untouchables, The (1987)",Owning Mahowny (2003),American Gangster (2007),...,The Madagascar Penguins in a Christmas Caper (...,Casper Meets Wendy (1998),"Haunted House 2, A (2014)",Resident Evil: Retribution (2012),Blackboards (Takhté Siah) (2000),Resident Evil: Afterlife (2010),Twitches (2005),At the Earth's Core (1976),My Mother (Ma mère) (2004),Q (2011)
5968,Bugsy Malone (1976),"Cotton Club, The (1984)","Lady in Red, The (1979)",On the Town (1949),"Little Rascals, The (1994)","New Adventures of Pippi Longstocking, The (1988)",Guys and Dolls (1955),"Pajama Game, The (1957)",Kiss Me Kate (1953),Big Shots (1987),...,"Dyatlov Pass Incident, The (Devil's Pass) (2013)","Last Mistress, The (vieille maîtresse, Une) (2...",Resident Evil: Extinction (2007),Priest (2011),Alien Outpost (2014),Resident Evil: Retribution (2012),Resident Evil: Afterlife (2010),The Cruel Sea (1953),My Mother (Ma mère) (2004),Q (2011)
8942,Bug (2007),Dead Ringers (1988),Audition (Ôdishon) (1999),May (2002),Goodnight Mommy (Ich seh ich seh) (2014),The House That Jack Built (2018),The Killing of a Sacred Deer (2017),In My Skin (Dans ma Peau) (2002),Shivers (They Came from Within) (1975),Apartment Zero (1988),...,Facing the Giants (2006),Fantastic Four: Rise of the Silver Surfer (2007),Chronicles of Narnia: The Voyage of the Dawn T...,Scooby-Doo and the Cyber Chase (2001),"Princess Diaries, The (2001)",Race (2016),Chasing Liberty (2004),Twitches (2005),Miracles from Heaven (2016),The Madagascar Penguins in a Christmas Caper (...
10324,"Bugs Bunny / Road Runner Movie, The (a.k.a. Th...","Looney, Looney, Looney Bugs Bunny Movie, The (...",Rabbit Seasoning (1952),Paddington 2 (2017),Winnie the Pooh and the Honey Tree (1966),Presto (2008),Who Framed Roger Rabbit? (1988),Duck Amuck (1953),6 Days to Air: The Making of South Park (2011),"Simpsons Movie, The (2007)",...,Take Care of My Cat (Goyangileul butaghae) (2001),Blackboards (Takhté Siah) (2000),Priest (2011),Captain America (1990),My Mother (Ma mère) (2004),Alien Outpost (2014),Survivor (2015),Resident Evil: Afterlife (2010),Resident Evil: Retribution (2012),Q (2011)
