In [1]:
import pandas as pd
import numpy as np
import re
from textblob import Word

import pandasql as ps
from pandasql import sqldf

import matplotlib.pyplot as plt
import seaborn as sns

# Ignore matplotlib warnings
# import warnings
# warnings.filterwarnings("ignore")

In [2]:
tv = pd.read_csv("../TVTropesData/tv_tropes.csv")
tv["media"] = "tv"

In [3]:
film = pd.read_csv("../TVTropesData/film_tropes.csv")
film["media"] = "film"

In [4]:
lit = pd.read_csv("../TVTropesData/lit_tropes.csv")
lit["media"] = "lit"

In [5]:
df = pd.concat([tv, film, lit])

In [6]:
# Replace Longrunner with LongRunner
df.loc[df.Trope=='Longrunner', 'Trope'] = 'LongRunner'

In [7]:
# Split at capitalize words and add underscore
def convert(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

In [8]:
# Apply to df
df.Trope = df.Trope.apply(lambda w: convert(w))

In [9]:
# Convert plural to singular to get rid of duplicates
df['Trope'] = df.Trope.apply(lambda w: Word(w).singularize())

# Trope by title

In [10]:
trope_by_title = df.groupby(['Title','media']).agg({"Trope": lambda x: " ".join(x)}).reset_index()

In [11]:
trope_by_title.head()

Unnamed: 0,Title,media,Trope
0,ABBATheMovie,film,ms_fanservice insistent_terminology the_ingenu...
1,ABCsOfDeath2,film,subverted_kids_show ass_shove man_bites_man st...
2,ABadCaseOfStripes,lit,stock_yuck involuntary_shapeshifting involunta...
3,ABadDayForVoodoo,lit,sadist_teacher bread_eggs_breaded_egg sadistic...
4,ABatalhaDoApocalipse,lit,dead_all_along our_angels_are_different weak_s...


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

### Trope matrix

In [13]:
vectorizer = CountVectorizer()
trope_matrix = vectorizer.fit_transform(trope_by_title.Trope)

In [14]:
print("Trope Matrix:", trope_matrix.toarray())

Trope Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Cosine similarity

In [15]:
# Source: https://towardsdatascience.com/using-cosine-similarity-to-build-a-movie-recommendation-system-ae7f20842599

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(trope_matrix)

In [16]:
cosine_sim.shape

(40435, 40435)

In [17]:
# https://machinelearninggeek.com/movie-recommender-system-using-text-similarity/

# Create a pandas series with titles as indices and indices as series values 
indices = pd.Series(trope_by_title.index, index=trope_by_title['Title']).drop_duplicates()

In [18]:
title='HowIMetYourMother'

# Get the index corresponding to movie title
index = indices[title]

# Get the cosine similarity scores 
similarity_scores = list(enumerate(cosine_sim[index]))


# Sort the similarity scores in descending order
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)


# Top-10 most similar movie scores
top_10_movies_scores = sorted_similarity_scores[1:11]

# Get movie indices
top_10_movie_indices=[]
for i in top_10_movies_scores:
    top_10_movie_indices.append(i[0])
    
# Top 10 recommende movie
trope_by_title['Title'].iloc[top_10_movie_indices]

10559                  Friends
28360         TheBigBangTheory
24267                   Scrubs
10443                  Frasier
5952                 Community
27684              That70sShow
5269                    Cheers
4426     BuffyTheVampireSlayer
24411                 Seinfeld
36770               ThirtyRock
Name: Title, dtype: object

In [22]:
title='TheDarkKnightRises'

# Get the index corresponding to movie title
index = indices[title]

# Get the cosine similarity scores 
similarity_scores = list(enumerate(cosine_sim[index]))


# Sort the similarity scores in descending order
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)


# Top-10 most similar movie scores
top_10_movies_scores = sorted_similarity_scores[1:11]

# Get movie indices
top_10_movie_indices=[]
for i in top_10_movies_scores:
    top_10_movie_indices.append(i[0])
    
# Top 10 recommende movie
trope_by_title['Title'].iloc[top_10_movie_indices]

29546                     TheDarkKnight
29549              TheDarkKnightTrilogy
2759                       BatmanBegins
4775     CaptainAmericaTheWinterSoldier
2375                AvengersAgeOfUltron
26230              StarTrekIntoDarkness
2770       BatmanVSupermanDawnOfJustice
4772             CaptainAmericaCivilWar
2768                      BatmanReturns
2756                         Batman1989
Name: Title, dtype: object

In [20]:
title='FateStayNight'

# Get the index corresponding to movie title
index = indices[title]

# Get the cosine similarity scores 
similarity_scores = list(enumerate(cosine_sim[index]))


# Sort the similarity scores in descending order
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)


# Top-10 most similar movie scores
top_10_movies_scores = sorted_similarity_scores[1:11]

# Get movie indices
top_10_movie_indices=[]
for i in top_10_movies_scores:
    top_10_movie_indices.append(i[0])
    
# Top 10 recommende movie
trope_by_title['Title'].iloc[top_10_movie_indices]

9635                          FateZero
29922                  TheDresdenFiles
31193             TheHammerAndTheCross
37773                        Tsukihime
51                ACertainMagicalIndex
12731                    HighSchoolDXD
7778                         DiscWorld
29273    TheConfessionsOfPeterCrossman
15585                 KnightLifeSeries
30572                   TheFourthRealm
Name: Title, dtype: object