### Here our Results

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import sys
import os
import requests
import swifter

from tqdm import tqdm
from pygments.lexers.webassembly import keywords

from src.data.TMDB_Movies import get_data
from src.data.TMDB_Movies import get_collection

current_dir = os.path.abspath("")
sys.path.append(current_dir)
data_dir = os.path.join(current_dir, "data")


In [2]:
#we want to organize the data in tvclusters into a dataframe and sort by actor


# Read the text file into a DataFrame
tv_tropes_path = os.path.join(data_dir, "MovieSummaries/tvtropes.clusters.txt")
tv_clusters = pd.read_csv(tv_tropes_path, sep='\t', header=None, names=['trope', 'details'])

# Parse the JSON string in the 'details' column and expand it into separate columns
tv_clusters = tv_clusters.join(tv_clusters['details'].apply(json.loads).apply(pd.Series))

# Drop the 'details' column as it's no longer needed
tv_clusters.drop(columns=['details'], inplace=True)

# Display the DataFrame


tv_clusters_char = tv_clusters.sort_values(by='char')

tv_clusters_char = tv_clusters_char.reset_index(drop=True)

len(tv_clusters_char)


501

In [None]:
# try to get the movies with sequels 

char_count = tv_clusters_char.groupby('char').size()

multiple_char = char_count[char_count > 1]

multiple_char


In [70]:
tv_clusters_char = tv_clusters_char[tv_clusters_char['char'].isin(multiple_char.index)]

tv_clusters_char = tv_clusters_char.reset_index(drop=True)
tv_clusters_char.head(25)


Unnamed: 0,trope,char,movie,id,actor
0,arrogant_kungfu_guy,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
1,child_prodigy,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
2,byronic_hero,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
3,gadgeteer_genius,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
4,warrior_poet,Arthur Burns,The Proposition,/m/0k66rt,Danny Huston
5,cultured_badass,Arthur Burns,The Proposition,/m/0k66rt,Danny Huston
6,jerk_jock,Biff Tannen,Back to the Future,/m/0jty60,Thomas F. Wilson
7,bully,Biff Tannen,Back to the Future,/m/0jty60,Thomas F. Wilson
8,crazy_jealous_guy,Bill,Kill Bill Volume 1,/m/0j_fvv,David Carradine
9,heartbroken_badass,Bill,Kill Bill Volume 1,/m/0j_fvv,David Carradine


In [82]:
#get only movies with sequels 
sequel_char = tv_clusters_char.groupby('char')['movie'].nunique()

sequel_char = sequel_char[sequel_char > 1]  

tv_clusters_sequel = tv_clusters_char[tv_clusters_char['char'].isin(sequel_char.index)]


tv_clusters_sequel = tv_clusters_sequel.reset_index(drop=True)


In [88]:
tv_clusters_sequel.head(35)

Unnamed: 0,trope,char,movie,id,actor
0,arrogant_kungfu_guy,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
1,child_prodigy,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
2,byronic_hero,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
3,gadgeteer_genius,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
4,dumb_blonde,Cindy Campbell,Scary Movie 3,/m/0k2z05,Anna Faris
5,dumb_blonde,Cindy Campbell,Scary Movie 4,/m/0k2y_7,Anna Faris
6,corrupt_corporate_executive,Cutler Beckett,Pirates of the Caribbean: Dead Man's Chest,/m/0k1xyw,Tom Hollander
7,corrupt_corporate_executive,Cutler Beckett,Pirates of the Caribbean: At World's End,/m/0k1xwx,Tom Hollander
8,charmer,Han Solo,Star Wars Episode IV: A New Hope,/m/0k3r1_,Harrison Ford
9,loveable_rogue,Han Solo,Star Wars Episode IV: A New Hope,/m/0k3r1_,Harrison Ford


### 2.1 Data Collection

Here we download the data from TMDB.


In [5]:
keywords_name = ["sequel", "book", "comic", "remake"]
keywords = [9663, 818, 9717, 9714] #keywords for the movies corresponding to the sequels, book or novel adaptations, and based on comics, and remakes

for i, keyword in enumerate(keywords) :
    get_data(keywords_name[i], "2013-01-01", keyword)

100%|██████████| 43/43 [00:07<00:00,  5.70it/s]


JSONDecodeError: Extra data: line 1 column 86 (char 85)

### 2.2 Use TMDB collection to get group the sequels together.

In [None]:
sequels_movies = pd.read_csv('data/movie_with_keyword_sequel.csv')
get_collection(sequels_movies)


### 2.3 Find the movies in the Dataset using TMDB collection

In [85]:
url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&titles=Harry%20Potter%20and%20the%20Deathly%20Hallows:%20Part%201"
requests.get(url).json()

{'batchcomplete': '',
 'query': {'pages': {'17445028': {'pageid': 17445028,
    'ns': 0,
    'title': 'Harry Potter and the Deathly Hallows: Part 1'}}}}

In [26]:
def get_wikipedia_id_from_title(title):
    title += " film"
    title.replace(" ", "%20")
    url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&titles={title}"
    
    
    language_code = 'en'
    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
    endpoint = '/search/page'
    url = base_url + language_code + endpoint
    parameters = {'q': title, 'limit': 1}
    page = requests.get(url, params=parameters).json().get('pages')
    id = page[0].get('id')
    return id

In [94]:
get_wikipedia_id_from_title("Superman IV: The Quest for Peace film")

407628

In [95]:
from more_itertools import sliced

sequel_collections = pd.read_csv('data/collections/sequels.csv')

tqdm.pandas()
sequel_collections = sequel_collections.drop("Wikipedia movie ID", axis=1)

slices = sliced(seq=range(len(sequel_collections)), n=50)
sequel_collections_with_wiki_id = pd.DataFrame()

i=0
for index in tqdm(slices, total=40):
    chunk = sequel_collections.iloc[index].copy()
    chunk["Wikipedia movie ID"] = chunk["title"].apply(get_wikipedia_id_from_title) # Apply function/transformation here
    sequel_collections_with_wiki_id = pd.concat([sequel_collections_with_wiki_id, chunk], axis=0, ignore_index=True, sort=False)
    chunk.to_csv(f'df{i}.csv')
    i+=1

100%|██████████| 40/40 [21:14<00:00, 31.87s/it]


In [64]:
sequel_collections_with_wiki_id["Wikipedia movie ID"] = sequel_collections_with_wiki_id["Wikipedia movie ID"][:-2]

In [76]:
sequel_collections_with_wiki_id = pd.read_csv('data/collections/sequels_with_wiki_id.csv')

In [77]:
sequel_collections_with_wiki_id["Wikipedia movie ID"] = sequel_collections_with_wiki_id["Wikipedia movie ID"].apply(lambda x: int(str(x)[:-2]))

ValueError: invalid literal for int() with base 10: 'n'

In [73]:
movie_df = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)

movie_df.rename(columns={0: 'Wikipedia movie ID', 1: "Freebase movie ID", 2: "Movie name",  3: "Movie release date", 4: "Movie box office revenue", 5: "Movie runtime", 6: "Movie languages", 7: "Movie countries", 8: "Movie genres"}, inplace=True)

movie_df_sequel = movie_df.join(sequel_collections_with_wiki_id.set_index('Wikipedia movie ID'), on="Wikipedia movie ID", how='inner')

ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat