### Here our Results

In [255]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import sys
import os
import requests
import swifter

from tqdm import tqdm
from pygments.lexers.webassembly import keywords

from src.data.TMDB_Movies import get_data
from src.data.TMDB_Movies import get_collection

current_dir = os.path.abspath("")
sys.path.append(current_dir)
data_dir = os.path.join(current_dir, "data")

In [256]:
#we want to organize the data in tvclusters into a dataframe and sort by actor


# Read the text file into a DataFrame
tv_tropes_path = os.path.join(data_dir, "MovieSummaries/tvtropes.clusters.txt")
tv_clusters = pd.read_csv(tv_tropes_path, sep='\t', header=None, names=['trope', 'details'])

# Parse the JSON string in the 'details' column and expand it into separate columns
tv_clusters = tv_clusters.join(tv_clusters['details'].apply(json.loads).apply(pd.Series))

# Drop the 'details' column as it's no longer needed
tv_clusters.drop(columns=['details'], inplace=True)

# Display the DataFrame

tv_clusters_char = tv_clusters.sort_values(by='char')

tv_clusters_char = tv_clusters_char.reset_index(drop=True)

len(tv_clusters_char)


501

In [257]:
# try to get the movies with sequels 

char_count = tv_clusters_char.groupby('char').size()

multiple_char = char_count[char_count > 1]

print(multiple_char.head(5))
print("number of reoccuring character : " + str(len(multiple_char)))


char
Anakin Skywalker         4
Arthur Burns             2
Biff Tannen              2
Bill                     3
Bill S. Preston, Esq.    2
dtype: int64
number of reoccuring character : 51


In [258]:
tv_clusters_char = tv_clusters_char[tv_clusters_char['char'].isin(multiple_char.index)]

tv_clusters_char = tv_clusters_char.reset_index(drop=True)
tv_clusters_char.head(25)


Unnamed: 0,trope,char,movie,id,actor
0,arrogant_kungfu_guy,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
1,child_prodigy,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
2,byronic_hero,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
3,gadgeteer_genius,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
4,warrior_poet,Arthur Burns,The Proposition,/m/0k66rt,Danny Huston
5,cultured_badass,Arthur Burns,The Proposition,/m/0k66rt,Danny Huston
6,jerk_jock,Biff Tannen,Back to the Future,/m/0jty60,Thomas F. Wilson
7,bully,Biff Tannen,Back to the Future,/m/0jty60,Thomas F. Wilson
8,crazy_jealous_guy,Bill,Kill Bill Volume 1,/m/0j_fvv,David Carradine
9,heartbroken_badass,Bill,Kill Bill Volume 1,/m/0j_fvv,David Carradine


In [259]:
#get only movies with sequels 
sequel_char = tv_clusters_char.groupby('char')['movie'].nunique()

sequel_char = sequel_char[sequel_char > 1]  

tv_clusters_sequel = tv_clusters_char[tv_clusters_char['char'].isin(sequel_char.index)]


tv_clusters_sequel = tv_clusters_sequel.reset_index(drop=True)


In [260]:
tv_clusters_sequel.head(35)

Unnamed: 0,trope,char,movie,id,actor
0,arrogant_kungfu_guy,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
1,child_prodigy,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
2,byronic_hero,Anakin Skywalker,Star Wars Episode III: Revenge of the Sith,/m/02sg5ry,Hayden Christensen
3,gadgeteer_genius,Anakin Skywalker,Star Wars Episode I: The Phantom Menace,/m/02sg5rn,Jake Lloyd
4,dumb_blonde,Cindy Campbell,Scary Movie 3,/m/0k2z05,Anna Faris
5,dumb_blonde,Cindy Campbell,Scary Movie 4,/m/0k2y_7,Anna Faris
6,corrupt_corporate_executive,Cutler Beckett,Pirates of the Caribbean: Dead Man's Chest,/m/0k1xyw,Tom Hollander
7,corrupt_corporate_executive,Cutler Beckett,Pirates of the Caribbean: At World's End,/m/0k1xwx,Tom Hollander
8,charmer,Han Solo,Star Wars Episode IV: A New Hope,/m/0k3r1_,Harrison Ford
9,loveable_rogue,Han Solo,Star Wars Episode IV: A New Hope,/m/0k3r1_,Harrison Ford


In [261]:
sequels_movies = pd.read_csv('/Users/christmannjules/Downloads/sequel_film.csv')

movie_columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres']
movie_df = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t',names = movie_columns, header=None)



# Drop the 'details' column as it's no longer needed

sequel_df = movie_df[movie_df['Movie name'].isin(sequels_movies['Title'])]

sequel_df.sort_values(by=['Movie name'], inplace=True)

sequel_df.reset_index(drop=True, inplace=True)

sequel_df['Movie languages'] = sequel_df['Movie languages'].str.extract(r':\s*"([^"]+)"')
sequel_df['Movie countries'] = sequel_df['Movie countries'].str.extract(r':\s*"([^"]+)"')
sequel_df['genre_list'] = sequel_df['Movie genres'].str.findall(r':\s*"([^"]+)"')

# Expand the genre_list column into separate columns for each genre
genre_columns = sequel_df['genre_list'].apply(pd.Series)
genre_columns.columns = [f'Genre_{i+1}' for i in genre_columns.columns]

# Concatenate original DataFrame with new genre columns
sequel_df = pd.concat([sequel_df, genre_columns], axis=1).drop(columns=['genre_list'])
sequel_df.drop(columns=['Movie genres'], inplace=True)  

# Display the updated DataFrame
sequel_df.head(5)



FileNotFoundError: [Errno 2] No such file or directory: '/Users/christmannjules/Downloads/sequel_film.csv'

### 2.1 Data Collection

Here we download the data from TMDB.

In [None]:
keywords_name = ["sequel", "book", "comic", "remake"]
keywords = [9663, 818, 9717, 9714] #keywords for the movies corresponding to the sequels, book or novel adaptations, and based on comics, and remakes

for i, keyword in enumerate(keywords) :
    get_data(keywords_name[i], "2013-01-01", keyword)

### 2.2 Use TMDB collection to get group the sequels together.

In [None]:
sequels_movies = pd.read_csv('data/movie_with_keyword_sequel.csv')
get_collection(sequels_movies)


### 2.3 Find the movies in the Dataset using TMDB collection

In [None]:
url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&titles=Harry%20Potter%20and%20the%20Deathly%20Hallows:%20Part%201"
requests.get(url).json()

In [4]:
from src.data.TMDB_Movies import get_wikipedia_id_from_title
from more_itertools import sliced

sequel_collections = pd.read_csv('data/collections/sequels.csv')
sequel_collections = sequel_collections.drop("Wikipedia movie ID", axis=1)

slices = sliced(seq=range(len(sequel_collections)), n=50)
sequel_collections_with_wiki_id = pd.DataFrame()

i = 0
for index in tqdm(slices, total=40):
    chunk = sequel_collections.iloc[index].copy()
    chunk["Wikipedia movie ID"] = chunk.apply(lambda x: get_wikipedia_id_from_title(x["title"], x["release_date"]), axis = 1)
    sequel_collections_with_wiki_id = pd.concat([sequel_collections_with_wiki_id, chunk], axis=0, ignore_index=True,
                                                sort=False) if sequel_collections_with_wiki_id is not None else chunk
    i += 1
    
sequel_collections_with_wiki_id.to_csv('data/collections/sequels_and_original_with_wiki_id.csv')

100%|██████████| 40/40 [24:37<00:00, 36.95s/it]


In [33]:
sequels = pd.read_csv('data/movie_with_keyword_sequel.csv')

slices = sliced(seq=range(len(sequels)), n=50)
sequels_with_wiki_id = pd.DataFrame()

i = 0

for index in tqdm(slices, total=17):
    chunk = sequels.iloc[index].copy()
    chunk["Wikipedia movie ID"] = chunk.apply(lambda x: get_wikipedia_id_from_title(x["title"], x["release_date"]), axis = 1)
    sequels_with_wiki_id = pd.concat([sequels_with_wiki_id, chunk], axis=0, ignore_index=True,
                                                sort=False) if sequels_with_wiki_id is not None else chunk
    i += 1
sequels_with_wiki_id.to_csv('data/movie_with_keyword_sequel_with_wiki_id.csv')

 42%|████▎     | 17/40 [27:12<36:49, 96.04s/it]  


In [6]:
sequel_collections_with_wiki_id["Wikipedia movie ID"] = sequel_collections_with_wiki_id["Wikipedia movie ID"].astype(float).astype('Int64')

#### 2.3.2 Find the movies in the Dataset using Wikipedia movie ID

In [73]:
movie_df = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
sequel_collections_with_wiki_id = pd.read_csv('data/collections/sequels_and_original_with_wiki_id.csv')

movie_df.rename(columns={0: 'Wikipedia movie ID', 1: "Freebase movie ID", 2: "Movie name",  3: "Movie release date", 4: "Movie box office revenue", 5: "Movie runtime", 6: "Movie languages", 7: "Movie countries", 8: "Movie genres"}, inplace=True)

movie_df_sequel_original = movie_df.join(sequel_collections_with_wiki_id.set_index('Wikipedia movie ID'), on="Wikipedia movie ID", how='inner')
movie_df_sequel_original.to_csv('data/movie_df_sequel_original.csv')

In [74]:
sequels_with_wiki_id = pd.read_csv('data/movie_with_keyword_sequel_with_wiki_id.csv')

movie_df_sequel_only = movie_df.join(sequels_with_wiki_id.set_index('Wikipedia movie ID'), on="Wikipedia movie ID", how='inner')
movie_df_sequel_only.to_csv('data/movie_df_sequel_only.csv')

#### 2.4 Remove movies without the same year between tmdb and wikipedia

In [78]:
movie_df_sequel_only["release year wiki"] = movie_df_sequel_only["Movie release date"].apply(lambda x: str(x)[:4] if str.isdigit(str(x)[:4]) else np.nan)
movie_df_sequel_only["release year tmdb"] = movie_df_sequel_only["release_date"].apply(lambda x: str(x)[:4] if str.isdigit(str(x)[:4]) else np.nan)

movie_df_sequel_only.drop(movie_df_sequel_only[movie_df_sequel_only["release year wiki"] != movie_df_sequel_only["release year tmdb"]].index, inplace=True)
movie_df_sequel_only["release year"] = movie_df_sequel_only["release year wiki"].astype(float)
movie_df_sequel_only.drop("release year tmdb", axis=1, inplace=True)
movie_df_sequel_only.drop("release year wiki", axis=1, inplace=True)


In [79]:
movie_df_sequel_original["release year wiki"] = movie_df_sequel_original["Movie release date"].apply(lambda x: str(x)[:4] if str.isdigit(str(x)[:4]) else np.nan)
movie_df_sequel_original["release year tmdb"] = movie_df_sequel_original["release_date"].apply(lambda x: str(x)[:4] if str.isdigit(str(x)[:4]) else np.nan)

movie_df_sequel_original.drop(movie_df_sequel_original[movie_df_sequel_original["release year wiki"] != movie_df_sequel_original["release year tmdb"]].index, inplace=True)
movie_df_sequel_original["release year"] = movie_df_sequel_original["release year wiki"].astype(float)
movie_df_sequel_original.drop("release year tmdb", axis=1, inplace=True)
movie_df_sequel_original.drop("release year wiki", axis=1, inplace=True)