### Here are our results

In [None]:
import numpy as np
import pandas as pd
import json
import sys
import os
import requests

from tqdm import tqdm

from src.data.TMDB_Movies import get_data
from src.data.TMDB_Movies import get_collection

# Use the notebook's working directory as the project root: make it importable
# and derive the data folder from it.
current_dir = os.path.abspath(os.curdir)
data_dir = os.path.join(current_dir, "data")
sys.path.append(current_dir)

In [None]:
# Load the TV Tropes clusters into a DataFrame and order the rows by character
# name (the 'char' field of each JSON payload).

# The file is read as two tab-separated columns: the trope name and a JSON string.
tv_tropes_path = os.path.join(data_dir, "MovieSummaries/tvtropes.clusters.txt")
tv_clusters = pd.read_csv(tv_tropes_path, sep='\t', header=None, names=['trope', 'details'])

# Decode each JSON string and spread its keys over separate columns, then
# replace the raw 'details' column with those columns.
details_expanded = tv_clusters['details'].apply(json.loads).apply(pd.Series)
tv_clusters = tv_clusters.join(details_expanded).drop(columns=['details'])

# Sorted copy with a fresh 0..n-1 index.
tv_clusters_char = tv_clusters.sort_values(by='char').reset_index(drop=True)

# Number of rows in the table.
len(tv_clusters_char)


In [None]:
# Count the rows per character name and keep the names that occur more than
# once (candidates for characters appearing in several movies).
char_count = tv_clusters_char.groupby('char').size()

occurs_more_than_once = char_count > 1
multiple_char = char_count[occurs_more_than_once]

print(multiple_char.head(5))
print(f"number of reoccuring character : {len(multiple_char)}")


In [None]:
# Keep only the rows whose character name occurs more than once, renumber them
# from zero and preview the first 25.
has_recurring_char = tv_clusters_char['char'].isin(multiple_char.index)
tv_clusters_char = tv_clusters_char[has_recurring_char].reset_index(drop=True)

tv_clusters_char.head(25)


In [None]:
# A character name attached to more than one distinct movie is taken as a sign
# of a sequel: count distinct movies per character and keep those above one.
movies_per_char = tv_clusters_char.groupby('char')['movie'].nunique()
sequel_char = movies_per_char[movies_per_char > 1]

in_several_movies = tv_clusters_char['char'].isin(sequel_char.index)
tv_clusters_sequel = tv_clusters_char[in_several_movies].reset_index(drop=True)


In [None]:
# Preview the first 35 rows of tv_clusters_sequel.
tv_clusters_sequel.head(35)

In [None]:
# List of sequel titles (must provide a 'Title' column).
# NOTE: this file used to be read from an absolute path inside one author's
# Downloads folder, which fails on every other machine. It is now expected at
# data/sequel_film.csv inside the project -- copy the file there before running.
sequels_movies = pd.read_csv(os.path.join(data_dir, "sequel_film.csv"))

movie_columns = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue', 'Movie runtime', 'Movie languages', 'Movie countries', 'Movie genres']
movie_df = pd.read_csv(os.path.join(data_dir, "MovieSummaries/movie.metadata.tsv"), sep='\t', names=movie_columns, header=None)

# Keep the metadata rows whose title appears in the sequel list, ordered by
# title. The non-inplace chain yields a new frame, so the column assignments
# below no longer write into a filtered slice of movie_df (which raised
# SettingWithCopyWarning with the previous inplace calls).
title_is_sequel = movie_df['Movie name'].isin(sequels_movies['Title'])
sequel_df = (
    movie_df[title_is_sequel]
    .sort_values(by=['Movie name'])
    .reset_index(drop=True)
)

# The languages/countries/genres cells hold dict-like strings. Keep the first
# quoted value that follows a colon for languages and countries, and every such
# value for genres. expand=False makes str.extract return a Series.
sequel_df['Movie languages'] = sequel_df['Movie languages'].str.extract(r':\s*"([^"]+)"', expand=False)
sequel_df['Movie countries'] = sequel_df['Movie countries'].str.extract(r':\s*"([^"]+)"', expand=False)
sequel_df['genre_list'] = sequel_df['Movie genres'].str.findall(r':\s*"([^"]+)"')

# Expand the genre_list column into separate columns for each genre
genre_columns = sequel_df['genre_list'].apply(pd.Series)
genre_columns.columns = [f'Genre_{i+1}' for i in genre_columns.columns]

# Concatenate the original DataFrame with the new genre columns and drop the
# two columns they replace.
sequel_df = pd.concat([sequel_df, genre_columns], axis=1).drop(columns=['genre_list', 'Movie genres'])

# Display the updated DataFrame
sequel_df.head(5)



### 2.1 Data Collection

Here we download the data from TMDB.

In [None]:
from src.data.TMDB_Movies import get_movie_data_extended, get_movie_metadatalike_db

# Short labels and, in the same order, the keyword ids they stand for:
# sequels, book or novel adaptations, based on comics, remakes.
keywords_name = ["sequels", "book", "comics", "remake"]
keywords = [9663, 818, 9717, 9714]

start_date = "1880-01-01"
end_date = "2010-01-01"

for label, keyword in zip(keywords_name, keywords):
    # NOTE(review): the date literal below equals end_date here, and it is also
    # exactly what the 2010-2024 cell passes through its start_date. Confirm in
    # get_data how this argument is interpreted, otherwise both cells request
    # the same data.
    data = get_data(label, "2010-01-01", keyword)
    # NOTE(review): file_name is computed but never passed to anything.
    file_name = f"{label}_{start_date[:4]}_{end_date[:4]}.csv"
    data_extended = get_movie_data_extended(data, label)
    get_movie_metadatalike_db(data_extended, label)

### 2.1.2 Data Collection for the years 2010-2024

In [None]:
# Same download as the previous cell, for the 2010-2024 period.
start_date = "2010-01-01"
end_date = "2024-01-01"

for label, keyword in zip(keywords_name, keywords):
    # NOTE(review): only start_date reaches get_data; end_date is used solely
    # in file_name, which itself is never passed to anything.
    data = get_data(label, start_date, keyword)
    file_name = f"{label}_{start_date[:4]}_{end_date[:4]}.csv"
    data_extended = get_movie_data_extended(data, label)
    get_movie_metadatalike_db(data_extended, label)

### 2.2 Use TMDB collections to group the sequels together

In [None]:
# Run get_collection on the sequel list of each period; both calls use the
# data/sequels folder as `path` and differ only in the `years` tag.
sequels_csv_1880_2010 = 'data/sequels/movie_with_sequels_1880_2010.csv'
sequels_csv_2010_2024 = 'data/sequels/movie_with_sequels_2010_2024.csv'

sequels_movies = pd.read_csv(sequels_csv_1880_2010)
get_collection(sequels_movies, path="data/sequels", years="1880_2010")

sequels_movies_new = pd.read_csv(sequels_csv_2010_2024)
get_collection(sequels_movies_new, path="data/sequels", years="2010_2024")

### 2.3 Find the movies in the Dataset using TMDB collection

In [None]:
from more_itertools import sliced
from src.data.TMDB_Movies import get_wikipedia_id_from_title

def get_wikipedia_id_for_db(df, file, chunk_size=50):
    """Add a "Wikipedia movie ID" column to ``df``, processing it in chunks.

    For every row, ``get_wikipedia_id_from_title`` is called with the row's
    "title" and "release_date" values. After each chunk the rows processed so
    far are written to ``file``, so an interrupted run leaves a partial CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "title" and "release_date" columns. It is not modified.
    file : str or file-like
        Destination passed to ``DataFrame.to_csv`` after every chunk.
    chunk_size : int, default 50
        Number of rows looked up between two checkpoints.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with the extra "Wikipedia movie ID" column and a
        fresh 0..n-1 index. For an empty ``df`` an empty frame with that
        column is returned (and written) instead of ``None``, so callers can
        always call ``.to_csv`` on the result.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    if len(df) == 0:
        # Nothing to look up: still produce a well-formed result and file.
        wiki_df = df.copy()
        wiki_df["Wikipedia movie ID"] = pd.Series(index=wiki_df.index, dtype=object)
        wiki_df.to_csv(file)
        return wiki_df

    processed = []
    wiki_df = None
    # tqdm takes the number of steps from the range itself, so a partial last
    # chunk is counted as well (len(df) // chunk_size left it out).
    for start in tqdm(range(0, len(df), chunk_size)):
        chunk = df.iloc[start:start + chunk_size].copy()
        chunk["Wikipedia movie ID"] = chunk.apply(
            lambda row: get_wikipedia_id_from_title(row["title"], row["release_date"]),
            axis=1,
        )
        processed.append(chunk)
        wiki_df = pd.concat(processed, axis=0, ignore_index=True, sort=False)
        # Checkpoint: overwrite the file with everything resolved so far.
        wiki_df.to_csv(file)
    return wiki_df

In [None]:
# Look up the Wikipedia id of every movie listed in the collections file and
# save the result next to it.
collections_csv = 'data/collections/sequels.csv'
collections_wiki_csv = "data/collections/sequels_and_original_with_wiki_id.csv"

sequel_collections = pd.read_csv(collections_csv)
sequel_collections_with_wiki_id = get_wikipedia_id_for_db(sequel_collections, collections_wiki_csv)
sequel_collections_with_wiki_id.to_csv(collections_wiki_csv)

In [None]:
# Look up the Wikipedia id of every movie carrying the sequel keyword and save
# the result.
keyword_sequel_csv = 'data/movie_with_keyword_sequel.csv'
keyword_sequel_wiki_csv = 'data/movie_with_keyword_sequel_with_wiki_id.csv'

sequels = pd.read_csv(keyword_sequel_csv)
sequels_with_wiki_id = get_wikipedia_id_for_db(sequels, keyword_sequel_wiki_csv)
sequels_with_wiki_id.to_csv(keyword_sequel_wiki_csv)

In [None]:
# Look up the Wikipedia id of every book-keyword movie (1880-2010 file) and
# save the result.
book_csv = "data/book/movie_with_book_1880_2010.csv"
book_wiki_csv = 'data/book/book_with_wiki_id_1880_2010.csv'

book = pd.read_csv(book_csv)
book_with_wiki_id = get_wikipedia_id_for_db(book, book_wiki_csv)
book_with_wiki_id.to_csv(book_wiki_csv)

In [None]:
# Look up the Wikipedia id of every comics-keyword movie (1880-2010 file) and
# save the result.
comics_csv = "data/comics/movie_with_comics_1880_2010.csv"
comics_wiki_csv = 'data/comics/comics_with_wiki_id_1880_2010.csv'

comics = pd.read_csv(comics_csv)
comics_with_wiki_id = get_wikipedia_id_for_db(comics, comics_wiki_csv)
comics_with_wiki_id.to_csv(comics_wiki_csv)

In [None]:
# Look up the Wikipedia id of every remake-keyword movie (1880-2010 file) and
# save the result.
remake_csv = "data/remake/movie_with_remake_1880_2010.csv"
remake_wiki_csv = 'data/remake/remake_with_wiki_id.csv'

remake = pd.read_csv(remake_csv)
remake_with_wiki_id = get_wikipedia_id_for_db(remake, remake_wiki_csv)
remake_with_wiki_id.to_csv(remake_wiki_csv)

#### 2.3.2 Find the movies in the Dataset using Wikipedia movie ID

In [None]:
# Load the movie metadata (the file has no header row) and the collection
# movies that were given a Wikipedia id earlier.
movie_df = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
sequel_collections_with_wiki_id = pd.read_csv('data/collections/sequels_and_original_with_wiki_id.csv')

# Name the positional columns 0..8 of the metadata file.
metadata_column_names = [
    'Wikipedia movie ID',
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages",
    "Movie countries",
    "Movie genres",
]
movie_df = movie_df.rename(columns=dict(enumerate(metadata_column_names)))

# Inner join on the Wikipedia id: only metadata rows that have a match in the
# collection table are kept.
collections_by_wiki_id = sequel_collections_with_wiki_id.set_index('Wikipedia movie ID')
movie_df_sequel_original = movie_df.join(collections_by_wiki_id, on="Wikipedia movie ID", how='inner')
movie_df_sequel_original.to_csv('data/movie_df_sequel_original.csv')

In [None]:
# Same inner join on the Wikipedia id, this time with the sequel-keyword
# movies read back from disk.
sequels_with_wiki_id = pd.read_csv('data/movie_with_keyword_sequel_with_wiki_id.csv')

sequels_by_wiki_id = sequels_with_wiki_id.set_index('Wikipedia movie ID')
movie_df_sequel_only = movie_df.join(sequels_by_wiki_id, on="Wikipedia movie ID", how='inner')
movie_df_sequel_only.to_csv('data/movie_df_sequel_only.csv')

In [None]:
def _join_metadata_on_wiki_id(with_wiki_id):
    """Inner-join movie_df with ``with_wiki_id`` on the "Wikipedia movie ID" column."""
    keyed = with_wiki_id.set_index('Wikipedia movie ID')
    return movie_df.join(keyed, on="Wikipedia movie ID", how='inner')


# NOTE: these three tables come from the in-memory results of the lookup cells,
# not from the CSV files written there.
movie_df_book = _join_metadata_on_wiki_id(book_with_wiki_id)
movie_df_book.to_csv('data/movie_df_book.csv')

movie_df_comics = _join_metadata_on_wiki_id(comics_with_wiki_id)
movie_df_comics.to_csv('data/movie_df_comics.csv')

movie_df_remake = _join_metadata_on_wiki_id(remake_with_wiki_id)
movie_df_remake.to_csv('data/movie_df_remake.csv')

#### 2.4 Remove movies whose release year differs between TMDB and Wikipedia

In [None]:
def _leading_year(value):
    """Return the first four characters of ``str(value)`` if they are all digits, else NaN."""
    prefix = str(value)[:4]
    return prefix if prefix.isdigit() else np.nan


def ensure_same_year(df):
    """Keep the rows whose two release dates start with the same year.

    The year is read from the first four characters of "Movie release date"
    and of "release_date". A row is kept only when both prefixes are digits and
    are equal; rows where either year cannot be read are removed.

    Rows are selected with a positional boolean mask rather than dropped by
    index label. The input frames come from ``movie_df.join(..., on=...)``,
    where several rows can share an index label; dropping by label removed
    matching rows together with a mismatching row that had the same label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Movie release date" and "release_date" columns. It is left
        unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame with the kept rows (original index labels preserved) and a
        float "release year" column.
    """
    wiki_year = df["Movie release date"].apply(_leading_year)
    tmdb_year = df["release_date"].apply(_leading_year)

    # NaN never compares equal, so rows with an unreadable year are excluded.
    same_year = (wiki_year == tmdb_year).to_numpy(dtype=bool)

    result = df.loc[same_year].copy()
    # Assign by position so duplicated index labels cannot cause misalignment.
    result["release year"] = wiki_year[same_year].astype(float).to_numpy()
    return result
    

In [None]:
# Filter every joined table down to the rows whose two release years agree ...
movie_df_sequel_only = ensure_same_year(movie_df_sequel_only)
movie_df_sequel_original = ensure_same_year(movie_df_sequel_original)
movie_df_book = ensure_same_year(movie_df_book)
movie_df_comics = ensure_same_year(movie_df_comics)
movie_df_remake = ensure_same_year(movie_df_remake)

# ... then overwrite the CSV written for each table by the join cells.
filtered_tables = [
    (movie_df_sequel_only, 'data/movie_df_sequel_only.csv'),
    (movie_df_sequel_original, 'data/movie_df_sequel_original.csv'),
    (movie_df_book, 'data/movie_df_book.csv'),
    (movie_df_comics, 'data/movie_df_comics.csv'),
    (movie_df_remake, 'data/movie_df_remake.csv'),
]
for filtered_table, csv_path in filtered_tables:
    filtered_table.to_csv(csv_path)

In [None]:
from src.data.TMDB_Movies import fill_missing_value

def _revenue_filler(extended_df):
    """Build the row function handed to DataFrame.apply for one extended table.

    For each row it selects the rows of ``extended_df`` with the same "id" and
    passes both to fill_missing_value, together with the column names
    "Movie box office revenue" and "revenue".
    """
    def _fill(row):
        same_id = extended_df[extended_df["id"] == row["id"]]
        return fill_missing_value(row, same_id, "Movie box office revenue", "revenue")
    return _fill


# NOTE(review): the result of this first apply is discarded. Unless
# fill_missing_value has side effects that reach movie_df_sequel_original,
# the table is left unchanged -- confirm the intent.
movie_extended_df = pd.read_csv("data/collections/sequels_extended_1880_2010.csv")
movie_df_sequel_original.apply(_revenue_filler(movie_extended_df), axis=1)

# Being the last expression of the cell, this apply result is only displayed.
sequels_only_extended_df = pd.read_csv("data/sequels/sequels_extended.csv")
movie_df_sequel_only.apply(_revenue_filler(sequels_only_extended_df), axis=1)