In [45]:
import pandas as pd

file_path = 'plot_summaries.txt'
output_file_path = 'filtered_plot_summaries.tsv'

# Summaries shorter than MIN_SUMMARY_LENGTH characters are skipped; for the
# others only the final ENDING_LENGTH characters are kept.
MIN_SUMMARY_LENGTH = 1000
ENDING_LENGTH = 400

data = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Each line is expected to be "<movie id><TAB><summary>".
        try:
            movie_id, summary = line.split('\t', 1)
        except ValueError:
            # No tab on the line: report it and move on.
            print(f"Ligne ignorée (mauvais format) : {line}")
            continue

        summary = summary.strip()
        if len(summary) >= MIN_SUMMARY_LENGTH:
            data.append({'Movie_ID': movie_id, 'Summary': summary[-ENDING_LENGTH:].strip()})

# Naming the columns explicitly keeps the header row in the TSV even when no
# summary qualified, so a later read can still find 'Movie_ID' and 'Summary'.
df = pd.DataFrame(data, columns=['Movie_ID', 'Summary'])
df.to_csv(output_file_path, sep='\t', index=False)

print(f"Fichier TSV créé avec succès : {output_file_path}")

Fichier TSV créé avec succès : filtered_plot_summaries.tsv


In [46]:
import pandas as pd
from textblob import TextBlob

# Locations of the two tab-separated inputs read below
metadata_path = 'movie.metadata.tsv'
summaries_path = 'filtered_plot_summaries.tsv'

# The metadata file is read without a header row, so its nine columns are
# named by hand afterwards; the ID (column 0) is loaded as a string.
movie_data = pd.read_csv(metadata_path, sep='\t', header=None, dtype={0: str})
movie_data.columns = [
    'Movie_ID',
    'Other_Column',
    'Title',
    'Release_Date',
    'Revenue',
    'Runtime',
    'Languages',
    'Country',
    'Genres',
]

# Movie_ID is read as a string here too
summaries_data = pd.read_csv(summaries_path, sep='\t', dtype={'Movie_ID': str})

def analyze_sentiment(summary):
    """Convert the TextBlob polarity of ``summary`` into a 1-5 ending score.

    Score bands (polarity ``p``):
        5  very happy ending : p > 0.5
        4  happy ending      : 0.13 < p <= 0.5
        3  neutral ending    : -0.13 <= p <= 0.13
        2  sad ending        : -0.5 < p < -0.13
        1  very sad ending   : everything else (p <= -0.5)
    """
    polarity = TextBlob(summary).sentiment.polarity

    # Bands are checked from the top down, so each test only needs its lower
    # bound: the upper bound is implied by the checks that already failed.
    if polarity > 0.5:
        return 5
    if polarity > 0.13:
        return 4
    if polarity >= -0.13:
        return 3
    if polarity > -0.5:
        return 2
    return 1

# Keep only the movies that appear in both the metadata and the summaries
merged_data = pd.merge(
    left=movie_data,
    right=summaries_data,
    how='inner',
    on='Movie_ID',
)

# Score every stored summary text with analyze_sentiment (1-5 scale)
summary_scores = merged_data['Summary'].apply(analyze_sentiment)
merged_data['Score'] = summary_scores

output_file_path = 'movies_dataset_final.tsv'
merged_data.to_csv(output_file_path, sep='\t', index=False)


In [38]:
import requests
from tqdm import tqdm
import os
import pickle
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

# API key and base URL setup.
# The key is taken from the TMDB_API_KEY environment variable when it is set.
# NOTE(review): the literal fallback only keeps the notebook runnable as before;
# a key stored in a notebook should be treated as leaked - rotate it and then
# remove the fallback.
API_KEY = os.environ.get('TMDB_API_KEY', '9923aaa2a3b2777bfdeba7f76c97d212')
BASE_SEARCH_URL = 'https://api.themoviedb.org/3/search/movie'
BASE_MOVIE_URL = 'https://api.themoviedb.org/3/movie'

# Load the existing movie dataset
file_path = 'movies_dataset_final.tsv'
movies_df = pd.read_csv(file_path, sep='\t')

# Define function to fetch movie data from TMDB API
def get_movie_data_from_tmdb(wikipedia_id, title, timeout=10):
    """Search TMDB for ``title`` and return the first hit's overview and id.

    Args:
        wikipedia_id: Key of the movie in the local dataset; returned
            unchanged so the caller can match the result to its request.
        title: Text sent as the search query.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(wikipedia_id, info)`` where ``info`` is
        ``{"overview": ..., "tmdb_id": ...}`` built from the first search
        result, or ``{}`` when the request fails, the status is not 200, the
        body is not valid JSON, or the search returned no results.
    """
    params = {
        'api_key': API_KEY,
        'query': title,
        'language': 'en-US'
    }
    try:
        response = requests.get(BASE_SEARCH_URL, params=params, timeout=timeout)
        if response.status_code != 200:
            return wikipedia_id, {}
        # .get() tolerates a payload without a 'results' key.
        results = response.json().get('results') or []
    except (requests.RequestException, ValueError):
        # Network failure, timeout or undecodable body: report "no data"
        # instead of raising, so one bad request cannot abort a whole batch
        # of worker threads.
        return wikipedia_id, {}

    if not results:
        return wikipedia_id, {}

    first_hit = results[0]
    return wikipedia_id, {
        "overview": first_hit.get('overview', ''),
        "tmdb_id": first_hit.get('id', None),
    }

# Define functions to get specific movie details and credits
def get_movie_details(wikipedia_id, tmdb_id, timeout=10):
    """Fetch the TMDB movie record for ``tmdb_id``.

    Args:
        wikipedia_id: Key of the movie in the local dataset; returned unchanged.
        tmdb_id: TMDB identifier appended to the movie endpoint URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(wikipedia_id, "details", payload)`` where ``payload`` is the decoded
        JSON body on HTTP 200 and ``{}`` on any other status, on a network
        error or timeout, or when the body is not valid JSON.
    """
    try:
        response = requests.get(
            f"{BASE_MOVIE_URL}/{tmdb_id}",
            params={'api_key': API_KEY, 'language': 'en-US'},
            timeout=timeout,
        )
        details = response.json() if response.status_code == 200 else {}
    except (requests.RequestException, ValueError):
        # Treat transport/decoding failures like a non-200 response so a
        # single failed request does not raise out of the worker thread.
        details = {}
    return wikipedia_id, "details", details

def get_movie_credits(wikipedia_id, tmdb_id, timeout=10):
    """Fetch the TMDB credits record for ``tmdb_id``.

    Args:
        wikipedia_id: Key of the movie in the local dataset; returned unchanged.
        tmdb_id: TMDB identifier used to build the ``/credits`` endpoint URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(wikipedia_id, "credits", payload)`` where ``payload`` is the decoded
        JSON body on HTTP 200 and ``{}`` on any other status, on a network
        error or timeout, or when the body is not valid JSON.
    """
    try:
        response = requests.get(
            f"{BASE_MOVIE_URL}/{tmdb_id}/credits",
            params={'api_key': API_KEY},
            timeout=timeout,
        )
        credits = response.json() if response.status_code == 200 else {}
    except (requests.RequestException, ValueError):
        # Treat transport/decoding failures like a non-200 response so a
        # single failed request does not raise out of the worker thread.
        credits = {}
    return wikipedia_id, "credits", credits

# Start from the cached TMDB data when a cache file exists, otherwise from scratch.
# pickle.load can execute arbitrary code: only load cache files you created yourself.
DATA_FOLDER = '.'
movie_data_from_tmdb = {}
if os.path.exists(f'{DATA_FOLDER}/movie_data_from_tmdb.pkl'):
    with open(f'{DATA_FOLDER}/movie_data_from_tmdb.pkl', 'rb') as file:
        movie_data_from_tmdb = pickle.load(file)

# Queue every (key, title) pair whose key is not in the cache yet
movies_to_process = []
for wiki_id, title in zip(movies_df['Other_Column'], movies_df['Title']):
    if wiki_id not in movie_data_from_tmdb:
        movies_to_process.append((wiki_id, title))

# Run the searches on 10 worker threads and store each non-empty result
# as soon as its request completes
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(get_movie_data_from_tmdb, movie_id, title)
        for movie_id, title in movies_to_process
    ]
    progress = tqdm(as_completed(futures), total=len(futures), desc="Fetching TMDB IDs")
    for future in progress:
        wikipedia_id, movie_info = future.result()
        if not movie_info:
            continue
        movie_data_from_tmdb[wikipedia_id] = movie_info


Fetching TMDB IDs: 100%|██████████| 638/638 [00:05<00:00, 109.47it/s]


In [39]:
# Checkpoint the search results before starting the second round of requests
with open(f'{DATA_FOLDER}/movie_data_from_tmdb_only_id.pkl', 'wb') as file:
    pickle.dump(movie_data_from_tmdb, file)

# Only entries that have a TMDB id and no 'details' key yet need fetching
movies_to_process = []
for wiki_id, info in movie_data_from_tmdb.items():
    if info.get('tmdb_id') and 'details' not in info:
        movies_to_process.append((wiki_id, info['tmdb_id']))

# Two requests per movie (details, then credits), run on 10 worker threads
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(fetcher, wiki_id, tmdb_id)
        for wiki_id, tmdb_id in movies_to_process
        for fetcher in (get_movie_details, get_movie_credits)
    ]

    progress = tqdm(as_completed(futures), total=len(futures), desc="Fetching Details and Credits")
    for future in progress:
        # data_type is "details" or "credits" and names the key to store under
        wikipedia_id, data_type, data = future.result()
        if wikipedia_id in movie_data_from_tmdb:
            movie_data_from_tmdb[wikipedia_id][data_type] = data

# Write the enriched records to the cache file
with open(f'{DATA_FOLDER}/movie_data_from_tmdb.pkl', 'wb') as file:
    pickle.dump(movie_data_from_tmdb, file)

# Define helper functions for new data fields
def find_director(movie_data):
    """Return the name of the first crew member whose job is 'Director'.

    Reads ``movie_data['credits']['crew']``; returns ``pd.NA`` when either key
    is missing, when no crew member has the job 'Director', or when that
    crew member has no 'name'.
    """
    crew = movie_data.get('credits', {}).get('crew', [])
    director_names = (
        member.get('name', pd.NA)
        for member in crew
        if member.get('job') == 'Director'
    )
    # next() stops at the first match; pd.NA is the default when there is none
    return next(director_names, pd.NA)


def find_vote_average(movie_data):
    """Return ``movie_data['details']['vote_average']``, or ``pd.NA`` if either key is missing."""
    details = movie_data.get('details', {})
    return details.get('vote_average', pd.NA)

def find_revenue(movie_data):
    """Return ``movie_data['details']['revenue']``, or ``pd.NA`` if either key is missing."""
    details = movie_data.get('details', {})
    return details.get('revenue', pd.NA)

# Add one column per TMDB-derived field. Each movie is looked up in the TMDB
# cache by its 'Other_Column' value; movies missing from the cache are passed
# to the finder as an empty dict.
tmdb_field_finders = (
    ('director', find_director),
    ('vote_average', find_vote_average),
    ('revenue', find_revenue),
)
for new_column, finder in tmdb_field_finders:
    movies_df[new_column] = movies_df['Other_Column'].map(
        lambda key: finder(movie_data_from_tmdb.get(key, {}))
    )

# Remove 'Revenue' and 'Movie_ID_y'; either may be absent, which is not an error
movies_df = movies_df.drop(columns=['Revenue', 'Movie_ID_y'], errors='ignore')

# Overwrite the file the dataset was loaded from
movies_df.to_csv(file_path, sep='\t', index=False)
print(f"Updated dataset saved back to '{file_path}' with new columns: director, vote_average, and revenue")



Fetching Details and Credits: 0it [00:00, ?it/s]


Updated dataset saved back to 'movies_dataset_final.tsv' with new columns: director, vote_average, and revenue


In [40]:
import pandas as pd

# Read the tab-separated dataset that this cell cleans and then overwrites
movies_df = pd.read_csv(
    'movies_dataset_final.tsv',
    sep='\t',
)

# Helper function to clean up and extract country names
def extract_countries(country_data):
    """Turn a '{key: name, ...}' style string into 'name, name, ...'.

    The input is split on ", "; for every piece containing a colon, the text
    after the last colon is kept with surrounding whitespace, double quotes
    and braces removed.

    Returns:
        The names joined with ", ", or ``None`` when the value is missing,
        contains no "{", or yields no names.
    """
    if pd.isna(country_data):
        return None
    if '{' not in country_data:
        return None

    names = [
        piece.split(":")[-1].strip().replace("\"", "").replace("}", "").replace("{", "")
        for piece in country_data.split(", ")
        if ':' in piece
    ]
    if not names:
        return None
    return ", ".join(names)

# Replace 'Country' with its cleaned form and drop the rows for which
# extract_countries returned None.
# NOTE(review): this cell overwrites the file it reads. extract_countries
# returns None for any value without "{", so running the cell a second time on
# the already-cleaned file would drop every row.
movies_df = (
    movies_df
    .assign(Country=movies_df['Country'].apply(extract_countries))
    .dropna(subset=['Country'])
    .copy()
)

# Write the result back over the original file
movies_df.to_csv('movies_dataset_final.tsv', sep='\t', index=False)
print("Dataset updated with cleaned 'Country' column and saved as 'movies_dataset_final.tsv'")


Dataset updated with cleaned 'Country' column and saved as 'movies_dataset_final.tsv'


In [41]:
import pandas as pd

# Read the tab-separated dataset that this cell cleans and then overwrites
movies_df = pd.read_csv(
    'movies_dataset_final.tsv',
    sep='\t',
)

# Helper function to clean up and extract language names
def extract_languages(language_data, max_length=40):
    """Turn a '{key: name, ...}' style string into 'name, name, ...'.

    The input is split on ", "; for every piece containing a colon, the text
    after the last colon is kept with double quotes, closing braces and the
    substrings "Language"/"language" removed.

    Args:
        language_data: Raw cell value; may be missing (NaN/None).
        max_length: Names longer than this many characters are discarded.

    Returns:
        The remaining names joined with ", ", or ``None`` when the value is
        missing, contains no "{", or no usable name remains.
    """
    if pd.isna(language_data) or '{' not in language_data:
        return None

    languages = []
    for item in language_data.split(", "):
        if ':' not in item:
            continue
        language_name = (
            item.split(":")[-1]
            .strip()
            .replace("\"", "")
            .replace("}", "")
            .replace("Language", "")
            .replace("language", "")
            .strip()
        )
        # A name can become empty once "Language" is removed; skipping it
        # avoids blank entries such as ", French" in the joined result.
        if language_name and len(language_name) <= max_length:
            languages.append(language_name)

    return ", ".join(languages) if languages else None

# Apply the function to clean and reformat the 'Languages' column.
# NOTE(review): this cell overwrites the file it reads. extract_languages
# returns None for any value without "{", so running the cell a second time on
# the already-cleaned file is expected to drop every row.
movies_df['Languages'] = movies_df['Languages'].apply(extract_languages)

# Drop rows with missing or invalid language data in 'Languages'
movies_df = movies_df.dropna(subset=['Languages']).copy()

# Save the updated dataset with the cleaned 'Languages' column back to the original file
movies_df.to_csv('movies_dataset_final.tsv', sep='\t', index=False)
# The message now names the file that is actually written (it previously said 'movies_dataset_finals.tsv')
print("Dataset updated with cleaned 'Languages' column and saved as 'movies_dataset_final.tsv'")


Dataset updated with cleaned 'Languages' column and saved as 'movies_dataset_finals.tsv'


In [43]:
import pandas as pd
import re

# Read the tab-separated dataset that this cell cleans and then overwrites
movies_df = pd.read_csv(
    'movies_dataset_final.tsv',
    sep='\t',
)

# Helper function to clean up and extract genre names
def extract_genres(genre_data):
    """Turn a '{key: name, ...}' style string into 'name, name, ...'.

    The input is split on ", "; for every piece containing a colon, the text
    after the last colon is kept with double quotes and closing braces
    removed. The whole words Movie/Movies/Film/Films (capitalised or
    lower-case) are then deleted from each name.

    Returns:
        The remaining names joined with ", ", or ``None`` when the value is
        missing, contains no "{", or no non-empty name remains.
    """
    if pd.isna(genre_data) or '{' not in genre_data:
        return None

    genres = []
    for item in genre_data.split(", "):
        if ':' not in item:
            continue
        genre_name = item.split(":")[-1].strip().replace('"', '').replace('}', '')
        # Remove words like "Movie", "Movies", "Film", etc.
        genre_name = re.sub(r'\b(Movie|Movies|Film|Films|movie|movies|film|films)\b', '', genre_name)
        # Deleting a word from the middle leaves a double space, and deleting
        # the only word leaves nothing: normalise whitespace and skip blanks.
        genre_name = " ".join(genre_name.split())
        if genre_name:
            genres.append(genre_name)

    return ", ".join(genres) if genres else None

# Replace 'Genres' with its cleaned form and drop the rows for which
# extract_genres returned None.
# NOTE(review): this cell overwrites the file it reads. extract_genres returns
# None for any value without "{", so running the cell a second time on the
# already-cleaned file is expected to drop every row.
movies_df = (
    movies_df
    .assign(Genres=movies_df['Genres'].apply(extract_genres))
    .dropna(subset=['Genres'])
    .copy()
)

# Write the result back over the original file
movies_df.to_csv('movies_dataset_final.tsv', sep='\t', index=False)
print("Dataset updated with cleaned 'Genres' column and saved as 'movies_dataset_final.tsv'")


Dataset updated with cleaned 'Genres' column and saved as 'movies_dataset_final.tsv'


In [44]:
import pandas as pd
import re

# Read the tab-separated dataset that this cell cleans and then overwrites
movies_df = pd.read_csv(
    'movies_dataset_final.tsv',
    sep='\t',
)

# Define a function to extract the 4-digit year from various date formats
def extract_year(date_str):
    """Return the first standalone 4-digit number in ``date_str`` as a string.

    The value is converted with ``str()`` first, so non-string inputs are
    accepted. Returns ``None`` when no run of exactly four digits delimited by
    word boundaries is found.
    """
    year_match = re.search(r'\b(\d{4})\b', str(date_str))
    if year_match is None:
        return None
    return year_match.group(1)

# Apply the function to the 'Release_Date' column to extract only the year
movies_df['Release_Date'] = movies_df['Release_Date'].apply(extract_year)

# Drop rows with no valid year. The explicit .copy() matches the other
# cleaning cells: the column assignment below then always targets an
# independent frame (presumably what avoids pandas' chained-assignment
# warning on some versions - confirm against the pandas version in use).
movies_df = movies_df.dropna(subset=['Release_Date']).copy()

# Convert 'Release_Date' to an integer type for further analysis
movies_df['Release_Date'] = movies_df['Release_Date'].astype(int)

# Save the cleaned dataset back to the original file
movies_df.to_csv('movies_dataset_final.tsv', sep='\t', index=False)
print("Dataset updated with cleaned 'Release_Date' years in 'movies_dataset_final.tsv'")


Dataset updated with cleaned 'Release_Date' years in 'movies_dataset_final.tsv'
