### PURPOSE
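This notebook converts the Wikidata film dataset (`film_wiki_data.csv`) to USD, looks up each film's Wikipedia page ID, and merges the result with the original movie metadata dataset (`movie.metadata.tsv`) on that page ID.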

In [7]:
import pandas as pd
import numpy as np
import json
import requests
import matplotlib.pyplot as plt
import ast

# Directory holding the input files this notebook reads and the output files it writes
PATH = './data'

### UTILS

In [5]:
# Conversion tables already parsed in this session, keyed by file path, so the
# JSON file is read from disk once instead of on every call (the function is
# called for every non-USD row). Re-running this cell resets the cache.
_CONVERSION_RATES_CACHE = {}


def _load_conversion_rates(rates_file):
    """Return the parsed conversion-rate JSON stored at ``rates_file``, reading it at most once."""
    if rates_file not in _CONVERSION_RATES_CACHE:
        with open(rates_file, 'r') as file:
            _CONVERSION_RATES_CACHE[rates_file] = json.load(file)
    return _CONVERSION_RATES_CACHE[rates_file]


# Function to convert currency using exchange rate json file
def convert_to_usd(amount, currency):
    """Convert ``amount`` using the rate stored for ``currency`` in the exchange rate file.

    The rates are loaded from ``{PATH}/usd_conversion_rates.json`` and the
    amount is divided by the rate found under ``currency``.

    Args:
        amount: Value to convert; anything ``float()`` accepts.
        currency: Key to look up in the conversion table.

    Returns:
        ``float(amount) / float(rate)``. When ``currency`` is not in the table
        the rate defaults to 1, i.e. the amount is returned unchanged as a float.

    Raises:
        FileNotFoundError: If the rates file does not exist on first use.
        ZeroDivisionError: If the stored rate is 0.
    """
    conversion_rates = _load_conversion_rates(f"{PATH}/usd_conversion_rates.json")

    rate = conversion_rates.get(currency, 1)  # Default to 1 if currency not found
    return float(amount) / float(rate)


def get_wikipedia_page_id(film_name, timeout=10):
    """Look up the English Wikipedia page ID for a page title.

    Sends an ``action=query`` request to the Wikipedia API for ``film_name``
    and returns the first key of the ``query.pages`` object in the response.

    Args:
        film_name: Page title to look up; spaces are replaced with underscores.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole notebook.

    Returns:
        The page ID as a string (keys of the decoded JSON object are strings),
        or ``None`` when the title is not a non-empty string, the request
        fails, the status code is not 200, or the response carries no
        ``query.pages`` entry. A message is printed in each ``None`` case.
        NOTE(review): the API appears to report unknown titles under the key
        "-1"; that value is returned as-is, so callers should treat it as
        "not found" - confirm against the MediaWiki API documentation.
    """
    # A missing label (e.g. NaN) or blank title cannot be looked up.
    if not isinstance(film_name, str) or not film_name.strip():
        print(f"No page found for {film_name}")
        return None

    # Replace spaces with underscores for the Wikipedia API format
    film_name_formatted = film_name.replace(' ', '_')

    url = "https://en.wikipedia.org/w/api.php"

    params = {
        'action': 'query',
        'format': 'json',
        'titles': film_name_formatted
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failed fetch.
        print("Failed to fetch data from Wikipedia")
        return None

    if response.status_code != 200:
        print("Failed to fetch data from Wikipedia")
        return None

    try:
        pages = response.json()['query']['pages']
        # Default of None covers an empty ``pages`` object (no StopIteration).
        page_id = next(iter(pages), None)
    except (KeyError, TypeError, ValueError):
        # ValueError covers a body that is not valid JSON.
        page_id = None

    if page_id is None:
        print(f"No page found for {film_name}")
    return page_id


### DATA PROCESSING AND CURRENCY CONVERSION

In [None]:
# Read the data into a DataFrame
df = pd.read_csv(f"{PATH}/film_wiki_data.csv")

# Some films have multiple rows (box office reported per country): keep, for each
# 'filmLabel', the row with the max 'boxOffice'.
# Rows without a box office value are dropped first. A film whose rows all lack
# 'boxOffice' would otherwise make idxmax() produce NaN (or raise, depending on the
# pandas version), and the df.loc lookup below would fail for the whole frame.
df = df.dropna(subset=['boxOffice'])
df = df.loc[df.groupby('filmLabel')['boxOffice'].idxmax()]
df = df.reset_index(drop=True)


def convert_row(row):
    """Convert one film row to USD and add the page ID and cost/box-office ratio.

    Args:
        row: Mapping-like row (e.g. a ``pd.Series`` from ``df.apply(axis=1)``)
            with the keys 'currencyCode', 'cost', 'boxOffice' and 'filmLabel'.

    Returns:
        A copy of ``row`` where:
        * 'cost' and 'boxOffice' have been passed through ``convert_to_usd`` and
          'currencyCode' set to 'USD' when the row was not already in USD;
        * 'wikiPageID' holds the result of ``get_wikipedia_page_id``;
        * 'costBoxOfficeMultiplier' is boxOffice / cost, or NaN when the cost
          is missing or zero.
    """
    # Work on a copy: pandas does not support functions passed to
    # DataFrame.apply mutating the object they are given.
    row = row.copy()

    if row['currencyCode'] != 'USD':
        row['cost'] = convert_to_usd(row['cost'], row['currencyCode'])
        row['boxOffice'] = convert_to_usd(row['boxOffice'], row['currencyCode'])
        row['currencyCode'] = 'USD'

    # Fetch and add the Wikipedia page ID
    row['wikiPageID'] = get_wikipedia_page_id(row['filmLabel'])

    # A zero cost would raise ZeroDivisionError and abort the whole apply();
    # the ratio is undefined there, so record NaN instead.
    cost = row['cost']
    if pd.isna(cost) or cost == 0:
        row['costBoxOfficeMultiplier'] = np.nan
    else:
        row['costBoxOfficeMultiplier'] = row['boxOffice'] / cost

    return row

# Apply the conversion to each row
df = df.apply(convert_row, axis=1)

# Remove rows whose page id is not defined. get_wikipedia_page_id returns the id as a
# string (or None), so comparing the column against the integer -1 never matches
# anything; coerce to numbers first and keep only positive ids. This drops both
# failed lookups (None -> NaN) and the negative id reported for a missing page.
page_ids = pd.to_numeric(df['wikiPageID'], errors='coerce')
df = df[page_ids > 0]
# save csv without row index
df.to_csv(f'{PATH}/film_wiki_data_formatted.csv', index=False)

### MERGE DATASETS

In [None]:
# Column names for the header-less movie metadata TSV, in file order
movie_metadata_columns = [
    "wikiPageID",
    "freebaseMovieID",
    "movieName",
    "movieReleaseDate",
    "movieBoxOfficeRevenue",
    "movieRuntime",
    "movieLanguages",
    "movieCountries",
    "movieGenres",
]

# Formatted Wikidata film table (USD amounts plus Wikipedia page IDs)
df1 = pd.read_csv(f'{PATH}/film_wiki_data_formatted.csv')

# Movie metadata table: tab-separated, no header row in the file
df2 = pd.read_csv(
    f"{PATH}/movie.metadata.tsv",
    sep="\t",
    header=None,
    names=movie_metadata_columns,
)

# Inner join: keep only films whose Wikipedia page ID appears in both tables
merged_df = df1.merge(df2, how='inner', on='wikiPageID')

# Save the merged table without the row index
merged_df.to_csv(f'{PATH}/merged_movie_data.csv', index=False)

