# CMU Movie Data Loading

## Imports

In [1]:
import pandas as pd
from wikimapper import WikiMapper
import re
import csv
import json
import ast
import numpy as np

from ada_config.config import CONFIG

## Load CMU Movie Data:

In [2]:
# Labels applied, in order, to the columns of the movie metadata TSV.
column_names = [
    "Wikipedia_movie_ID",
    "Freebase_movie_ID",
    "movie_name",
    "movie_year",
    "movie_revenue",
    "movie_runtime",
    "movie_languages",
    "movie_countries",
    "movie_genres",
]

# The metadata file lives inside the configured CMU data directory.
meta_data_path = CONFIG["cmu_path"] / "movie.metadata.tsv"

In [3]:
# The file has no header row (header=None); read_table splits on tabs by default.
meta_df = pd.read_table(meta_data_path, header=None, names=column_names)

## Remove the Inner Data Structures

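The languages, countries, and genres columns each store a dictionary serialized as a string. We parse these strings and replace each cell with a comma-separated string of the dictionary's values.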

In [6]:
def convert_string_to_dict(dict_str):
    """
    Convert the string representation of a dictionary into a Python object.

    Parsing is attempted with json.loads first. If the text is not valid JSON
    (for example because it uses single quotes), ast.literal_eval is tried.

    Parameters:
    - dict_str: The text to parse. Non-text input (such as None, or the NaN
      that pandas uses for missing cells) is reported as a failure instead of
      raising.

    Returns:
    - The parsed object, or None if the input could not be parsed. A message
      is printed whenever None is returned.
    """
    try:
        # First, try using json.loads
        return json.loads(dict_str)
    except json.JSONDecodeError:
        # Not valid JSON; fall through to the Python-literal parser below.
        pass
    except TypeError as e:
        # json.loads only accepts str, bytes or bytearray; anything else
        # cannot be parsed by the fallback either.
        print("Failed to convert string to dictionary:", e)
        return None

    try:
        return ast.literal_eval(dict_str)
    except (ValueError, SyntaxError, TypeError) as e:
        # TypeError covers literals that parse but cannot be built,
        # e.g. a dictionary with an unhashable key.
        print("Failed to convert string to dictionary:", e)
        return None


def map_codes_to_items(list_of_dicts):
    """
    Merge a list of dictionaries into a single code-to-item dictionary.

    Later dictionaries overwrite earlier ones on key collisions, and the key
    "nan" is always mapped to None in the result.
    """
    merged = {}
    for mapping in list_of_dicts:
        merged.update(mapping.items())
    merged["nan"] = None
    return merged

In [7]:
# Parse every countries cell into a dictionary.
list_countries_codes = [
    convert_string_to_dict(cell) for cell in np.array(meta_df["movie_countries"])
]
# Keep only the dictionary values, in dictionary order.
# NOTE: convert_string_to_dict returns None for unparseable cells, which would
# make .values() raise AttributeError here.
list_countries = [tuple(parsed.values()) for parsed in list_countries_codes]

In [8]:
# Parse every languages cell, then collect the distinct values of each parsed dictionary.
list_languages_codes = [
    convert_string_to_dict(cell) for cell in np.array(meta_df["movie_languages"])
]
list_languages = [set(parsed.values()) for parsed in list_languages_codes]

# Same treatment for the genres column.
list_genres_codes = [
    convert_string_to_dict(cell) for cell in np.array(meta_df["movie_genres"])
]
list_genres = [set(parsed.values()) for parsed in list_genres_codes]

In [9]:
# Flatten each movie's collection of values into one comma-separated string.

# Countries keep the order of the parsed dictionary; a movie without
# countries gets an empty string.
countries = [", ".join(map(str, t)) for t in list_countries]
meta_df["movie_countries"] = countries

# Languages and genres are held in sets, whose iteration order can differ
# between interpreter runs (string hashing is randomised). Sorting makes the
# written strings reproducible. Movies without any value get None.
# Plain lists are assigned positionally, so no index alignment is involved.
languages = [
    ", ".join(sorted(movie_languages)) if movie_languages else None
    for movie_languages in list_languages
]
meta_df["movie_languages"] = languages

genres = [
    ", ".join(sorted(movie_genres)) if movie_genres else None
    for movie_genres in list_genres
]
meta_df["movie_genres"] = genres

Manually correct the `movie_year` value of one movie ("Hunting Season"), which is wrong in the source data.

In [10]:
# One-off correction: overwrite movie_year for every row titled "Hunting Season".
is_hunting_season = meta_df["movie_name"] == "Hunting Season"
meta_df.loc[is_hunting_season, "movie_year"] = "2010-12-02"

## Add Wikidata IDs to the CMU Movie Data:

In [11]:
# Mapper backed by the index database stored in the configured data directory;
# it is used below to translate Wikipedia page IDs.
mapper = WikiMapper(CONFIG["data_path"] / "index_enwiki-20190420.db")

# Distinct Wikipedia page IDs present in the metadata, as a plain Python list.
wiki_ids = meta_df["Wikipedia_movie_ID"].unique().tolist()

In [12]:
def convert_wikipedia_id_to_wikidata_id(id_):
    """Return what the module-level `mapper` yields for the Wikipedia page ID `id_`."""
    return mapper.wikipedia_id_to_id(id_)


# Look up every movie's Wikipedia ID and store the result in a new column.
wikidata_ids = meta_df["Wikipedia_movie_ID"].apply(convert_wikipedia_id_to_wikidata_id)
meta_df["wikidata_id"] = wikidata_ids

In [18]:
# Remove, in place, every row whose wikidata_id occurs more than once;
# keep=False drops all copies instead of keeping the first or last one.
# NOTE(review): pandas treats missing values as equal to each other when
# detecting duplicates, so rows without a wikidata_id are presumably all
# dropped here as well — confirm that this is intended.
meta_df.drop_duplicates(subset=["wikidata_id"], keep=False, inplace=True)

In [19]:
# Write the enriched metadata into the CMU data directory, without the index column.
wikidata_csv_path = CONFIG["cmu_path"] / "movie_metadata_wikidata.csv"
meta_df.to_csv(wikidata_csv_path, index=False)

## Add plot summaries to the CMU Movie Data:

In [20]:
def convert_txt_to_csv(input_file_path, output_file_path):
    """
    Converts a text file to a CSV file with two columns: Wikipedia_movie_ID and movie_summary.
    Each input line is expected to hold a movie id and its summary, separated by a tab or a space.

    Blank lines are skipped. A line without a separator is written with an
    empty summary. A message is printed in both cases.

    Parameters:
    - input_file_path: Path to the input text file (read as UTF-8).
    - output_file_path: Path where the output CSV file will be saved (written as UTF-8).
    """
    # Compiled once, outside the loop. Only the first tab or space separates
    # the id from the summary, so the summary keeps its own whitespace.
    id_separator = re.compile(r"\t| ")

    with open(input_file_path, "r", encoding="utf-8") as fin, open(
        output_file_path, "w", newline="", encoding="utf-8"
    ) as fout:
        writer = csv.writer(fout)
        writer.writerow(["Wikipedia_movie_ID", "movie_summary"])
        for line_number, line in enumerate(fin, start=1):
            line = line.strip()

            if not line:
                print(f"Skipping empty line at line number {line_number}.")
                continue

            # With maxsplit=1 a non-empty line yields exactly one or two parts,
            # so no other case needs handling.
            parts = id_separator.split(line, maxsplit=1)

            if len(parts) == 2:
                movie_id, movie_summary = parts
            else:
                movie_id = parts[0]
                movie_summary = ""
                print(
                    f"No summary found for movie_id '{movie_id}' at line number {line_number}."
                )

            writer.writerow([movie_id, movie_summary])


# Convert the plot-summary text file in the CMU data directory to a CSV beside it.
summaries_dir = CONFIG["cmu_path"]
convert_txt_to_csv(
    summaries_dir / "plot_summaries.txt", summaries_dir / "plot_summaries.csv"
)

## Character metadata

In [21]:
tsv_file_char = CONFIG["cmu_path"] / "character.metadata.tsv"

# Labels applied, in order, to the columns of the character metadata TSV.
# NOTE(review): "1" and "2" are placeholder labels for the last two columns —
# confirm what those columns hold and whether they deserve descriptive names.
column_names_char = [
    "Wikipedia_movie_ID",
    "Freebase_movie",
    "movie_release_date",
    "character_name",
    "actor_DOB",
    "gender",
    "height",
    "ethnicity",
    "actor_name",
    "actor_age",
    "freebase_map",
    "1",
    "2",
]

# Read the tab-separated file with the labels above, then save it as CSV
# in the same directory without the index column.
df_chars = pd.read_csv(tsv_file_char, sep="\t", names=column_names_char)
df_chars.to_csv(CONFIG["cmu_path"] / "character_metadata.csv", index=False)