## CMU Movie Data Loading

### Imports:

In [1]:
import pandas as pd
from wikimapper import WikiMapper
import re
import csv

### Load CMU Movie Data:

In [2]:
# Location of the CMU movie metadata TSV, relative to this notebook.
meta_data_path = "../data/MovieSummaries/movie.metadata.tsv"

# Column labels applied when the TSV is loaded, in file order.
column_names = [
    "Wikipedia_movie_ID",
    "Freebase_movie_ID",
    "movie_name",
    "movie_year",
    "movie_revenue",
    "movie_runtime",
    "movie_languages",
    "movie_countries",
    "movie_genres",
]

In [3]:
# Load the tab-separated metadata; header=None tells pandas the file has no
# header row, so the labels in column_names are used instead.
meta_df = pd.read_csv(
    meta_data_path,
    sep='\t',
    header=None,
    names=column_names,
)

### Add Wikidata IDs to the CMU Movie Data:

In [4]:
# Distinct values of the Wikipedia_movie_ID column, as a plain Python list.
wiki_ids = (
    meta_df['Wikipedia_movie_ID']
    .unique()
    .tolist()
)

# WikiMapper instance built from the named index database file; it is used
# for the ID lookups in the cell that follows.
mapper = WikiMapper("index_enwiki-20190420.db")

In [5]:
def convert_wikipedia_id_to_wikidata_id(id_):
    """
    Look up a Wikipedia page ID with the module-level ``mapper``.

    Parameters:
    - id_: Wikipedia page ID to look up.

    Returns whatever ``mapper.wikipedia_id_to_id`` returns for ``id_``
    (presumably the matching Wikidata ID, or a missing-value marker when the
    index has no entry -- confirm against the WikiMapper documentation).
    """
    return mapper.wikipedia_id_to_id(id_)

# Run the lookup once per row and store the results in a new column.
meta_df['wikidata_id'] = (
    meta_df['Wikipedia_movie_ID']
    .apply(convert_wikipedia_id_to_wikidata_id)
)

### Add plot summaries to the CMU Movie Data:

In [7]:
# Directory that holds the plot-summary files, relative to this notebook.
movie_data_path = "../data/MovieSummaries"

def convert_txt_to_csv(input_file_path, output_file_path):
    """
    Convert a plot-summary text file into a two-column CSV file.

    Each non-empty input line is split at its first tab or space: the text
    before the separator goes into the ``Wikipedia_movie_ID`` column and the
    remainder into the ``movie_summary`` column. A header row with those two
    column names is written first. Blank lines are skipped, and a line with no
    separator is written with an empty summary; both cases are reported with
    ``print``.

    Parameters:
    - input_file_path: Path to the input text file (read as UTF-8).
    - output_file_path: Path where the output CSV file will be saved
      (written as UTF-8; an existing file is overwritten).
    """
    # Compiled once, outside the loop; matches a single tab or a single space.
    separator = re.compile(r'\t| ')

    with open(input_file_path, 'r', encoding='utf-8') as fin, \
            open(output_file_path, 'w', newline='', encoding='utf-8') as fout:
        writer = csv.writer(fout)
        writer.writerow(['Wikipedia_movie_ID', 'movie_summary'])

        for line_number, line in enumerate(fin, start=1):
            # Strip leading/trailing whitespace, including the newline.
            line = line.strip()

            if not line:
                print(f"Skipping empty line at line number {line_number}.")
                continue

            # With maxsplit=1 the result is either [id] or [id, summary], so
            # the summary keeps any later tabs/spaces and no third shape can
            # occur (the old "unexpected format" branch was unreachable).
            movie_id, *rest = separator.split(line, maxsplit=1)
            if rest:
                movie_summary = rest[0]
            else:
                # Only movie_id is present, no summary.
                movie_summary = ''
                print(f"No summary found for movie_id '{movie_id}' at line number {line_number}.")

            writer.writerow([movie_id, movie_summary])


# Write plot_summaries.csv next to plot_summaries.txt in the data directory.
convert_txt_to_csv(
    f'{movie_data_path}/plot_summaries.txt',
    f'{movie_data_path}/plot_summaries.csv',
)