# Part 1 : Show the content of an XML file

In [1]:
import pandas as pd
import gzip

file_path = '../data/RawDatasets/corenlp_plot_summaries/1031545.xml.gz'

# Decompress the gzip archive in text mode and keep every line in memory.
with gzip.open(file_path, 'rt', encoding='utf-8') as file:
    content = list(file)

# Preview only the first 30 lines; change the slice bound to show more or fewer.
for line in content[:30]:
    print(line.strip())


<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root>
<document>
<sentences>
<sentence id="1">
<tokens>
<token id="1">
<word>The</word>
<lemma>the</lemma>
<CharacterOffsetBegin>0</CharacterOffsetBegin>
<CharacterOffsetEnd>3</CharacterOffsetEnd>
<POS>DT</POS>
<NER>O</NER>
</token>
<token id="2">
<word>basic</word>
<lemma>basic</lemma>
<CharacterOffsetBegin>4</CharacterOffsetBegin>
<CharacterOffsetEnd>9</CharacterOffsetEnd>
<POS>JJ</POS>
<NER>O</NER>
</token>
<token id="3">
<word>plot</word>
<lemma>plot</lemma>
<CharacterOffsetBegin>10</CharacterOffsetBegin>
<CharacterOffsetEnd>14</CharacterOffsetEnd>
<POS>NN</POS>
<NER>O</NER>


# Part 2 : Show the sentences of each summary

Only 61 of the approximately 27,701 files (total count unconfirmed) have been uploaded so far.
The content of the first 3 files is printed below.

In [2]:
import os
import gzip
import xml.etree.ElementTree as ET
import re

def natural_key(filename):
    """Build a sort key that orders embedded numbers by value ("natural" order).

    The name is split on runs of decimal digits. ``re.split`` with a capturing
    group always yields alternating pieces: even positions hold the text between
    digit runs (possibly empty) and odd positions hold the digit runs themselves.
    Only the odd positions are converted to ``int``.

    Parameters:
    - filename (str): Name to build a key for, e.g. ``'1031545.xml.gz'``.

    Returns:
    - list: Alternating ``str`` and ``int`` parts, e.g. ``['', 1031545, '.xml.gz']``.
    """
    parts = re.split(r'(\d+)', filename)
    # Decide by position rather than str.isdigit(): isdigit() is also true for
    # characters such as '²' that \d never matches and int() rejects, which made
    # the previous version raise ValueError on such names.
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]

# Specify the directory containing your XML files
directory = '../data/RawDatasets/corenlp_plot_summaries'

# Get all the .xml.gz files in the directory and sort them using the natural key
files = [f for f in os.listdir(directory) if f.endswith('.xml.gz')]
sorted_files = sorted(files, key=natural_key)

# Initialize counters
total_files = len(sorted_files)
processed_files = 0

# Only the first 3 files are processed so the cell output stays short
for filename in sorted_files[:3]:
    print(f'Processing {filename}...')
    file_path = os.path.join(directory, filename)

    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        # Parse the XML content
        try:
            tree = ET.parse(file)
        except ET.ParseError:
            print(f'Error parsing {filename}')
            continue

    root = tree.getroot()
    sentences = root.find('.//sentences')
    if sentences is None:
        print('No sentences found in the file.')
        continue

    # Count the file once, however many sentences it contains
    processed_files += 1

    for sentence in sentences.findall('sentence'):
        tokens = sentence.find('tokens')
        if tokens is None:
            print('No tokens found in this sentence.')
            continue

        # findtext() gives None when a token has no <word> child and '' when the
        # <word> element is empty; both are skipped so join() never receives a
        # non-string and a malformed token cannot crash the cell.
        words = (token.findtext('word') for token in tokens.findall('token'))
        sentence_text = ' '.join(word for word in words if word)
        print(f'Extracted sentence: {sentence_text}')

# Final output of processed file count
print(f'Total files processed: {processed_files} out of {total_files}')


Processing 1031545.xml.gz...
Extracted sentence: The basic plot concept bears a strong similarity to the earlier movie Red Sun , also featuring Toshiro Mifune .
Processing 1495388.xml.gz...
Extracted sentence: \* Winnie the Pooh and the Honey Tree \* Winnie the Pooh and the Blustery Day \* Winnie the Pooh and Tigger Too
Processing 2372198.xml.gz...
Extracted sentence: After the death of his outlaw brother , Jesse , Frank James seeks revenge on his killers , Bob and Charlie Ford .
Total files processed: 3 out of 61


# Part 3 : Relation to Movie Metadata

Verified that the XML file name (without the `.xml.gz` extension) is the movie ID in the metadata.

In [3]:

def load_movie_metadata(filepath):
    """Read the headerless, tab-separated movie metadata file into a DataFrame.

    Parameters:
    - filepath: Location of the TSV file, passed straight to ``pd.read_csv``.

    Returns:
    - DataFrame: Columns labelled, in file order, movieId, uniqueId, title,
      releaseDate, duration, rating, languages, countries, genres.
    """
    # NOTE(review): if this is the CMU Movie Summary Corpus movie.metadata.tsv,
    # its README appears to list fields 5 and 6 as box-office revenue and
    # runtime -- confirm that 'duration'/'rating' are the intended labels.
    return pd.read_csv(
        filepath,
        sep='\t',
        header=None,
        names=[
            'movieId', 'uniqueId', 'title', 'releaseDate', 'duration',
            'rating', 'languages', 'countries', 'genres',
        ],
    )

In [4]:
def check_movie_ids_in_filenames(metadata_df, xml_folder):
    """
    Check if XML filenames correspond to movie IDs in the metadata.

    Parameters:
    - metadata_df (DataFrame): DataFrame containing movie metadata; its
      'movieId' column is compared after conversion to strings.
    - xml_folder (str): Path to the folder containing XML files.

    Returns:
    - matches (list): (movie_id, filename) tuples, one for each '.xml.gz' file
      whose name up to the first '.' equals a movie ID, in os.listdir() order.
    """
    # A set makes each membership test constant-time; with a list every file
    # triggered a scan over all movie IDs.
    movie_ids = set(metadata_df['movieId'].astype(str))
    matches = []

    # Iterate over XML files in the specified directory
    for filename in os.listdir(xml_folder):
        if not filename.endswith('.xml.gz'):  # Filter for XML files
            continue
        movie_id = filename.split('.')[0]  # Extract the movie ID from the filename
        if movie_id in movie_ids:
            matches.append((movie_id, filename))

    return matches


In [5]:

def load_movie_metadata(filepath):
    """Load the movie metadata TSV (no header row) with fixed column labels.

    This cell repeats the definition given in an earlier cell of this notebook;
    executing it rebinds the same name to an equivalent function.

    Parameters:
    - filepath: Path of the tab-separated file handed to ``pd.read_csv``.

    Returns:
    - DataFrame: The parsed table with the nine column names assigned below.
    """
    read_options = {
        'sep': '\t',
        'header': None,
        'names': ['movieId', 'uniqueId', 'title', 'releaseDate', 'duration', 'rating', 'languages', 'countries', 'genres'],
    }
    frame = pd.read_csv(filepath, **read_options)
    return frame


In [6]:
# Input locations: the metadata TSV and the folder holding the XML summaries
movie_meta_filepath = '../data/RawDatasets/MovieSummaries/movie.metadata.tsv'
xml_folder_path = '../data/RawDatasets/corenlp_plot_summaries/'

# Read the metadata, then collect the files whose names match a movie ID
metadata_df = load_movie_metadata(movie_meta_filepath)
matching_files = check_movie_ids_in_filenames(metadata_df, xml_folder_path)

# Show only the first 10 matches to keep the output readable
for movie_id, filename in matching_files[:10]:
    print(f"Match found: Movie ID {movie_id} corresponds to file {filename}")


Match found: Movie ID 2740181 corresponds to file 2740181.xml.gz
Match found: Movie ID 1031545 corresponds to file 1031545.xml.gz
Match found: Movie ID 9204672 corresponds to file 9204672.xml.gz
Match found: Movie ID 19816863 corresponds to file 19816863.xml.gz
Match found: Movie ID 17061845 corresponds to file 17061845.xml.gz
Match found: Movie ID 10396322 corresponds to file 10396322.xml.gz
Match found: Movie ID 17906343 corresponds to file 17906343.xml.gz
Match found: Movie ID 8906897 corresponds to file 8906897.xml.gz
Match found: Movie ID 14404921 corresponds to file 14404921.xml.gz
Match found: Movie ID 5606938 corresponds to file 5606938.xml.gz
