# NLP analysis of plot summaries
Import packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import sys
print(sys.version)

3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)]


Load data

In [3]:
# Define data path
DATA_PATH = 'clean_data/'

# Load summaries
summaries_df = pd.read_csv(DATA_PATH + 'movies_summaries.csv')

# Display dataframe
display(summaries_df)

Unnamed: 0,movie_id,summary
0,/m/076w2lb,"Shlykov, a hard-working taxi driver and Lyosha..."
1,/m/0gkz15s,The nation of Panem consists of a wealthy Capi...
2,/m/051zjwb,Poovalli Induchoodan is sentenced for six yea...
3,/m/06xtz3,"The Lemon Drop Kid , a New York City swindler,..."
4,/m/02tqm5,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42199,/m/0j3dcl6,"The story is about Reema , a young Muslim scho..."
42200,/m/045pct,"In 1928 Hollywood, director Leo Andreyev look..."
42201,/m/0j6777g,American Luthier focuses on Randy Parsons’ tra...
42202,/m/04f7jfs,"Abdur Rehman Khan , a middle-aged dry fruit se..."


Extract and load NLP processed summaries

In [6]:
import os
import gzip
import shutil

In [7]:
# Path to the zipped folders
NLP_DATA_PATH = 'data/corenlp_plot_summaries/'

# Path to the new data folder
NLP_SUMMARIES = 'clean_data/NLP_summaries/'

# Create the new data folder if it doesn't exist
if not os.path.exists(NLP_SUMMARIES):
    os.makedirs(NLP_SUMMARIES)

# Iterate through each gzipped folder
for root, dirs, files in os.walk(NLP_DATA_PATH):
    for file in files:
        # Check if the file is a gzipped file
        if file.endswith('.gz'):
            # Path to the gzipped file contained in the folder
            gz_path = os.path.join(root, file)

            # Create a folder name based on the gzipped file name, removing the extension
            folder_name = os.path.splitext(file)[0]

            # Path to the destination folder
            destination_folder = os.path.join(NLP_SUMMARIES, folder_name)

            # Unzip the contents of the gzipped file to the destination folder
            with gzip.open(gz_path, 'rb') as gz_file:
                with open(destination_folder, 'wb') as out_file:
                    shutil.copyfileobj(gz_file, out_file)

# Move all XML files from the unzipped folders to the clean data folder
for root, dirs, files in os.walk(NLP_SUMMARIES):
    for file in files:
        # Check if the file is an XML file
        if file.endswith('.xml'):
            # Path to the XML file
            xml_path = os.path.join(root, file)

            # Move the XML file
            shutil.move(xml_path, os.path.join(NLP_SUMMARIES, file))

# Remove the empty folders left after moving the XML files
for root, dirs, files in os.walk(NLP_SUMMARIES, topdown=False):
    for folder in dirs:
        folder_path = os.path.join(root, folder)
        os.rmdir(folder_path)

### Read and parse one XML summary for exemple

In [8]:
import xml.etree.ElementTree as ET

In [14]:
# Specify the path to your XML file
movie_id = 3217

# Construct the full path to the XML file
summary_test_file = NLP_SUMMARIES + str(movie_id) + '.xml'
print(summary_test_file)

clean_data/NLP_summaries/3217.xml


Each `<token>` element represents a word from a sentence, identified by a unique `id`. Inside each `<token>` element, there are several child elements that provide more information about the token:

- `<word>`: The actual word in the text.
- `<lemma>`: The base or dictionary form of the word.
- `<CharacterOffsetBegin>` and `<CharacterOffsetEnd>`: The start and end positions of the word in the original text.
- `<POS>`: The part-of-speech tag for the word. POS tagging is the task of labeling the words in a sentence with their appropriate part of speech (noun, verb, adjective, etc.).
- `<NER>`: Named Entity Recognition tag. NER is a subtask of information extraction that seeks to locate and classify named entities in text into predefined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.

The `<sentence>` element contains a set of these tokens, representing a sentence in the text. The `id` attribute of the `<sentence>` element indicates the order of the sentence in the document.

In [24]:
# Parse the XML file and get the root element
tree = ET.parse(summary_test_file)
root = tree.getroot()

# Create an empty dict to store data
tokens_dict = {}

# Iterate over all 'token' elements in the document
for token in root.iter('token'):
    # Extract data from the child elements of 'token'
    word = token.find('word').text
    lemma = token.find('lemma').text
    pos = token.find('POS').text
    ner = token.find('NER').text
    
    # Store in dict
    token_id = token.attrib['id']
    tokens_dict[token_id] = {
        'word': word,
        'lemma': lemma,
        'POS': pos,
        'NER': ner,
    }

    # Print the extracted data
    #print(f'Word: {word}, Lemma: {lemma}, POS: {pos}, NER: {ner}')

# Print the dict
print(tokens_dict)

{'1': {'word': 'A', 'lemma': 'a', 'POS': 'DT', 'NER': 'O'}, '2': {'word': 'female', 'lemma': 'female', 'POS': 'JJ', 'NER': 'O'}, '3': {'word': 'customer', 'lemma': 'customer', 'POS': 'NN', 'NER': 'O'}, '4': {'word': 'becomes', 'lemma': 'become', 'POS': 'VBZ', 'NER': 'O'}, '5': {'word': 'possessed', 'lemma': 'possess', 'POS': 'VBN', 'NER': 'O'}, '6': {'word': 'by', 'lemma': 'by', 'POS': 'IN', 'NER': 'O'}, '7': {'word': 'a', 'lemma': 'a', 'POS': 'DT', 'NER': 'O'}, '8': {'word': 'demon', 'lemma': 'demon', 'POS': 'NN', 'NER': 'O'}, '9': {'word': 'and', 'lemma': 'and', 'POS': 'CC', 'NER': 'O'}, '10': {'word': 'starts', 'lemma': 'start', 'POS': 'VBZ', 'NER': 'O'}, '11': {'word': 'wreaking', 'lemma': 'wreak', 'POS': 'VBG', 'NER': 'O'}, '12': {'word': 'havoc', 'lemma': 'havoc', 'POS': 'NN', 'NER': 'O'}, '13': {'word': 'on', 'lemma': 'on', 'POS': 'IN', 'NER': 'O'}, '14': {'word': 'the', 'lemma': 'the', 'POS': 'DT', 'NER': 'O'}, '15': {'word': 'store', 'lemma': 'store', 'POS': 'NN', 'NER': 'O'},

Each sentence also has dependency informations between tokens:

- `<basic-dependencies>`: These represent grammatical relationships between words in a sentence. For example, the dependencies between noun and verb.

- `<collapsed-dependencies>`: These are a simplified form of dependencies where certain types of indirect dependencies are collapsed into direct dependencies for easier processing.

In [25]:
# Create an empty dict to store data
dep_dict = {}

# Extract and print dependencies
for i,dep in enumerate(root.iter('dep')):
    # Extract data
    dep_type = dep.attrib['type']
    governor = dep.find('governor').text
    dependent = dep.find('dependent').text

    # Store in dict
    dep_dict[i] = {
        'type': dep_type,
        'governor': governor,
        'dependent': dependent
    }
    
    # Print the extracted data
    # print(f"Dependency type: {dep_type}, Governor: {governor}, Dependent: {dependent}")

# Print the dict
print(dep_dict)

{0: {'type': 'prep', 'governor': 'is', 'dependent': 'After'}, 1: {'type': 'auxpass', 'governor': 'pulled', 'dependent': 'being'}, 2: {'type': 'pcomp', 'governor': 'After', 'dependent': 'pulled'}, 3: {'type': 'prep', 'governor': 'pulled', 'dependent': 'through'}, 4: {'type': 'det', 'governor': 'portal', 'dependent': 'a'}, 5: {'type': 'nn', 'governor': 'portal', 'dependent': 'time'}, 6: {'type': 'pobj', 'governor': 'through', 'dependent': 'portal'}, 7: {'type': 'nn', 'governor': 'lands', 'dependent': 'Ash'}, 8: {'type': 'nn', 'governor': 'lands', 'dependent': 'Williams'}, 9: {'type': 'nsubj', 'governor': 'is', 'dependent': 'lands'}, 10: {'type': 'prep', 'governor': 'lands', 'dependent': 'in'}, 11: {'type': 'num', 'governor': 'AD', 'dependent': '1300'}, 12: {'type': 'pobj', 'governor': 'in', 'dependent': 'AD'}, 13: {'type': 'advmod', 'governor': 'captured', 'dependent': 'where'}, 14: {'type': 'nsubjpass', 'governor': 'captured', 'dependent': 'he'}, 15: {'type': 'auxpass', 'governor': 'cap

Finaly, dependencies between different sentences can be analyzed:

- `<coreferences>`: These represent instances where multiple expressions in a text refer to the same entity. 

In the XML, each `<coreference>` element represents a group of mentions that all refer to the same entity in the text. Each `<mention>` element within a `<coreference>` represents a specific instance where that entity is mentioned in the text.
The `<mention representative="true">` is the primary mention in the text that other mentions refer back to. 

The `<sentence>` tag within a `<mention>` indicates the sentence number where the mention occurs. The `<start>` and `<end>` tags indicate the position of the start and end of the mention within that sentence. The `<head>` tag indicates the head word of the mention.

To know what a coreference corresponds to in the actual text, we would need to find the sentences and word positions indicated by the `<sentence>`, `<start>`, and `<end>` tags in the text itself.

In [26]:
# Create an empty dict to store data
coref_dict = {}

# Extract and print coreferences
for i,coref in enumerate(root.iter('coreference')):
    coref_dict[i] = {}
    for j,mention in enumerate(coref.iter('mention')):
        # Extract data
        sentence = mention.find('sentence').text
        start = mention.find('start').text
        end = mention.find('end').text

        # Store in dict
        coref_dict[i][j] = {
            'sentence': sentence,
            'start': start,
            'end': end
        }

        # Print the extracted data
        #print(f"Sentence: {sentence}, Start: {start}, End: {end}")

# Print the dict
print(coref_dict)

{0: {0: {'sentence': '1', 'start': '23', 'end': '26'}, 1: {'sentence': '3', 'start': '18', 'end': '20'}, 2: {'sentence': '2', 'start': '6', 'end': '15'}, 3: {'sentence': '1', 'start': '36', 'end': '38'}, 4: {'sentence': '2', 'start': '6', 'end': '9'}, 5: {'sentence': '2', 'start': '8', 'end': '9'}, 6: {'sentence': '4', 'start': '3', 'end': '4'}, 7: {'sentence': '4', 'start': '5', 'end': '6'}, 8: {'sentence': '5', 'start': '1', 'end': '2'}, 9: {'sentence': '21', 'start': '21', 'end': '23'}, 10: {'sentence': '16', 'start': '17', 'end': '19'}, 11: {'sentence': '21', 'start': '34', 'end': '35'}, 12: {'sentence': '15', 'start': '14', 'end': '23'}, 13: {'sentence': '15', 'start': '14', 'end': '16'}, 14: {'sentence': '1', 'start': '13', 'end': '42'}, 15: {'sentence': '1', 'start': '13', 'end': '15'}, 16: {'sentence': '1', 'start': '33', 'end': '42'}, 17: {'sentence': '1', 'start': '17', 'end': '18'}, 18: {'sentence': '1', 'start': '30', 'end': '31'}, 19: {'sentence': '2', 'start': '1', 'end':

We finally get for each summary:
- A `token dictionnary` containing the word, lemma, POS and NER of each tokens (words) of the summary.
- A `dependency dictionnary` containing each gramatical dependencies found in the summary along with the dependency type.
- A `coreference dictionnary` referencing for each coreference (entity refered to) all the group of words related to it in a specific sentence.