In [1]:
# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Widen the notebook container to 90% of the browser window
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# XML Processing Pipeline

This dataset contains the plot summaries after they have been run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

[Paper](https://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf)

[Dependency glossary](https://downloads.cs.stanford.edu/nlp/software/dependencies_manual.pdf)

### Imports and data paths

In [2]:
import pandas as pd
import gzip
import os
import xml.etree.ElementTree as ET


# Directory holding the gzipped CoreNLP XML files (file names start with the
# Wikipedia movie ID).
PATH_IN = './XML_Dataset/'

# os.listdir returns entries in arbitrary order, so sort the names: this keeps
# the "first file" used for development and the row order of the exported
# tables identical from one run (and one machine) to the next.
xml_gz_files = sorted(f for f in os.listdir(PATH_IN) if f.endswith('.xml.gz'))
len(xml_gz_files)
xml_gz_files[0:5]

42306

['10000053.xml.gz',
 '10002175.xml.gz',
 '10002779.xml.gz',
 '10003264.xml.gz',
 '10004055.xml.gz']

### XML file structure
```
sentences
│ sentence id
│ │ tokens
│ │ │ token id
│ │ │ │ word
│ │ │ │ lemma
│ │ │ │ char offset begin
│ │ │ │ char offset end
│ │ │ │ POS
│ │ │ │ NER
│ │ parse
│ │ basic-dependencies
│ │ │ dep
│ │ │ │ governor
│ │ │ │ dependent
│ │ collapsed-dependencies
│ │ │ dep
│ │ │ │ governor
│ │ │ │ dependent
│ │ collapsed-ccprocessed-dependencies
│ │ │ dep
│ │ │ │ governor
│ │ │ │ dependent
```

We will create three dataframes:
- tokens: for the token data
- parse: for the parse data
- dependencies: for the dependencies data

### Parsing data

In [3]:
# Leave the line below commented out to parse all ~42k files.
# Uncomment it to parse only the first file (for development purposes).
# xml_gz_files = [xml_gz_files[0]]

In [None]:
# (dependency_class label, XML element name) for the three dependency
# representations stored under each <sentence>.
_DEPENDENCY_CLASSES = (
    ("basic", "basic-dependencies"),
    ("collapsed", "collapsed-dependencies"),
    ("collapsed-ccprocessed", "collapsed-ccprocessed-dependencies"),
)

# Column layouts, passed explicitly to pd.DataFrame so that the three tables
# keep their columns even when no row was collected.
_PARSE_COLUMNS = ["movie_id", "sentence_id", "parse"]
_TOKEN_COLUMNS = ["movie_id", "sentence_id", "token_id", "word", "lemma",
                  "COB", "COE", "POS", "NER"]
_DEPENDENCY_COLUMNS = ["movie_id", "sentence_id", "dependency_class",
                       "dependency_type", "governor_id", "governor_word",
                       "dependent_id", "dependent_word"]


def _token_record(movie_id, sentence_id, token):
    """Flatten one <token> element into a row dict for the tokens table."""
    return {
        "movie_id": movie_id,
        "sentence_id": sentence_id,
        "token_id": token.get('id'),
        "word": token.find('word').text,
        "lemma": token.find('lemma').text,
        "COB": token.find('CharacterOffsetBegin').text,
        "COE": token.find('CharacterOffsetEnd').text,
        "POS": token.find('POS').text,
        "NER": token.find('NER').text,
    }


def _dependency_record(movie_id, sentence_id, dep_class, dep):
    """Flatten one <dep> element into a row dict for the dependencies table."""
    governor = dep.find('governor')
    dependent = dep.find('dependent')
    return {
        "movie_id": movie_id,
        "sentence_id": sentence_id,
        "dependency_class": dep_class,
        "dependency_type": dep.get('type'),
        "governor_id": governor.get('idx'),
        "governor_word": governor.text,
        "dependent_id": dependent.get('idx'),
        "dependent_word": dependent.text,
    }


def _parse_document(xml_data, movie_id):
    """Extract parse, token and dependency rows from one XML document.

    Parameters
    ----------
    xml_data : bytes or str
        Full XML content of one file.
    movie_id : str
        Identifier stored in the ``movie_id`` field of every returned row.

    Returns
    -------
    tuple of (list of dict, list of dict, list of dict)
        Rows for the parses, tokens and dependencies tables, in document order.
    """
    parses, tokens, dependencies = [], [], []
    root = ET.fromstring(xml_data)

    for sentence in root.findall('.//sentence'):
        sentence_id = sentence.get('id')
        # <sentence> elements without an id are skipped (presumably the
        # sentence references of the coreference section -- TODO confirm).
        if sentence_id is None:
            continue

        parse_node = sentence.find('parse')
        parses.append({
            "movie_id": movie_id,
            "sentence_id": sentence_id,
            # 'N/A' marks a sentence that has no <parse> element
            "parse": parse_node.text if parse_node is not None else 'N/A',
        })

        tokens.extend(
            _token_record(movie_id, sentence_id, token)
            for token in sentence.findall('.//tokens/token')
        )

        for dep_class, dep_xpath in _DEPENDENCY_CLASSES:
            dependencies.extend(
                _dependency_record(movie_id, sentence_id, dep_class, dep)
                for dep in sentence.findall(f'.//{dep_xpath}/dep')
            )

    return parses, tokens, dependencies


parses_data = []
tokens_data = []
dependencies_data = []

for file_name in xml_gz_files:
    # the file name without its extension is the movie identifier
    movie_id = file_name.replace('.xml.gz', '')
    file_path = os.path.join(PATH_IN, file_name)

    with gzip.open(file_path, 'rb') as f:
        xml_data = f.read()

    file_parses, file_tokens, file_dependencies = _parse_document(xml_data, movie_id)
    parses_data.extend(file_parses)
    tokens_data.extend(file_tokens)
    dependencies_data.extend(file_dependencies)

# Build each table once from the accumulated row dicts.
tokens_df = pd.DataFrame(tokens_data, columns=_TOKEN_COLUMNS)
dependencies_df = pd.DataFrame(dependencies_data, columns=_DEPENDENCY_COLUMNS)
parses_df = pd.DataFrame(parses_data, columns=_PARSE_COLUMNS)

In [None]:
# Target dtype for each tokens column: identifiers and character offsets are
# cast to int64, textual annotations to the pandas "string" dtype.
tokens_dtypes = {
    "movie_id": "int64",
    "sentence_id": "int64",
    "token_id": "int64",
    "COB": "int64",
    "COE": "int64",
    "word": "string",
    "lemma": "string",
    "POS": "string",
    "NER": "string",
}
for column_name, column_dtype in tokens_dtypes.items():
    tokens_df[column_name] = tokens_df[column_name].astype(column_dtype)

tokens_df
tokens_df.dtypes

In [None]:
# Target dtype for each dependencies column: identifiers are cast to int64,
# labels and words to the pandas "string" dtype.
dependencies_dtypes = {
    "movie_id": "int64",
    "sentence_id": "int64",
    "governor_id": "int64",
    "dependent_id": "int64",
    "dependency_class": "string",
    "dependency_type": "string",
    "governor_word": "string",
    "dependent_word": "string",
}
for column_name, column_dtype in dependencies_dtypes.items():
    dependencies_df[column_name] = dependencies_df[column_name].astype(column_dtype)

dependencies_df
dependencies_df.dtypes

In [None]:
# Target dtype for each parses column: identifiers are cast to int64, the
# parse text to the pandas "string" dtype.
parses_dtypes = {
    "movie_id": "int64",
    "sentence_id": "int64",
    "parse": "string",
}
for column_name, column_dtype in parses_dtypes.items():
    parses_df[column_name] = parses_df[column_name].astype(column_dtype)

parses_df
parses_df.dtypes

In [None]:
# Write each table to a CSV file in the working directory, without the row index.
for frame, csv_name in (
    (tokens_df, 'tokens.csv'),
    (dependencies_df, 'dependencies.csv'),
    (parses_df, 'parses.csv'),
):
    frame.to_csv(csv_name, index=False)

### Old printing code (will be removed in next commit)

In [None]:
# Debug dump: print every sentence, token, parse and dependency of each file.
for file_name in xml_gz_files:
    print(file_name)
    file_path = os.path.join(PATH_IN, file_name)

    with gzip.open(file_path, 'rb') as f:
        xml_data = f.read()

    root = ET.fromstring(xml_data)

    for sentence in root.findall('.//sentence'):
        sentence_id = sentence.get('id')
        # only <sentence> elements that carry an id are printed
        if sentence_id is None:
            continue
        print(f"Sentence ID: {sentence_id}")

        # Process each token in the sentence
        for token in sentence.findall('.//tokens/token'):
            token_id = token.get('id')
            word = token.find('word').text
            lemma = token.find('lemma').text
            char_offset_begin = token.find('CharacterOffsetBegin').text
            char_offset_end = token.find('CharacterOffsetEnd').text
            pos = token.find('POS').text
            ner = token.find('NER').text

            # "COE: ..., " now follows the same "Label: value, " pattern as the other fields
            print(f"Token ID: {token_id}, Word: {word}, Lemma: {lemma}, COB: {char_offset_begin}, COE: {char_offset_end}, POS: {pos}, NER: {ner}")

        # Extract parse information for each sentence ('N/A' when there is no <parse>)
        parse_node = sentence.find('parse')
        parse = parse_node.text if parse_node is not None else 'N/A'
        print(f"Parse: {parse}")

        # Process the three dependency representations with one shared loop
        for dep_label, dep_tag in (
            ("Basic", 'basic-dependencies'),
            ("Collapsed", 'collapsed-dependencies'),
            ("Collapsed-CCProcessed", 'collapsed-ccprocessed-dependencies'),
        ):
            deps = sentence.find(dep_tag)
            if deps is None:
                continue
            for dep in deps.findall('dep'):
                dep_type = dep.get('type')
                governor = dep.find('governor')
                dependent = dep.find('dependent')
                governor_idx = governor.get('idx')
                governor_text = governor.text
                dependent_idx = dependent.get('idx')
                dependent_text = dependent.text
                print(f"{dep_label} Dependency Type: {dep_type}, Governor idx={governor_idx}: {governor_text}, Dependent idx={dependent_idx}: {dependent_text}")

        print("\n")  # Print a newline to separate sentences

# Thinking on how to save the data for all the files
TODO:
- Finish XML for all the files
- check how to use wiki and freebase id with wikidata
- check for a scraper on wikidata + IMDb or other
- Create a notebook with all the saves, probably move the single file notebook to an exploration branch
- create a drawboard with all the files and their columns to clearly see the merges