# XML Processing Pipeline (for the XML files provided by the authors)

The plot summaries, run through the Stanford CoreNLP pipeline (tagging, parsing, NER and coref). Each filename begins with the Wikipedia movie ID (which indexes into movie.metadata.tsv).

[Paper](https://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf)

[Dependency glossary](https://downloads.cs.stanford.edu/nlp/software/dependencies_manual.pdf)

### Imports and data paths
Download the XML data from [here](https://www.cs.cmu.edu/~ark/personas/data/corenlp_plot_summaries.tar) and place them inside a folder named `XML_data`.

```
project_root/
│
├── P2.ipynb
│
├── XML_data/
├── 330.xml.gz
    ├── 3271.xml.gz
    ... (many more files) ...
    └── 37501922.xml.gz
```

In [2]:
import pandas as pd
import gzip
import os
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm

PATH_IN = '../XML_data/'

xml_gz_files = [f for f in os.listdir(PATH_IN) if f.endswith('.xml.gz')]
len(xml_gz_files)
xml_gz_files[0:5]

42306

['10000053.xml.gz',
 '10002175.xml.gz',
 '10002779.xml.gz',
 '10003264.xml.gz',
 '10004055.xml.gz']

### XML file structure
```
root
│ document
│ │ sentences
│ │ │ sentence id
│ │ │ │ tokens
│ │ │ │ │ token id
│ │ │ │ │ │ word
│ │ │ │ │ │ lemma
│ │ │ │ │ │ char offset begin
│ │ │ │ │ │ char offset end
│ │ │ │ │ │ POS
│ │ │ │ │ │ NER
│ │ │ │ parse
│ │ │ │ basic-dependencies
│ │ │ │ │ dep
│ │ │ │ │ │ governor
│ │ │ │ │ │ dependent
│ │ │ │ collapsed-dependencies
│ │ │ │ │ dep
│ │ │ │ │ │ governor
│ │ │ │ │ │ dependent
│ │ │ │ collapsed-ccprocessed-dependencies
│ │ │ │ │ dep
│ │ │ │ │ │ governor
│ │ │ │ │ │ dependent
│ │ coreference
│ │ │ coreference
│ │ │ │ mention
│ │ │ │ │ sentence
│ │ │ │ │ start
│ │ │ │ │ end
│ │ │ │ │ head
```

We will create four dataframes:
- tokens: for the token data
- parse: for the parse data
- dependecies: for the dependencies data
- coref: for the coreference data

### Extracting data

In [4]:
parses_data = []
tokens_data = []
dependencies_data = []
coref_data = []

for file_name in tqdm(xml_gz_files, desc='Processing xml.gz files'):
    movie_id = file_name.replace('.xml.gz', '')
    file_path = os.path.join(PATH_IN, file_name)
    
    with gzip.open(file_path, 'rb') as f:
        xml_data = f.read()
        root = ET.fromstring(xml_data)

        for sentence in root.findall('.//sentence'):
            sentence_id = sentence.get('id')
            if sentence_id is not None:
                
                # appending to df_parses
                parse = sentence.find('parse').text if sentence.find('parse') is not None else 'N/A'
                parses_data.append({"movie_id": movie_id, "sentence_id": sentence_id, "parse": parse})

                # appending to df_tokens
                for token in sentence.findall('.//tokens/token'):
                    token_id = token.get('id')
                    word = token.find('word').text
                    lemma = token.find('lemma').text
                    char_offset_begin = token.find('CharacterOffsetBegin').text
                    char_offset_end = token.find('CharacterOffsetEnd').text
                    pos = token.find('POS').text
                    ner = token.find('NER').text
                    
                    tokens_data.append({
                        "movie_id": movie_id,
                        "sentence_id": sentence_id,
                        "token_id": token_id,
                        "word": word,
                        "lemma": lemma,
                        "COB": char_offset_begin,
                        "COE": char_offset_end,
                        "POS": pos,
                        "NER": ner,
                    })

                # appending to df_dependencies
                for dep_class, dep_xpath in [("basic", "basic-dependencies"),
                                             ("collapsed", "collapsed-dependencies"),
                                             ("collapsed-ccprocessed", "collapsed-ccprocessed-dependencies")]:
                    for dep in sentence.findall(f'.//{dep_xpath}/dep'):
                        dep_type = dep.get('type')
                        governor_idx = dep.find('governor').get('idx')
                        governor_text = dep.find('governor').text
                        dependent_idx = dep.find('dependent').get('idx')
                        dependent_text = dep.find('dependent').text
                        dependencies_data.append({
                            "movie_id": movie_id,
                            "sentence_id": sentence_id,
                            "dependency_class": dep_class,
                            "dependency_type": dep_type,
                            "governor_id": governor_idx,
                            "governor_word": governor_text,
                            "dependent_id": dependent_idx,
                            "dependent_word": dependent_text,
                        })
                        
        for coref in root.findall('.//coreference/coreference'):
            for mention in coref.findall('mention'):
                representative = mention.get('representative', 'false') == 'true' 
                sentence_id = mention.find('sentence').text
                start = mention.find('start').text
                end = mention.find('end').text
                head = mention.find('head').text

                coref_data.append({
                    "movie_id": movie_id,
                    "sentence_id": sentence_id,
                    "start": start,
                    "end": end,
                    "head": head,
                    "representative": representative
                })

tokens_df = pd.DataFrame(tokens_data)
dependencies_df = pd.DataFrame(dependencies_data)
parses_df = pd.DataFrame(parses_data)
coref_df = pd.DataFrame(coref_data)

Processing xml.gz files:   0%|          | 0/42306 [00:00<?, ?it/s]

tqdm not rendered on Github: total computational time ~1h50 (appending get's more and more costly as we advance through the files)

In [5]:
tokens_df["movie_id"] = pd.to_numeric(tokens_df["movie_id"], downcast='integer')
tokens_df["sentence_id"] = pd.to_numeric(tokens_df["sentence_id"], downcast='integer')
tokens_df["token_id"] = pd.to_numeric(tokens_df["token_id"], downcast='integer')
tokens_df["COB"] = pd.to_numeric(tokens_df["COB"], downcast='integer')
tokens_df["COE"] = pd.to_numeric(tokens_df["COE"], downcast='integer')
tokens_df["word"] = tokens_df["word"].astype("string")
tokens_df["lemma"] = tokens_df["lemma"].astype("string")
tokens_df["POS"] = tokens_df["POS"].astype("string")
tokens_df["NER"] = tokens_df["NER"].astype("string")
tokens_df
tokens_df.dtypes

Unnamed: 0,movie_id,sentence_id,token_id,word,lemma,COB,COE,POS,NER
0,10000053,1,1,Fur,Fur,0,3,NNP,O
1,10000053,1,2,trapper,trapper,4,11,NNP,O
2,10000053,1,3,Jean,Jean,12,16,NNP,PERSON
3,10000053,1,4,La,La,17,19,NNP,PERSON
4,10000053,1,5,B,B,20,21,NNP,PERSON
...,...,...,...,...,...,...,...,...,...
14905198,9999280,6,14,face,face,500,504,NN,O
14905199,9999280,6,15,reality,reality,505,512,NN,O
14905200,9999280,6,16,more,more,513,517,RBR,O
14905201,9999280,6,17,clearly,clearly,518,525,RB,O


movie_id        int32
sentence_id     int16
token_id        int16
word           string
lemma          string
COB             int16
COE             int16
POS            string
NER            string
dtype: object

In [6]:
dependencies_df["movie_id"] = pd.to_numeric(dependencies_df["movie_id"], downcast='integer')
dependencies_df["sentence_id"] = pd.to_numeric(dependencies_df["sentence_id"], downcast='integer')
dependencies_df["governor_id"] = pd.to_numeric(dependencies_df["governor_id"], downcast='integer')
dependencies_df["dependent_id"] = pd.to_numeric(dependencies_df["dependent_id"], downcast='integer')
dependencies_df["dependency_class"] = dependencies_df["dependency_class"].astype("string")
dependencies_df["dependency_type"] = dependencies_df["dependency_type"].astype("string")
dependencies_df["governor_word"] = dependencies_df["governor_word"].astype("string")
dependencies_df["dependent_word"] = dependencies_df["dependent_word"].astype("string")
dependencies_df
dependencies_df.dtypes

Unnamed: 0,movie_id,sentence_id,dependency_class,dependency_type,governor_id,governor_word,dependent_id,dependent_word
0,10000053,1,basic,nn,6,te,1,Fur
1,10000053,1,basic,nn,6,te,2,trapper
2,10000053,1,basic,nn,6,te,3,Jean
3,10000053,1,basic,nn,6,te,4,La
4,10000053,1,basic,nn,6,te,5,B
...,...,...,...,...,...,...,...,...
34199063,9999280,6,collapsed-ccprocessed,nn,15,reality,13,Marcelo
34199064,9999280,6,collapsed-ccprocessed,nn,15,reality,14,face
34199065,9999280,6,collapsed-ccprocessed,nsubj,17,clearly,15,reality
34199066,9999280,6,collapsed-ccprocessed,advmod,17,clearly,16,more


movie_id             int32
sentence_id          int16
dependency_class    string
dependency_type     string
governor_id          int16
governor_word       string
dependent_id         int16
dependent_word      string
dtype: object

In [7]:
parses_df["movie_id"] = pd.to_numeric(parses_df["movie_id"], downcast='integer')
parses_df["sentence_id"] = pd.to_numeric(parses_df["sentence_id"], downcast='integer')
parses_df["parse"] = parses_df["parse"].astype("string")
parses_df
parses_df.dtypes

Unnamed: 0,movie_id,sentence_id,parse
0,10000053,1,(ROOT (S (NP (NNP Fur) (NNP trapper) (NNP Jean...
1,10000053,2,(ROOT (S (S (PP (IN At) (NP (DT the) (NN settl...
2,10000053,3,(ROOT (S (NP (DT The) (NN trader)) (VP (VBZ ex...
3,10000053,4,"(ROOT (S (ADVP (RB Later)) (, ,) (NP (DT the) ..."
4,10000053,5,(ROOT (S (S (NP (NNP Jean) (NNP La) (NNP B) (N...
...,...,...,...
665581,9999280,2,(ROOT (S (NP (PRP He)) (VP (VBZ 's) (ADJP (JJ ...
665582,9999280,3,(ROOT (S (NP (PRP He)) (VP (VP (VBZ lives) (PP...
665583,9999280,4,(ROOT (S (NP (PRP He)) (VP (VBZ devotes) (NP (...
665584,9999280,5,"(ROOT (S (S (NP (PRP She)) (, ,) (ADVP (RB how..."


movie_id        int32
sentence_id     int16
parse          string
dtype: object

In [8]:
coref_df["movie_id"] = pd.to_numeric(coref_df["movie_id"], downcast='integer')
coref_df["sentence_id"] = pd.to_numeric(coref_df["sentence_id"], downcast='integer')
coref_df["start"] = pd.to_numeric(coref_df["start"], downcast='integer')
coref_df["end"] = pd.to_numeric(coref_df["end"], downcast='integer')
coref_df["head"] = pd.to_numeric(coref_df["head"], downcast='integer')
coref_df["representative"] = coref_df["representative"].astype('bool')
coref_df
coref_df.dtypes

Unnamed: 0,movie_id,sentence_id,start,end,head,representative
0,10000053,1,3,6,5,True
1,10000053,1,8,9,8,False
2,10000053,5,1,4,3,False
3,10000053,5,8,9,8,False
4,10000053,5,16,17,16,False
...,...,...,...,...,...,...
2921137,9999280,4,8,9,8,False
2921138,9999280,5,1,2,1,False
2921139,9999280,5,16,17,16,False
2921140,9999280,4,3,9,3,True


movie_id           int32
sentence_id        int16
start              int16
end                int16
head               int16
representative      bool
dtype: object

In [9]:
PATH_OUT = '../generated/annotations_2013/'

tokens_df.to_parquet(os.path.join(PATH_OUT, "tokens.parquet"), compression= "brotli") 
dependencies_df.to_parquet(os.path.join(PATH_OUT, "dependencies.parquet"), compression= "brotli") 
parses_df.to_parquet(os.path.join(PATH_OUT, "parses.parquet"), compression= "brotli") 
coref_df.to_parquet(os.path.join(PATH_OUT, "coref.parquet"), compression= "brotli")