In [1]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import HTML, display

# Multiple outputs per cell: show the value of every top-level expression,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Cell width: stretch the notebook container to 90% of the browser window.
display(HTML("<style>.container { width:90% !important; }</style>"))

This dataset contains the plot summaries after they have been run through the Stanford CoreNLP pipeline (tagging, parsing, NER, and coreference resolution). Each filename begins with the Wikipedia movie ID, which indexes into `movie.metadata.tsv`.

In [2]:
import gzip
import os
import xml.etree.ElementTree as ET


# Folder that is scanned for the gzipped XML files ('*.xml.gz').
# The path is relative, so it is resolved against the notebook's working directory.
PATH_IN = './XML_Dataset/'

In [3]:
# os.listdir() returns entries in arbitrary, platform-dependent order.
# Sorting makes the listing (and therefore "the first file" used later on)
# reproducible from one run or machine to the next.
xml_gz_files = sorted(f for f in os.listdir(PATH_IN) if f.endswith('.xml.gz'))

# Quick sanity check: how many archives were found, and what the names look like.
len(xml_gz_files)
xml_gz_files[0:5]

42306

['10000053.xml.gz',
 '10002175.xml.gz',
 '10002779.xml.gz',
 '10003264.xml.gz',
 '10004055.xml.gz']

### XML file structure
```
sentences
│ sentence id
│ │ tokens
│ │ │ token id
│ │ │ │ word
│ │ │ │ lemma
│ │ │ │ char offset begin
│ │ │ │ char offset end
│ │ │ │ POS
│ │ │ │ NER
│ │ parse
│ │ basic-dependencies
│ │ │ dep
│ │ │ │ governor
│ │ │ │ dependent
│ │ collapsed-dependencies
│ │ │ dep
│ │ │ │ governor
│ │ │ │ dependent
│ │ collapsed-ccprocessed-dependencies
│ │ │ dep
│ │ │ │ governor
│ │ │ │ dependent
```

In [6]:
# First start with a single file, before looping over all 42k.
# NOTE: this rebinds `xml_gz_files`; re-run the listing cell to get the full list back.
# Slicing (instead of indexing with [0]) keeps this cell from raising IndexError
# when no '.xml.gz' file was found; the loop below then simply does nothing.
xml_gz_files = xml_gz_files[:1]

In [32]:
# (child tag of <sentence>, label used in the printed line) for each dependency flavour.
_DEPENDENCY_SECTIONS = (
    ('basic-dependencies', 'Basic'),
    ('collapsed-dependencies', 'Collapsed'),
    ('collapsed-ccprocessed-dependencies', 'Collapsed-CCProcessed'),
)


def _child_text(parent, tag, default='N/A'):
    """Return the text of the first direct child `tag` of `parent`.

    Falls back to `default` when the child element does not exist, instead of
    raising AttributeError on `None.text`.
    """
    child = parent.find(tag)
    return default if child is None else child.text


def _endpoint(dep, tag, default='N/A'):
    """Return `(idx, text)` for the `tag` child (governor/dependent) of a <dep>.

    Both values are `default` when the child element is missing.
    """
    node = dep.find(tag)
    if node is None:
        return default, default
    return node.get('idx'), node.text


def _print_dependencies(sentence, section_tag, label):
    """Print one line per <dep> found under the `section_tag` child of `sentence`.

    Does nothing when the sentence has no such section.
    """
    section = sentence.find(section_tag)
    if section is None:
        return
    for dep in section.findall('dep'):
        dep_type = dep.get('type')
        governor_idx, governor_text = _endpoint(dep, 'governor')
        dependent_idx, dependent_text = _endpoint(dep, 'dependent')
        print(f"{label} Dependency Type: {dep_type}, Governor idx={governor_idx}: {governor_text}, Dependent idx={dependent_idx}: {dependent_text}")


for file_name in xml_gz_files:
    print(file_name)
    file_path = os.path.join(PATH_IN, file_name)

    # Only the read needs the file handle; parse into memory and close the
    # archive before the (long) printing phase.
    with gzip.open(file_path, 'rb') as f:
        root = ET.fromstring(f.read())

    for sentence in root.findall('.//sentence'):
        sentence_id = sentence.get('id')
        # './/sentence' matches at any depth; elements without an 'id' are skipped.
        # NOTE(review): presumably these are <sentence> nodes nested in the
        # coreference section rather than real sentences - confirm against a file.
        if sentence_id is None:
            continue
        print(f"Sentence ID: {sentence_id}")

        # Process each token in the sentence
        for token in sentence.findall('.//tokens/token'):
            token_id = token.get('id')
            word = _child_text(token, 'word')
            lemma = _child_text(token, 'lemma')
            char_offset_begin = _child_text(token, 'CharacterOffsetBegin')
            char_offset_end = _child_text(token, 'CharacterOffsetEnd')
            pos = _child_text(token, 'POS')
            ner = _child_text(token, 'NER')

            print(f"Token ID: {token_id}, Word: {word}, Lemma: {lemma}, COB: {char_offset_begin}, COE_ {char_offset_end} POS: {pos}, NER: {ner}")

        # Extract parse information for each sentence
        parse = _child_text(sentence, 'parse')
        print(f"Parse: {parse}")

        # Process basic, collapsed and collapsed-ccprocessed dependencies
        for section_tag, label in _DEPENDENCY_SECTIONS:
            _print_dependencies(sentence, section_tag, label)

        print("\n")  # Print a newline to separate sentences

10000053.xml.gz
Sentence ID: 1
Token ID: 1, Word: Fur, Lemma: Fur, COB: 0, COE_ 3 POS: NNP, NER: O
Token ID: 2, Word: trapper, Lemma: trapper, COB: 4, COE_ 11 POS: NNP, NER: O
Token ID: 3, Word: Jean, Lemma: Jean, COB: 12, COE_ 16 POS: NNP, NER: PERSON
Token ID: 4, Word: La, Lemma: La, COB: 17, COE_ 19 POS: NNP, NER: PERSON
Token ID: 5, Word: B, Lemma: B, COB: 20, COE_ 21 POS: NNP, NER: PERSON
Token ID: 6, Word: te, Lemma: te, COB: 23, COE_ 25 POS: NN, NER: O
Token ID: 7, Word: paddles, Lemma: paddle, COB: 27, COE_ 34 POS: VBZ, NER: O
Token ID: 8, Word: his, Lemma: he, COB: 35, COE_ 38 POS: PRP$, NER: O
Token ID: 9, Word: canoe, Lemma: canoe, COB: 39, COE_ 44 POS: NN, NER: O
Token ID: 10, Word: through, Lemma: through, COB: 45, COE_ 52 POS: IN, NER: O
Token ID: 11, Word: wild, Lemma: wild, COB: 53, COE_ 57 POS: JJ, NER: O
Token ID: 12, Word: water, Lemma: water, COB: 58, COE_ 63 POS: NN, NER: O
Token ID: 13, Word: towards, Lemma: towards, COB: 64, COE_ 71 POS: IN, NER: O
Token ID: 14, 

# Thoughts on how to save the data for all the files
TODO:
- Finish the XML parsing for all the files
- Check how to use the Wikipedia and Freebase IDs with Wikidata
- Check for a scraper for Wikidata + IMDb or other sources
- Create a notebook with all the saves; probably move the single-file notebook to an exploration branch
- Create a diagram (drawing board) of all the files and their columns to clearly see the merges