In [3]:
import pandas as pd
import gzip
import os
import string
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
!jupyter nbextension enable --py widgetsnbextension --sys-prefix
import nltk
from nltk.tokenize import  word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
stop_words.add(string.punctuation)

PATH_IN = "../../MovieSummaries/corenlp_plot_summaries"

Config option `kernel_spec_manager_class` not recognized by `EnableNBExtensionApp`.
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [29]:
def explore_xml_tree(element, visited_tags, indent=""):
    tag = element.tag
    if tag not in visited_tags:
        visited_tags.add(tag)
        print(f"{indent} {tag}")

        for child in element:
            explore_xml_tree(child, visited_tags, indent + "│  ")

# Function to read and explore the XML file
def explore_xml_structure(xml_file_path):
    print("XML File structure")
    with gzip.open(xml_file_path, 'rb') as f:
        data = f.read()
        root = ET.fromstring(data)
        visited_tags = set()
        explore_xml_tree(root, visited_tags)

In [30]:
explore_xml_structure(PATH_IN + "/330.xml.gz")

XML File structure
 root
│   document
│  │   sentences
│  │  │   sentence
│  │  │  │   tokens
│  │  │  │  │   token
│  │  │  │  │  │   word
│  │  │  │  │  │   lemma
│  │  │  │  │  │   CharacterOffsetBegin
│  │  │  │  │  │   CharacterOffsetEnd
│  │  │  │  │  │   POS
│  │  │  │  │  │   NER
│  │  │  │   parse
│  │  │  │   basic-dependencies
│  │  │  │  │   dep
│  │  │  │  │  │   governor
│  │  │  │  │  │   dependent
│  │  │  │   collapsed-dependencies
│  │  │  │   collapsed-ccprocessed-dependencies
│  │   coreference


In [31]:
def tokenize_movie_suumary(path):
    xml_files = [f for f in os.listdir(path) if f.endswith('.xml.gz')]
    len(f" len dataser  = {len(xml_files)}")

    tokens_data = []

    for file_name in tqdm(xml_files, desc='Processing xml.gz files'):
        movie_id = file_name.split('.')[0]
        path = os.path.join(PATH_IN, file_name)
        
        with gzip.open(path, 'rb') as f:
            data = f.read()
            root = ET.fromstring(data)

            for sentence in root.findall('.//sentence'):
                sentence_id = sentence.get('id')
                if sentence_id != None:
                    
                    # appending to df_parses
                    parse = sentence.find('parse').text #if sentence.find('parse') != None else 'N/A'
                    
                    # appending to df_tokens
                    for token in sentence.findall('.//tokens/token'):
                        ner = token.find('NER').text
                        word = token.find('word').text
                        if( ner != "O" and word.lower() not in stop_words ):
                            tokens_data.append({
                                "movie_id": movie_id,
                                "word": word,
                                "sentence_id": sentence_id,
                                "COB": token.find('CharacterOffsetBegin').text,
                                "COE":  token.find('CharacterOffsetEnd').text,
                                "POS": token.find('POS').text,
                                "NER": ner,
                            })
        
    tokens_df = pd.DataFrame(tokens_data)
    return tokens_df

In [40]:
tokens_df = tokenize_movie_suumary(PATH_IN)

Processing xml.gz files:   0%|          | 0/42306 [00:00<?, ?it/s]

In [42]:
tokens_df

Unnamed: 0,movie_id,word,sentence_id,COB,COE,POS,NER
0,3827592,1918,1,19,23,CD,DATE
1,3827592,Ryan,3,128,132,NNP,PERSON
2,5075286,Karan,1,0,5,NNP,PERSON
3,5075286,Sneha,1,13,18,NNP,PERSON
4,5075286,Karan,2,99,104,NNP,PERSON
...,...,...,...,...,...,...,...
1484991,20927392,Rizwan,42,4548,4554,NNP,PERSON
1484992,20927392,Barack,42,4590,4596,NNP,PERSON
1484993,20927392,Obama,42,4597,4602,NNP,PERSON
1484994,20927392,Khan,42,4633,4637,NNP,PERSON
