In [8]:
import re
import pandas as pd
import os
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [9]:
def extract_sentence_info(line):
    """Parse a ``Sentence #N (M tokens, sentiment: S):`` header line.

    Returns:
        ``(sentence_number, token_count, sentiment)`` with the two counts as
        ints, or ``(None, None, None)`` when the line is not such a header.
    """
    header = re.match(r"Sentence #(\d+) \((\d+) tokens, sentiment: (.+?)\):", line)
    if header is None:
        return None, None, None
    sentence_number, token_count, sentiment = header.groups()
    return int(sentence_number), int(token_count), sentiment

def extract_token_info(line):
    """Parse one bracketed ``[Text=... SentimentClass=...]`` token line.

    Returns:
        A 7-tuple of strings (text, character offset begin, character offset
        end, part of speech, lemma, named-entity tag, sentiment class), or a
        7-tuple of ``None`` when the line does not start with that pattern.
    """
    token_pattern = r"\[Text=(.*?) CharacterOffsetBegin=(\d+) CharacterOffsetEnd=(\d+) PartOfSpeech=(.*?) Lemma=(.*?) NamedEntityTag=(.*?) SentimentClass=(.*?)\]"
    found = re.match(token_pattern, line)
    # One placeholder per capture group so callers can always unpack 7 values.
    return found.groups() if found else (None,) * 7

def extract_dependency_info(line):
    """Parse a dependency line of the form ``type(word1-idx1, word2-idx2)``.

    Each argument is split on its right-most hyphen, so words that themselves
    contain hyphens keep them.

    Returns:
        ``(dep_type, word_1, word_1_idx, word_2, word_2_idx)`` with the two
        indices as ints.

    Raises:
        IndexError: if the line lacks the ``(``, the separating space, or a
            hyphen in either argument.
        ValueError: if an index is not an integer.
    """
    pieces = line.split("(", 1)
    dep_type = pieces[0]
    arguments = pieces[1].split(" ", 1)

    # Drop the trailing "," / ")" and split each argument at its last hyphen.
    first = arguments[0][:-1].rsplit("-", 1)
    second = arguments[1][:-1].rsplit("-", 1)

    word_1, word_1_idx = first[0], first[1]
    word_2, word_2_idx = second[0], second[1]
    return dep_type, word_1.strip(), int(word_1_idx), word_2.strip(), int(word_2_idx)

def extract_entity_mentions_info(line):
    """Parse a tab-separated entity-mention line.

    The line holds three tab-separated fields: the mention text, its type, and
    either ``-`` or a decimal number optionally prefixed by ``label:``.

    Returns:
        ``(word, word_type, probability)`` where ``probability`` is a float, or
        ``None`` when the third field is ``-``; ``(None, None, None)`` when the
        line does not match.
    """
    entity_pattern = r"([^\t]+)\t([^\t]+)\t(?:[^\t:]+:)?(-?\d*\.\d+|-)"
    found = re.match(entity_pattern, line)
    if not found:
        return None, None, None

    word = found.group(1)
    word_type = found.group(2)
    raw_probability = found.group(3)
    if raw_probability == '-':
        probability = None
    else:
        probability = float(raw_probability)
    return word, word_type, probability

def extract_coreference_info(line):
    """Parse a coreference line ``A -> B, that is: "x" -> "y"``.

    Returns:
        ``(coords_1, coords_2, word_1, word_2)``; the coordinates are returned
        as the raw text on either side of the first ``" -> "``, and the words
        have their first and last characters (the surrounding quotes) removed.

    Raises:
        IndexError: if either separator is missing from the line.
    """
    halves = line.split(", that is: ")
    coordinates = halves[0].split(" -> ")
    coords_1, coords_2 = coordinates[0], coordinates[1]
    mentions = halves[1].split(" -> ")
    # [1:-1] strips the enclosing quote characters from each mention.
    word_1, word_2 = mentions[0][1:-1], mentions[1][1:-1]
    return coords_1, coords_2, word_1, word_2

In [10]:
def process_file(file_path):
    """Parse one NLP text-output file into row tuples for every result table.

    The file is scanned line by line. Header lines ("Sentence #", "Tokens",
    "Constituency parse", ...) switch the current section; every other
    non-empty line is parsed according to the section it falls in. The three
    multi-line parses are accumulated in ``parse_data`` and flushed to their
    table when the first content line of the following section is seen.

    Args:
        file_path: Path whose name contains ``nlp_movie_<digits>.txt``; the
            digits become the movie id stored in every row.

    Returns:
        A dict mapping 'sentences', 'tokens', 'constituency_parse',
        'binary_parse', 'sentiment_tree', 'dependencies', 'entity_mentions'
        and 'coreferences' to lists of row tuples, or ``None`` when the file
        cannot be decoded as UTF-8.

    Raises:
        AttributeError: if ``file_path`` does not contain the expected name.
    """
    # The dot is escaped so that only a literal ".txt" suffix is accepted.
    movie_id = re.search(r"nlp_movie_(\d+)\.txt", file_path).group(1)
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        current_section = None
        prev_section = None
        parse_data = ''
        data = {
            'sentences': [],
            'tokens': [],
            'constituency_parse': [],
            'binary_parse': [],
            'sentiment_tree': [],
            'dependencies': [],
            'entity_mentions': [],
            'coreferences': []
        }

        for line in lines:
            line = line.strip()

            if line.startswith("Sentence #"):
                current_section = 'sentence'
                sentence_id, tokens, sentiment = extract_sentence_info(line)
                data['sentences'].append((int(movie_id), int(sentence_id), int(tokens), sentiment))
            elif line.startswith("Tokens"):
                current_section = 'tokens'
            elif line.startswith("Constituency parse"):
                current_section = 'constituency_parse'
            elif line.startswith("Binary Constituency parse"):
                current_section = 'binary_parse'
            elif line.startswith("Sentiment-annotated binary tree"):
                current_section = 'sentiment_tree'
            elif line.startswith("Dependency Parse (enhanced plus plus dependencies):"):
                current_section = 'dependencies'
            elif line.startswith("Extracted the following NER entity mentions:"):
                current_section = 'entity_mentions'
            elif line.startswith("Coreference set:"):
                current_section = 'coreferences'
            else:
                if current_section == 'tokens' and line:
                    word, cob, coe, pos, lemma, ner, sentiment = extract_token_info(line)
                    data['tokens'].append((int(movie_id), int(sentence_id), word, int(cob), int(coe), pos, lemma, ner, sentiment))
                    prev_section = 'tokens'
                elif current_section == 'constituency_parse' and line:
                    # Parse trees span several lines; join them with spaces.
                    parse_data += line + ' '
                    prev_section = 'constituency_parse'
                elif current_section == 'binary_parse' and line:
                    if prev_section == 'constituency_parse':
                        # First binary-parse line: the constituency parse is complete.
                        data['constituency_parse'].append((int(movie_id), int(sentence_id), parse_data.strip()))
                        parse_data = ''
                    parse_data += line + ' '
                    prev_section = 'binary_parse'
                elif current_section == 'sentiment_tree' and line:
                    if prev_section == 'binary_parse':
                        # First sentiment-tree line: the binary parse is complete.
                        data['binary_parse'].append((int(movie_id), int(sentence_id), parse_data.strip()))
                        parse_data = ''
                    parse_data += line + ' '
                    prev_section = 'sentiment_tree'
                elif current_section == 'dependencies' and line:
                    if prev_section == 'sentiment_tree':
                        # First dependency line: the sentiment tree is complete.
                        data['sentiment_tree'].append((int(movie_id), int(sentence_id), parse_data.strip()))
                        parse_data = ''
                    dep_type, word_1, word_1_idx, word_2, word_2_idx = extract_dependency_info(line)
                    if dep_type is None:
                        # Fixed: the message previously referenced the undefined name
                        # `setence_id`, which raised NameError instead of reporting.
                        print(f"None dependency found: movie_id {movie_id}, sentence_id: {sentence_id}")
                        print(line)
                    data['dependencies'].append((int(movie_id), int(sentence_id), dep_type, word_1, int(word_1_idx), word_2, int(word_2_idx)))
                    prev_section = 'dependencies'
                elif current_section == 'entity_mentions' and line:
                    word, entity_type, optional_probability = extract_entity_mentions_info(line)
                    if word is None:
                        # Fixed: same `setence_id` typo as above. The unparsed line is
                        # reported and a row of Nones is still recorded for it.
                        print(f"None entity found: movie_id {movie_id}, sentence_id: {sentence_id}")
                        print(line)
                    data['entity_mentions'].append((int(movie_id), int(sentence_id), word, entity_type, optional_probability))
                    prev_section = 'entity_mentions'
                elif current_section == 'coreferences' and line:
                    # Coreference rows carry no sentence id, only the movie id.
                    coords_1, coords_2, word_1, word_2 = extract_coreference_info(line)
                    data['coreferences'].append((int(movie_id), coords_1, coords_2, word_1, word_2))
                    prev_section = 'coreferences'

        return data
    except UnicodeDecodeError:
        print(f"Cannot read file {file_path}")
        # Explicit: callers treat a falsy result as "nothing to add".
        return None

In [11]:
# Input directory holding the NLP result files, and thread-pool size.
nb_workers = 16
folder_path = "../nlp_results/"

# Column layout of every result table.
columns_sentences = ['Wikipedia_movie_id', 'Sentence_id', 'Nb_tokens', 'Sentiment']
columns_tokens = [
    "Wikipedia_movie_id", "Sentence_id", "Word", "COB", "COE",
    "POS", "Lemma", "NER", "Sentiment",
]
columns_constituency_parse = ["Wikipedia_movie_id", "Sentence_id", "Constituency_parse"]
columns_binary_parse = ["Wikipedia_movie_id", "Sentence_id", "Binary_parse"]
columns_sentiment_tree = ["Wikipedia_movie_id", "Sentence_id", "Sentiment_tree"]
columns_dependencies = [
    "Wikipedia_movie_id", "Sentence_id", "Dependency_type",
    "Word_1", "Word_1_idx", "Word_2", "Word_2_idx",
]
columns_entities = ["Wikipedia_movie_id", "Sentence_id", "Word", "Entity_type", "Optional_probability"]
columns_coreference = ["Wikipedia_movie_id", "Coordinates_1", "Coordinates_2", "Word_1", "Word_2"]

# Empty frames, one per table, that the parsed rows are appended to.
df_sentences = pd.DataFrame(columns=columns_sentences)
df_tokens = pd.DataFrame(columns=columns_tokens)
df_constituency_parse = pd.DataFrame(columns=columns_constituency_parse)
df_binary_parse = pd.DataFrame(columns=columns_binary_parse)
df_sentiment_tree = pd.DataFrame(columns=columns_sentiment_tree)
df_dependencies = pd.DataFrame(columns=columns_dependencies)
df_entities = pd.DataFrame(columns=columns_entities)
df_coreference = pd.DataFrame(columns=columns_coreference)


def handle_file(filename):
    """Parse ``filename`` from ``folder_path`` when it is a ``.txt`` file.

    Returns:
        Whatever ``process_file`` returns for ``.txt`` files, otherwise ``None``.
    """
    file_path = os.path.join(folder_path, filename)
    is_text_file = filename.endswith('.txt')
    return process_file(file_path) if is_text_file else None


# Per-file frames are gathered per table and concatenated ONCE after the loop.
# Calling pd.concat inside the loop copies every row accumulated so far for
# each new file, which makes the total cost quadratic in the number of rows.
result_tables = {
    'sentences': columns_sentences,
    'tokens': columns_tokens,
    'constituency_parse': columns_constituency_parse,
    'binary_parse': columns_binary_parse,
    'sentiment_tree': columns_sentiment_tree,
    'dependencies': columns_dependencies,
    'entity_mentions': columns_entities,
    'coreferences': columns_coreference,
}
collected_frames = {table: [] for table in result_tables}

with ThreadPoolExecutor(max_workers=nb_workers) as executor:
    future_to_file = {executor.submit(handle_file, filename): filename for filename in os.listdir(folder_path)}
    progress = tqdm(total=len(future_to_file), desc="Processing Files")
    for future in as_completed(future_to_file):
        try:
            data = future.result()
        except Exception:
            # Name the offending file before propagating, so a failure deep
            # into a long run can be traced back to its input.
            print(f"Processing failed for {future_to_file[future]}")
            raise
        if data:
            for table, table_columns in result_tables.items():
                # Files without rows for a table contribute nothing to it.
                if data[table]:
                    collected_frames[table].append(pd.DataFrame(data[table], columns=table_columns))
        progress.update(1)

progress.close()

# One concat per table. The existing (empty) frame stays first so the result
# is defined even when no file produced rows; per-file row indices are kept,
# exactly as repeated concatenation would have left them.
df_sentences = pd.concat([df_sentences, *collected_frames['sentences']])
df_tokens = pd.concat([df_tokens, *collected_frames['tokens']])
df_constituency_parse = pd.concat([df_constituency_parse, *collected_frames['constituency_parse']])
df_binary_parse = pd.concat([df_binary_parse, *collected_frames['binary_parse']])
df_sentiment_tree = pd.concat([df_sentiment_tree, *collected_frames['sentiment_tree']])
df_dependencies = pd.concat([df_dependencies, *collected_frames['dependencies']])
df_entities = pd.concat([df_entities, *collected_frames['entity_mentions']])
df_coreference = pd.concat([df_coreference, *collected_frames['coreferences']])

Processing Files:   0%|          | 0/42303 [00:00<?, ?it/s]

Run times: 18h45 for tokens and dependencies (parallel bug?); 2h20 for sentences, entities, and coreferences; 40 min for constituency parses, binary parses, and sentiment trees.

In [7]:
# Report which columns contain missing values. As a bare expression in the
# middle of the cell this check was evaluated but its result was never shown.
print(df_sentences.isna().any())
# Downcast the integer columns to the smallest integer dtype that fits.
df_sentences['Wikipedia_movie_id'] = pd.to_numeric(df_sentences['Wikipedia_movie_id'], downcast='integer')
df_sentences['Sentence_id'] = pd.to_numeric(df_sentences['Sentence_id'], downcast='integer')
df_sentences['Nb_tokens'] = pd.to_numeric(df_sentences['Nb_tokens'], downcast='integer')
df_sentences['Sentiment'] = df_sentences['Sentiment'].astype('string')
df_sentences.info()
df_sentences.to_parquet('df_sentences.parquet', compression='Brotli')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 662514 entries, 0 to 34
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Wikipedia_movie_id  662514 non-null  int32 
 1   Sentence_id         662514 non-null  int16 
 2   Nb_tokens           662514 non-null  int16 
 3   Sentiment           662514 non-null  string
dtypes: int16(2), int32(1), string(1)
memory usage: 15.2 MB


In [15]:
# Row count and per-column missing-value report. Both were bare expressions
# that are not the last statement of the cell, so nothing was displayed.
print(len(df_tokens))
print(df_tokens.isna().any())

# Downcast the integer columns to the smallest integer dtype that fits.
df_tokens['Wikipedia_movie_id'] = pd.to_numeric(df_tokens['Wikipedia_movie_id'], downcast='integer')
df_tokens['Sentence_id'] = pd.to_numeric(df_tokens['Sentence_id'], downcast='integer')
df_tokens['COB'] = pd.to_numeric(df_tokens['COB'], downcast='integer')
df_tokens['COE'] = pd.to_numeric(df_tokens['COE'], downcast='integer')
columns_to_convert = ['Word', 'POS', 'Lemma', 'NER', 'Sentiment']
df_tokens[columns_to_convert] = df_tokens[columns_to_convert].astype('string')
df_tokens.info()
df_tokens.to_parquet('df_tokens.parquet', compression='Brotli')

15046378

In [6]:
# Downcast the id columns to the smallest integer dtype that fits, store the
# parse as a pandas string column, then write the table to Parquet.
df_constituency_parse['Wikipedia_movie_id'] = (
    df_constituency_parse['Wikipedia_movie_id'].pipe(pd.to_numeric, downcast='integer')
)
df_constituency_parse['Sentence_id'] = (
    df_constituency_parse['Sentence_id'].pipe(pd.to_numeric, downcast='integer')
)
df_constituency_parse['Constituency_parse'] = (
    df_constituency_parse['Constituency_parse'].astype('string')
)
df_constituency_parse.info()
df_constituency_parse.to_parquet(
    'df_constituency_parse.parquet',
    compression='Brotli',
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 662514 entries, 0 to 50
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Wikipedia_movie_id  662514 non-null  int32 
 1   Sentence_id         662514 non-null  int16 
 2   Constituency_parse  662514 non-null  string
dtypes: int16(1), int32(1), string(1)
memory usage: 13.9 MB


In [7]:
# Downcast the id columns to the smallest integer dtype that fits, store the
# parse as a pandas string column, then write the table to Parquet.
df_binary_parse['Wikipedia_movie_id'] = (
    df_binary_parse['Wikipedia_movie_id'].pipe(pd.to_numeric, downcast='integer')
)
df_binary_parse['Sentence_id'] = (
    df_binary_parse['Sentence_id'].pipe(pd.to_numeric, downcast='integer')
)
df_binary_parse['Binary_parse'] = (
    df_binary_parse['Binary_parse'].astype('string')
)
df_binary_parse.info()
df_binary_parse.to_parquet(
    'df_binary_parse.parquet',
    compression='Brotli',
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 662514 entries, 0 to 50
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Wikipedia_movie_id  662514 non-null  int32 
 1   Sentence_id         662514 non-null  int16 
 2   Binary_parse        662514 non-null  string
dtypes: int16(1), int32(1), string(1)
memory usage: 13.9 MB


In [8]:
# Downcast the id columns to the smallest integer dtype that fits, store the
# tree as a pandas string column, then write the table to Parquet.
df_sentiment_tree['Wikipedia_movie_id'] = (
    df_sentiment_tree['Wikipedia_movie_id'].pipe(pd.to_numeric, downcast='integer')
)
df_sentiment_tree['Sentence_id'] = (
    df_sentiment_tree['Sentence_id'].pipe(pd.to_numeric, downcast='integer')
)
df_sentiment_tree['Sentiment_tree'] = (
    df_sentiment_tree['Sentiment_tree'].astype('string')
)
df_sentiment_tree.info()
df_sentiment_tree.to_parquet(
    'df_sentiment_tree.parquet',
    compression='Brotli',
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 662514 entries, 0 to 50
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Wikipedia_movie_id  662514 non-null  int32 
 1   Sentence_id         662514 non-null  int16 
 2   Sentiment_tree      662514 non-null  string
dtypes: int16(1), int32(1), string(1)
memory usage: 13.9 MB


In [30]:
# Downcast the id and word-index columns to the smallest integer dtype that fits.
df_dependencies['Wikipedia_movie_id'] = (
    df_dependencies['Wikipedia_movie_id'].pipe(pd.to_numeric, downcast='integer')
)
df_dependencies['Sentence_id'] = (
    df_dependencies['Sentence_id'].pipe(pd.to_numeric, downcast='integer')
)
df_dependencies['Word_1_idx'] = (
    df_dependencies['Word_1_idx'].pipe(pd.to_numeric, downcast='integer')
)
df_dependencies['Word_2_idx'] = (
    df_dependencies['Word_2_idx'].pipe(pd.to_numeric, downcast='integer')
)
# The remaining text columns become pandas string columns.
columns_to_convert = ['Dependency_type', 'Word_1', 'Word_2']
df_dependencies[columns_to_convert] = df_dependencies[columns_to_convert].astype('string')
df_dependencies.info()
df_dependencies.to_parquet(
    'df_dependencies.parquet',
    compression='Brotli',
)

In [13]:
df_entities.isna().any()
df_entities['Wikipedia_movie_id'] = pd.to_numeric(df_entities['Wikipedia_movie_id'], downcast='integer')
df_entities['Sentence_id'] = pd.to_numeric(df_entities['Sentence_id'], downcast='integer')
df_entities['Optional_probability'] = pd.to_numeric(df_entities['Optional_probability'], downcast='float')
df_entities['Word'] = df_entities['Word'].astype('string')
df_entities['Entity_type'] = df_entities['Entity_type'].astype('string')
df_entities.info()
df_entities.to_parquet('df_entities.parquet', compression='Brotli')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2220379 entries, 0 to 99
Data columns (total 5 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Wikipedia_movie_id    int32  
 1   Sentence_id           int16  
 2   Word                  string 
 3   Entity_type           string 
 4   Optional_probability  float32
dtypes: float32(1), int16(1), int32(1), string(2)
memory usage: 72.0 MB


In [17]:
# Report which columns contain missing values. As a bare expression in the
# middle of the cell this check was evaluated but its result was never shown.
print(df_coreference.isna().any())
# Downcast the movie id to the smallest integer dtype that fits.
df_coreference['Wikipedia_movie_id'] = pd.to_numeric(df_coreference['Wikipedia_movie_id'], downcast='integer')
columns_to_convert = ['Coordinates_1', 'Coordinates_2', 'Word_1', 'Word_2']
df_coreference[columns_to_convert] = df_coreference[columns_to_convert].astype('string')
df_coreference.info()
df_coreference.to_parquet('df_coreference.parquet', compression='Brotli')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1879672 entries, 0 to 76
Data columns (total 6 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   Wikipedia_movie_id  int32 
 1   Sentence_id         int16 
 2   Coordinates_1       string
 3   Coordinates_2       string
 4   Word_1              string
 5   Word_2              string
dtypes: int16(1), int32(1), string(4)
memory usage: 82.5 MB
