In [1]:
import os
import glob
from pathlib import Path
import polars as pl
import re
import gzip
import sys
import random
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from lxml import etree, objectify
from rapidfuzz import fuzz
from spacy import displacy

import infineac.file_loader as file_loader
import infineac.helper as helper
import infineac.process_event as process_event
import infineac.topic_extractor as topic_extractor
import infineac.process_text as process_text

%load_ext autoreload
%autoreload 2

# Root directory that is searched recursively for transcript XML files.
PATH_DIR = "../data/transcripts/"
# Seeds Python's built-in `random` module only.
random.seed(111)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


## Load Files

Recursively load all XML files below the given directory and build a list of the corresponding events.

In [2]:
# files = [Path("../data/transcripts/2022/15203138_T.xml")]
# Gather every XML transcript below PATH_DIR (recursive search).
files = [xml_path for xml_path in Path(PATH_DIR).rglob("*.xml")]
# Hand-picked entries, addressed by their position in `files`.
false_part = [files[idx] for idx in (27246, 27563, 50740, 58498)]
# Parse all collected files into event dictionaries.
events = file_loader.load_files_from_xml(files)

Files: 100%|██████████| 58613/58613 [02:32<00:00, 384.96it/s]


In [6]:
# Collect the position label of every Q&A speaker that is not one of the
# excluded labels below, prefixed with the index of the event it belongs to.
positions = []
for i, event in enumerate(events):
    print(i+1, end="\r")  # in-place progress counter
    if event['qa'] is None:
        continue
    positions.extend(
        str(i) + ': ' + speaker['position']
        for speaker in event["qa"]
        if speaker['position'] not in ('conference', 'cooperation', 'operator', "unknown participant")
    )

58613

Only consider events that are held after 2022.

In [8]:
# Filter the loaded events with year 2022 and an empty keyword dictionary,
# then show how many events remain.
events_filt = process_event.filter_events(events, year=2022, keywords=dict())
len(events_filt)

Filtering events


Events: 100%|██████████| 58613/58613 [00:14<00:00, 4079.03it/s]


0

In [None]:
# Build one list per overview column from the filtered events.
#
# Only events that carry a "date" key are used, so every list below has exactly
# one entry per event and stays aligned with `dates`. Previously only `dates`
# was filtered, which could leave it shorter than the other lists.
events_dated = [event for event in events_filt if "date" in event]

file = [event['file'] for event in events_dated]
id = [event['id'] for event in events_dated]  # NOTE: shadows the builtin `id`
year_upload = [event['year_upload'] for event in events_dated]
company = [event['company_name'] for event in events_dated]
ticker = [event['company_ticker'] for event in events_dated]
# Drop everything from the first "." onwards.
ticker_new = [re.sub(r'\..*', '', t) for t in ticker]
dates = [event['date'] for event in events_dated]
# Matplotlib date numbers, used as the x-axis of the time plots.
numeric_values = mdates.date2num(dates)

# Q&A text followed by presentation text, built once per event and reused below.
texts = [event['qa_collapsed'] + event['presentation_collapsed'] for event in events_dated]
texts_lower = [str(text).lower() for text in texts]

russia_and_sanction = [process_text.get_russia_and_sanction(text) for text in texts]
election = [process_text.get_elections(text) for text in texts]
# Case-insensitive substring counts.
russia_count = [text.count('russia') for text in texts_lower]
sanction_count = [text.count('sanction') for text in texts_lower]

In [None]:
# Filter the loaded events with year 2022 and the keywords "russia" and
# "ukraine" (each mapped to 1), then show how many events remain.
events_russia = process_event.filter_events(
    events,
    year=2022,
    keywords=dict(russia=1, ukraine=1),
)
len(events_russia)

In [None]:
import spacy_stanza
# Stanza-backed spaCy pipeline restricted to the tokenize processor.
nlp_stanza = spacy_stanza.load_pipeline("en", processors="tokenize")
# Add spaCy's `sentencizer` component to the pipeline.
nlp_stanza.add_pipe('sentencizer')

In [None]:
# Split the text of presentation entry 3 of the first event at newlines.
events_russia[0]['presentation'][3]['text'].split("\n")

In [None]:
# Run the paragraph-level extraction on the newline-split text of presentation
# entry 3 of the first event, with the keywords 'ukraine' and "cgi".
process_event.extract_parts_from_paragraphs(
    events_russia[0]['presentation'][3]['text'].split("\n"),
    ['ukraine', "cgi"],
    nlp=nlp_stanza,
)

In [None]:
# Run the event-level extraction on the first filtered event with the keywords
# 'ukraine' and 'russia'.
process_event.extract_parts_from_event(events_russia[0], ['ukraine', 'russia'], nlp=nlp_stanza)

In [None]:
# Show which object type the pipeline returns for a short input string.
type(nlp_stanza("test"))

In [None]:
# Run the extraction over all events filtered for russia/ukraine and show the
# size of the result.
# NOTE(review): the arguments 0, "list" and "part" are passed positionally; their
# meaning is defined by `extract_parts_from_events` and is not visible here —
# confirm and consider passing them as keywords.
corpus = process_event.extract_parts_from_events(events_russia, {"russia": 1, "ukraine": 1}, 0, "list", "part", nlp_stanza)
len(corpus)

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

In [None]:
# Display the first filtered event for manual inspection.
events_russia[0]

In [None]:
# Display the first corpus entry for manual inspection.
corpus[0]

In [None]:
# Length of every corpus entry, in corpus order.
lengths = list(map(len, corpus))

In [None]:
# Index of the longest corpus entry.
np.array(lengths).argmax()

In [None]:
# Distribution plot of the corpus entry lengths.
sns.displot(np.array(lengths))

In [None]:
import spacy
import spacy_stanza
import stanza
# Download the English Stanza resources.
stanza.download("en")
# nlp = spacy_stanza.load_pipeline("en", processors="tokenize, pos, lemma, constituency, depparse, sentiment, ner")
# Pipeline restricted to the tokenize and lemma processors; it is passed to
# `process_text.process_corpus` in the next step.
nlp = spacy_stanza.load_pipeline("en", processors="tokenize, lemma")

In [None]:
# Process the extracted corpus with the tokenize/lemma pipeline.
docs = process_text.process_corpus(corpus, nlp)

In [None]:
# Convert every processed document to a single string via `list_to_string`.
docs_join = list(map(process_text.list_to_string, docs))

In [None]:
import dill as pickle

# Persist the joined documents and read them back. The `with` blocks make sure
# both file handles are closed, which bare `open(...)` calls did not guarantee.
with open('../output/pickled/docs_join.pkl', 'wb') as pickle_file:
    pickle.dump(docs_join, pickle_file)
# NOTE: unpickling executes arbitrary code; only load files written by this notebook.
with open('../output/pickled/docs_join.pkl', 'rb') as pickle_file:
    docs_join = pickle.load(pickle_file)

In [None]:
# Fit the topic model on the joined documents; the call returns the model
# together with `topics` and `probs`.
topic_model, topics, probs = topic_extractor.bert_inspired(docs_join)

In [None]:
# Display the topic overview of the fitted model.
topic_model.get_topic_info()

In [None]:
# Calculate the topic distributions on a token level (calculate_tokens=True),
# which yields both `topic_distr` and `topic_token_distr`.
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs_join, calculate_tokens=True)

In [None]:
# Visualize the topic distribution stored at index 1 of `topic_distr`.
topic_model.visualize_distribution(topic_distr[1])

In [None]:
# Visualize the token-level distributions of the document at index 1.
# The result is stored under its own name instead of `df`, so that `df` is not
# rebound to a different kind of object than the overview DataFrame.
token_distribution_view = topic_model.visualize_approximate_distribution(docs_join[1], topic_token_distr[1])
token_distribution_view

In [None]:
# Display topic 0 of the fitted model.
topic_model.get_topic(0)

In [None]:
from sentence_transformers import SentenceTransformer, util

# Compare two example sentences in two ways: the similarity reported by the
# `nlp` pipeline and the cosine similarity of sentence-transformer embeddings.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
sentence1 = "Now this slide talks more about the immediate impact where the combination of sanctions and also business destruction in Ukraine leads or may lead to supply chain disruptions."
sentence2 = "We obviously intensively evaluate all the potential impacts, created a task force that meets almost daily to follow on these topics and basically do preventive measures as to limit the potential impact on CEZ."

# Similarity as computed by the `nlp` pipeline's documents.
doc1 = nlp(sentence1)
doc2 = nlp(sentence2)
similarity = doc1.similarity(doc2)

# Embedding-based similarity.
sentences_ = [sentence1, sentence2]
sentence_embeddings = model.encode(sentences_)

print("Cosine Similarity: " + str(util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1]).item()))
# Label typo ("similiarity") corrected.
print("similarity: " + str(similarity))

In [None]:
# import dill as pickle
# pickle.dump(events_russia, open('../output/pickled/events_russia.pkl', 'wb'))
# pickle.dump(events, open('../output/pickled/events.pkl', 'wb'))

Create a DataFrame with the following columns and save it as a csv file:

In [None]:
# Assemble the overview table; each keyword becomes one column, in this order.
df = pd.DataFrame(
    dict(
        file=file,
        year_upload=year_upload,
        company=company,
        ticker=ticker,
        ticker_new=ticker_new,
        date=dates,
        dates_num=numeric_values,
        russia=russia_and_sanction,
        russia_count=russia_count,
        sanction_count=sanction_count,
        election=election,
    )
)

In [None]:
# Presentation text and Q&A text of every russia/ukraine event, joined by a newline.
# NOTE: this rebinds `docs`, which earlier held the output of `process_corpus`.
docs = ["\n".join((event['presentation_collapsed'], event['qa_collapsed'])) for event in events_russia]

In [None]:
# Write the overview table to CSV without the index column.
df.to_csv('../output/data/overview.csv', index=False)

In [None]:
#  df[['company', 'ticker', 'ticker_new', 'date', 'russia', 'russia_count', 'sanction_count']].to_csv('../output/data/overview_jakob.csv', index=False)
# with open('../output/data/overview_jakob.csv', 'rb') as f_in:
#     with gzip.open('../output/data/overview_jakob.csv.gz', 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

Create 15 samples for each category (russia and election) and save them in the
corresponding directory.

In [None]:
# Draw sample files per category: 8 'russia' + 7 'russia & sanctions' rows, and
# 15 'presidential election' rows.
#
# - The requested size is capped at the number of available rows, because
#   `DataFrame.sample(n)` raises when n exceeds the row count.
# - `random_state` makes the draw reproducible; `random.seed` does not affect
#   pandas sampling.
# - As before, each variable is only defined when its guarding category is
#   non-empty.
russia_rows = df[df['russia'] == 'russia']
russia_sanction_rows = df[df['russia'] == 'russia & sanctions']
election_rows = df[df['election'] == 'presidential election']

if len(russia_sanction_rows) > 0:
    sample_files_russia = (
        russia_rows.sample(min(8, len(russia_rows)), random_state=111)['file'].tolist()
        + russia_sanction_rows.sample(min(7, len(russia_sanction_rows)), random_state=111)['file'].tolist()
    )
if len(election_rows) > 0:
    sample_files_election = election_rows.sample(min(15, len(election_rows)), random_state=111)['file'].tolist()

In [None]:
# Replace the contents of the russia sample folder with the current samples.
# Runs only if `sample_files_russia` was defined by the sampling step.
# Loop variables are named so they do not overwrite the notebook-level `files`
# (all transcript paths) and `file` (overview column) variables.
if 'sample_files_russia' in globals():
    folder = '../output/sample transcripts/russia/'
    os.makedirs(folder, exist_ok=True)  # listing a missing folder would raise
    # delete all files in folder
    for stale_name in os.listdir(folder):
        os.remove(os.path.join(folder, stale_name))
    # copy sample files to folder
    for sample_path in sample_files_russia:
        shutil.copy(sample_path, folder)

In [None]:
# Replace the contents of the election sample folder with the current samples.
# Runs only if `sample_files_election` was defined by the sampling step. The
# name is checked as a string: the previous unquoted `sample_files_election in
# globals()` raised NameError exactly when the variable was missing.
# Loop variables are named so they do not overwrite the notebook-level `files`
# (all transcript paths) and `file` (overview column) variables.
if 'sample_files_election' in globals():
    folder = '../output/sample transcripts/election/'
    os.makedirs(folder, exist_ok=True)  # listing a missing folder would raise
    # delete all files in folder
    for stale_name in os.listdir(folder):
        os.remove(os.path.join(folder, stale_name))
    # copy sample files to folder
    for sample_path in sample_files_election:
        shutil.copy(sample_path, folder)

## Figures

Count the number of occasions where the word "russia" (and "sanction") appears in the earnings call.

In [None]:
# Stacked percent histogram of the "russia" counts, restricted to rows with at
# least one mention and coloured by the `russia` category.
sns.histplot(
    data=df.loc[df['russia_count'] >= 1],
    x='russia_count',
    hue="russia",
    bins=50,
    stat="percent",
    common_norm=True,
    multiple="stack",
)
plt.gca().set(
    xlabel='Count of corresponding terms in the earnings calls',
    ylabel='Percent',
    title='Term count',
)
plt.show()

Average mentions of "russia" per earnings call.

In [None]:
# Mean of the per-event "russia" counts.
np.mean(russia_count)

Average mentions of "russia" per earnings call if "russia" is mentioned at
least once.

In [None]:
# Mean of the per-event "russia" counts over events with at least one mention.
np.mean([count for count in russia_count if count > 0])

Average mentions of "sanction" per earnings call.

In [None]:
# Mean number of "sanction" occurrences in the lower-cased `body_orig` of each event.
# NOTE(review): this iterates over all `events` and reads `body_orig`, whereas the
# "russia" averages use `russia_count`, which is built from `events_filt` and the
# collapsed texts — confirm the different data source is intended.
np.array([event['body_orig'].lower().count('sanction') for event in events]).mean()

Event distribution over time highlighting the different categories.

In [None]:
# Stacked histogram of the events over time, coloured by the `russia` category.
# Bars are normalised jointly (common_norm=True) and shown in percent.
sns.displot(df, x="dates_num", hue="russia", bins=50, stat="percent", common_norm=True, multiple="stack")
plt.xlabel('Dates')
plt.ylabel('Percent')  # was 'Probability'; the histogram uses stat="percent"
plt.title('Event Distribution')

# `dates_num` holds Matplotlib date numbers, so render the x ticks as dates.
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))  # Customize date format as needed

plt.gcf().autofmt_xdate()  # Adjusts the date labels rotation for better visibility

plt.show()

Event proportion over time highlighting the different categories.

In [None]:
# Per-bin share of each `russia` category over time (multiple="fill" stretches
# every bin to the full height).
sns.displot(df, x="dates_num", hue="russia", bins=50, stat="proportion", common_norm=True, multiple="fill")
plt.xlabel('Dates')
plt.ylabel('Proportion')  # typo 'Propotion' corrected
plt.title('Event Proportion')  # typo 'Propotion' corrected

# `dates_num` holds Matplotlib date numbers, so render the x ticks as dates.
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))  # Customize date format as needed

plt.gcf().autofmt_xdate()  # Adjusts the date labels rotation for better visibility

plt.show()

Polars

In [None]:
# Event keys that are carried over into the trimmed records.
keys_to_keep = ['file', 'body_orig']

In [None]:
# Reduce every event to the keys listed in `keys_to_keep`.
# NOTE(review): `events_new` is not defined in any visible cell, so this fails on a
# fresh kernel unless it is created elsewhere — presumably `events` was meant; confirm.
events_new_trimmed = [{key: event[key] for key in keys_to_keep} for event in events_new]

In [None]:
# Build a Polars DataFrame from the trimmed event dictionaries.
eventsDF = pl.from_dicts(events_new_trimmed)

In [None]:
# Add one regex match-count column per term, all computed from `body_orig`.
# NOTE(review): newer Polars releases appear to rename `str.count_match` to
# `str.count_matches` — confirm against the installed version.
body_text = pl.col("body_orig").str
eventsDF = eventsDF.with_columns(
    body_text.count_match('(E|e)lections').alias("elections_count"),
    body_text.count_match('(S|s)anctions').alias("sanctions_count"),
    body_text.count_match('(R|r)ussia').alias("russia_count"),
)

In [None]:
# Number of events whose body matches the elections pattern at least once.
eventsDF.filter(pl.col("elections_count") > 0).height

In [None]:
# Number of events whose body matches both the sanctions and the russia pattern.
has_sanctions = pl.col("sanctions_count") > 0
has_russia = pl.col("russia_count") > 0
eventsDF.filter(has_sanctions & has_russia).height