# Setup

In [1]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U PyYAML

Collecting PyYAML
[?25l  Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)
[K     |▌                               | 10kB 23.7MB/s eta 0:00:01[K     |█                               | 20kB 30.6MB/s eta 0:00:01[K     |█▌                              | 30kB 35.4MB/s eta 0:00:01[K     |██                              | 40kB 24.0MB/s eta 0:00:01[K     |██▋                             | 51kB 14.3MB/s eta 0:00:01[K     |███                             | 61kB 12.3MB/s eta 0:00:01[K     |███▋                            | 71kB 13.7MB/s eta 0:00:01[K     |████▏                           | 81kB 15.0MB/s eta 0:00:01[K     |████▋                           | 92kB 15.7MB/s eta 0:00:01[K     |█████▏                          | 102kB 16.4MB/s eta 0:00:01[K     |█████▋                          | 112kB 16.4MB/s eta 0:00:01[K     |██████▏                       

In [3]:
# Load repository
!git clone https://github.com/dafrie/fin-disclosures-nlp.git &> /dev/null
%cd /content/fin-disclosures-nlp
!git pull

/content/fin-disclosures-nlp
Already up to date.


In [4]:
 # Download the spaCy model en_core_web_md; all command output is discarded.
 !python -m spacy download en_core_web_md &> /dev/null

# Extraction

In [5]:
import os
import sys

import pandas as pd
import numpy as np

# Make the current working directory (the cloned repository) importable.
sys.path.append('./')

# Reload edited project modules automatically before executing each cell.
%load_ext autoreload
%autoreload 2

import data

# NOTE(review): an OSError raised by this import ends the session via exit().
# Presumably this covers the freshly downloaded spaCy model not being loadable
# until the runtime has been restarted -- confirm.
try:
  from data import dataframe_preparation
except OSError:
  exit()

# Google Drive locations used by this notebook.
# Root folder with one sub-folder per company; parsed reports are read from it
# and the per-report paragraph CSVs are written next to them.
DATA_INPUT_PATH = "/content/drive/MyDrive/fin-disclosures-nlp/data/inference/annual_reports_600_extracted"
# Report header table (has "id", "company", "input_file" and "status" columns).
HEADER_INPUT_PATH = "/content/drive/MyDrive/fin-disclosures-nlp/data/inference/stox300_reports.csv"
# Destination of the combined paragraph file written by the last cell.
INFERENCE_PARAGRAPHS = "/content/drive/MyDrive/fin-disclosures-nlp/data/inference/stox300_reports_paragraphs.csv"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
from data.preprocessing import DocumentPreprocessor
from data.dataframe_preparation import get_counts_per_page, get_keywords_from_file, get_text_from_page, get_count_matrix
from datetime import datetime
from tqdm.notebook import trange, tqdm_notebook


# Keyword vocabulary loaded from the repository; used below to find the pages
# of a report that contain keyword hits.
vocabulary = get_keywords_from_file("./data/keyword_vocabulary.txt")

def get_paragraphs_of_report(report_row, add_adjunct_pages=True):
    """Split the keyword-relevant pages of one parsed report into paragraphs.

    Parameters
    ----------
    report_row : mapping (e.g. a row of the report header frame)
        Must provide "company" and "input_file"; the parsed report is read
        from ``DATA_INPUT_PATH/<company>/<input_file>``.
    add_adjunct_pages : bool, default True
        If true, the page directly before and directly after every page with
        a keyword hit is included as well.

    Returns
    -------
    list of dict
        One dict per paragraph with the keys "page_no", "paragraph_no", "text"
        and "is_adjunct" (True when the page itself had no keyword hit).
        Pages are emitted in ascending order so the output is deterministic.
    """
    parsed_report_file_path = os.path.join(
        DATA_INPUT_PATH, report_row["company"], report_row["input_file"]
    )

    # Pages with keyword hits
    pages = get_counts_per_page(parsed_report_file_path, vocabulary)
    hit_pages = set(pages.index)
    page_indices = set(hit_pages)

    # Add adjunct pages if necessary
    if add_adjunct_pages:
        for hit_page in hit_pages:
            if hit_page > 0:
                page_indices.add(hit_page - 1)
            # The total page count is not known here, so the following page is
            # always added. NOTE(review): a page past the end is expected to be
            # dropped by the IndexError handler below -- confirm against
            # get_text_from_page.
            page_indices.add(hit_page + 1)

    result = []
    # Sorted iteration: plain set order would make the row order arbitrary.
    for page_no in sorted(page_indices):
        try:
            text = get_text_from_page(parsed_report_file_path, page_no)
            processed_doc = DocumentPreprocessor(text).process()
        except IndexError:
            continue

        is_adjunct = page_no not in hit_pages
        for idx, paragraph in enumerate(processed_doc.split('\n\n')):
            result.append({
                "page_no": page_no,
                "paragraph_no": idx,
                "text": paragraph,
                "is_adjunct": is_adjunct,
            })
    return result


def get_report_df():
  """Read the report header CSV at HEADER_INPUT_PATH, indexed by its "id" column."""
  return pd.read_csv(HEADER_INPUT_PATH).set_index("id")

def save_report_df(df):
  """Write ``df`` (index included) to the header CSV at HEADER_INPUT_PATH."""
  df.to_csv(path_or_buf=HEADER_INPUT_PATH)

In [None]:
# Set to True to put every report whose status is 'processing' back to
# 'should_extract' in the header file.
should_reset = False
if should_reset:
  df_header = get_report_df()
  is_processing = df_header['status'] == 'processing'
  df_header.loc[is_processing, 'status'] = "should_extract"
  save_report_df(df_header)

In [None]:
# Extraction loop. The header CSV is used as a work queue: the status of a
# report goes 'should_extract' -> 'processing' -> 'processed'. The header file
# is re-read right before every status change and written back immediately.
# NOTE(review): presumably so that several notebook instances can work on the
# same queue -- confirm.
lock_per_firm = False  # True: claim all pending reports of a firm at once; False: claim report by report
has_unprocessed_items = True
while has_unprocessed_items:
  df = get_report_df()
  df = df[df["status"] == "should_extract"]

  if len(df) < 1:
    print("Haven't found any reports to extract from")
    has_unprocessed_items = False
    break

  current_company = str(np.random.choice(df.company.unique()))
  # Boolean mask instead of an f-string query: a company name that contains a
  # quote character would break (or change the meaning of) the query expression.
  company_reports = df[df["company"] == current_company]
  print(f"Processing company {current_company}  with {len(company_reports)} reports. Overall remaining: {len(df.company.unique())} (firms)/{len(df)} (reports)")

  # Lock all company reports
  if lock_per_firm:
    df = get_report_df()
    df.loc[(df['company'] == current_company) & (df['status'] == 'should_extract'), 'status'] = "processing"
    save_report_df(df)

  for index, row in tqdm_notebook(company_reports.iterrows(), total=company_reports.shape[0]):
    if not lock_per_firm:
      # Claim this single report.
      df = get_report_df()
      print(row.name)
      df.loc[row.name, 'status'] = "processing"
      save_report_df(df)

    paragraphs = get_paragraphs_of_report(row, add_adjunct_pages=True)
    if len(paragraphs):
      # The paragraph CSV is stored next to the parsed report, same name with a .csv extension.
      df_report_paragraphs = pd.DataFrame(paragraphs)
      df_report_paragraphs["report_id"] = index
      df_report_paragraphs.to_csv(os.path.join(DATA_INPUT_PATH, row["company"], row["input_file"].replace(".yml", ".csv")))

    if not lock_per_firm:
      df = get_report_df()
      df.loc[row.name, 'status'] = "processed"
      save_report_df(df)

  # Release the firm-level lock. In per-report mode every status has already
  # been written inside the loop, so the header file is not rewritten here.
  if lock_per_firm:
    df = get_report_df()
    df.loc[(df['company'] == current_company) & (df['status'] == 'processing'), 'status'] = 'processed'
    save_report_df(df)

# Debugging

In [9]:
"""
df_bug = get_report_df()
df_bug.loc[df.query("status == 'processing'").index, 'status'] = "error"
save_report_df(df_bug)
"""

In [None]:
"""
missing_reports = []
df_bug = get_report_df()
for index, row in tqdm_notebook(df_bug.iterrows(), total=df_bug.shape[0]):
  path = os.path.join(DATA_INPUT_PATH, row["company"])
  parsed_report_file_path = os.path.join(path, row['input_file'])
  
  if not os.path.isfile(parsed_report_file_path.replace(".yml", ".csv")):
    missing_reports.append(index)

df = get_report_df()
df.loc[missing_reports, 'status'] = "should_extract"
save_report_df(df)
"""

# Assemble final file

In [12]:
# Collect the paths of all per-report paragraph CSVs that exist on disk.
df_csv = get_report_df()
all_csv_files = []
for index, row in tqdm_notebook(df_csv.iterrows(), total=len(df_csv)):
  csv_path = os.path.join(DATA_INPUT_PATH, row["company"], row["input_file"]).replace(".yml", ".csv")
  if not os.path.isfile(csv_path):
    continue
  all_csv_files.append(csv_path)

HBox(children=(FloatProgress(value=0.0, max=4099.0), HTML(value='')))




In [13]:
# Load every per-report CSV and stack them into one frame.
report_frames = [pd.read_csv(csv_path) for csv_path in all_csv_files]
combined_csv = pd.concat(report_frames)

In [14]:
# Remove the index column that was written into the per-report CSVs.
# errors="ignore" keeps this cell re-runnable: it reassigns its own input, so a
# second run would otherwise raise KeyError because the column is already gone.
combined_csv = combined_csv.drop(columns=["Unnamed: 0"], errors="ignore")

In [15]:
# Show the combined paragraph frame as a quick visual check before saving.
combined_csv

Unnamed: 0,page_no,paragraph_no,text,is_adjunct,report_id
0,68,0,Directors’ Remuneration report continued,True,IWGPlc-AR_2017
1,68,1,Policy Table for the Chairman and Non-Executiv...,True,IWGPlc-AR_2017
2,68,2,"Chairman fees Reviewed, but not necessarily in...",True,IWGPlc-AR_2017
3,68,3,\nA single fee which reflects all Board and Co...,True,IWGPlc-AR_2017
4,68,4,There is no prescribed \nmaximum although fees...,True,IWGPlc-AR_2017
...,...,...,...,...,...
45,15,12,SIX Real Estate index,True,FabegeAB-AR_2005
46,15,13,SIX Return Index\ndecnovoctsepaugjuljunmayaprm...,True,FabegeAB-AR_2005
47,15,14,Source: SIX / Hallvarsson & Halvarsson,True,FabegeAB-AR_2005
48,15,15,Wihlborgs is distributed\nto the shareholders ...,True,FabegeAB-AR_2005


In [16]:
# Persist the combined paragraphs to INFERENCE_PARAGRAPHS without the frame index.
combined_csv.to_csv(path_or_buf=INFERENCE_PARAGRAPHS, index=False)