# Manipulate data into useable form

In [1]:
import numpy as np
import pandas as pd
import os
import nltk

Set up directory paths

In [2]:
current_dir = os.getcwd() # Get the current working directory
parent_dir = os.path.dirname(current_dir) # Get the parent directory
root_path = parent_dir + "/scraper/mf-scraper/mfs-data/"

Extract data

In [3]:
ticker_dirs = os.listdir(root_path)
i = 0 # i is the row number

data_dict = dict()

for ticker in ticker_dirs:
    year_dirs = os.listdir(f"{root_path}/{ticker}/")
    for year in year_dirs:
        quarter_dirs = os.listdir(f"{root_path}/{ticker}/{year}/")
        for quarter in quarter_dirs:
            speaker_dirs = os.listdir(f"{root_path}/{ticker}/{year}/{quarter}")
            for speaker in speaker_dirs:
                remark_files = os.listdir(f"{root_path}/{ticker}/{year}/{quarter}/{speaker}")

                remark_files = sorted(remark_files)
                
                speaker_data = speaker.split("_")
                name = speaker_data[0]
                context = speaker_data[1]
                is_asker = speaker_data[2]

                remark_number = 0
                for remark_file in remark_files:
                    remark_path = f"{root_path}/{ticker}/{year}/{quarter}/{speaker}/{remark_file}"
                    with open(remark_path) as f:
                        text = f.read()
                    # tokenize into sentences
                    sentences = nltk.tokenize.sent_tokenize(text)
                    for index, sentence in enumerate(sentences):
                        data_dict[i] = {"ticker" : ticker, "year": int(year), "quarter": int(quarter), "speaker-name": name, "speaker-context": context, "is-asker": is_asker, "remark-number": remark_number, "sentence-number": index, "sentence": sentence}
                        i += 1
                    remark_number += 1

Into dataframe

In [4]:
df = pd.DataFrame.from_dict(data_dict, orient='index')

In [5]:
df

Unnamed: 0,ticker,year,quarter,speaker-name,speaker-context,is-asker,remark-number,sentence-number,sentence
0,TSLA,2022,4,rod-lache,wolfe-research,True,0,0,I think I'm unmuted.
1,TSLA,2022,4,rod-lache,wolfe-research,True,0,1,Can you hear me?
2,TSLA,2022,4,rod-lache,wolfe-research,True,0,2,"Martin Viecha\nYes, we can."
3,TSLA,2022,4,rod-lache,wolfe-research,True,1,0,OK.
4,TSLA,2022,4,rod-lache,wolfe-research,True,1,1,Thank you.
...,...,...,...,...,...,...,...,...,...
46420,LI,2020,3,kai-wang,chief-technology-officer,False,5,25,Thank you.
46421,LI,2020,3,kai-wang,chief-technology-officer,False,5,26,Unknown speaker\nThank you.
46422,LI,2020,3,kai-wang,chief-technology-officer,False,5,27,Operator\nThank you.
46423,LI,2020,3,kai-wang,chief-technology-officer,False,5,28,We have our next question from the line of Rob...


Write to csv

In [6]:
df.to_csv("ec_sentence_data.csv")