In [1]:
# pip install tabula-py

In [2]:
# pip install tabulate

In [3]:
from tabula import read_pdf
from tabulate import tabulate
from os import listdir
from os.path import isfile, join
import pandas as pd

In [4]:
def get_raw_prediction_dfs(path):
    dfs = read_pdf(
        path,
        pages='all', 
        lattice=True,
        pandas_options = {'header':None},
        multiple_tables = True,
        silent=True
    )
    
    # usually data df has many columns, tunable. 
    valid_col_num = 7
    valid_dfs = filter(lambda df: df.shape[-1] > valid_col_num, dfs)
    raw_prediction_dfs = list(map(lambda df: df.dropna(how='all'), valid_dfs))
    return raw_prediction_dfs

In [5]:
raw_prediction_dfs = get_raw_prediction_dfs('./data_es/LikeFolioSundayEarningsSheet20210313.pdf')
raw_prediction_dfs[0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,,LikeFolio Sunday Earnings Sheet,,,,,,,,,,,
1,,Week 9\r2021-03-15,,,,,,,,,,,
2,,Company,Earnings\rScore,Buzz\rGrowth,Demand\rGrowth,Happiness\rGrowth,YoY Change,Expected\rMove,Short\rInterest,Andy's\rTrade,Landon's\rTrade,System\rTrade,Commentary
3,,Designer Brands\rDBI\r$15.36\rTue before market,-51,,,,PI: -48.2%\rStock: +65.4%,+ or -\r$2.20\r(14.3%),14.5%\rShort\r(02/26/21),Buy OTM\rPuts,Very\rBearish\rSpread,Coin Flip\rBearish,The re-opening narrative has been pushing the ...
5,,Kodak\rKODK\r$9.76\rTue after market,+65,,,,PI: +3.1%\rStock: +378.4%,+ or -\r$2.47\r(25.3%),22.9%\rShort\r(02/26/21),,,Very\rBullish\rSpread,"Got squeezed to high heaven last July, followi..."


In [6]:
LINE_SEPARATOR = '\r'

In [7]:
import datetime as dt
def get_date(data):
    fmts = ('%Y','%b %d, %Y','%b %d, %Y','%B %d, %Y','%B %d %Y','%m/%d/%Y','%m/%d/%y','%b %Y','%B%Y','%b %d,%Y', 
            '%m-%d-%Y', '%Y-%m-%d')
    for fmt in fmts:
        try:
            t = dt.datetime.strptime(data, fmt)
            date = str(t).split(' ')[0]
            return date
        except:
            pass
    return None

In [8]:
get_date('2021-05-10')


'2021-05-10'

In [9]:
from enum import Enum
class ColType(Enum):
    MORE_LINE_TICKER = 1
    THREE_LINE_TICKER = 2
    NAN = 3
    DATE = 4
    OTHER = 5
    
def col_type(col):
    if pd.isna(col):
        return ColType.NAN
    else:
        values = col.split(LINE_SEPARATOR)        
        if len(values) == 3 and values[1].isupper() and values[2].startswith('$'):
            return ColType.THREE_LINE_TICKER
        elif len(values) in [4, 5] and values[1].isupper() and values[2].startswith('$'):
            return ColType.MORE_LINE_TICKER
        elif len(values) == 2 and get_date(values[1]) :
            return ColType.DATE
        else:
            return ColType.OTHER

In [10]:
test_cols = [float('nan'), 'Kodak\rKODK\r$9.76\rTue after market', 'Week 9\r2021-03-15']
for col in test_cols:
    print(col_type(col))

ColType.NAN
ColType.MORE_LINE_TICKER
ColType.DATE


In [11]:
def get_week_and_df_builder(raw_df):
    stop_token = 'Unconfirmed Earnings'
    is_unconfirmed = False
    df_builder = []
    week = ''
    for idx, row in raw_df.iterrows():
#         print(row.tolist())
        if is_unconfirmed:
            break
        row_builder = []
        cols = row.tolist()
        # pointer to col
        idx = 0
        while idx < len(cols):
            cur_col = cols[idx]
            cur_type = col_type(cur_col)
#             print(cur_type)
            if cur_type == ColType.DATE:
                week = get_date(cur_col.split(LINE_SEPARATOR)[-1])
                break
            elif cur_type == ColType.THREE_LINE_TICKER:
                values = cur_col.split(LINE_SEPARATOR)
                # Company, ticker
                row_builder = [values[0], values[1]]
                # Get weekday info 
                row_builder.append(cols[idx + 1])
                # Get earning score
                row_builder.append(cols[idx + 2])
                df_builder.append(row_builder)
                break
            elif cur_type == ColType.MORE_LINE_TICKER:
                values = cur_col.split(LINE_SEPARATOR)
                # Company, ticker, weekday
                row_builder = [values[0], values[1], values[3]]
                # Get earning score
                row_builder.append(cols[idx + 1])
                df_builder.append(row_builder)
                break
            elif stop_token in str(cur_col):
                is_unconfirmed = True
                break
            idx += 1
        
    
    return week, df_builder, is_unconfirmed

In [12]:
def earning_prediction_df_from_file(path):
    raw_prediction_dfs = get_raw_prediction_dfs(path)
    week_value = None 
    final_df_builder = []
    for df in raw_prediction_dfs:
        week, df_builder, is_following_unconfirmed = get_week_and_df_builder(df)
        if week:
            week_value = week
        final_df_builder.extend(df_builder)
        if is_following_unconfirmed:
            break
    final_df = pd.DataFrame(final_df_builder, columns = ['Company', 'Ticker', 'Earning Time', 'Earning Score'])
    final_df['Starting Week'] = week_value
    
    return final_df

In [13]:
test_df = earning_prediction_df_from_file('./data_es\/LikeFolioSundayEarningsSheet_20210509.pdf')
test_df.head()

Unnamed: 0,Company,Ticker,Earning Time,Earning Score,Starting Week
0,Nautilus,NLS,Mon after market,38,2021-05-10
1,Callaway,ELY,Mon after market,24,2021-05-10
2,Affirm,AFRM,Mon after market,12,2021-05-10
3,Hanes,HBI,Tue before market,-11,2021-05-10
4,The RealReal,REAL,Mon after market,-18,2021-05-10


In [27]:
DATA_FOLDER = './data_es'
PATHS = [join(DATA_FOLDER, f) for f in listdir(DATA_FOLDER) if isfile(join(DATA_FOLDER, f))]
earning_prediction_dfs = []
merged_df = None
success_counter = 0
for path in PATHS:
    print(f'Processing {path} ...')
    try:
        earning_prediction_df = earning_prediction_df_from_file(path)
        print(earning_prediction_df.shape)
        if earning_prediction_df.shape[0] > 0: 
            earning_prediction_dfs.append(earning_prediction_df)
            success_counter += 1
        else:
            print(f'[Warning]: Empty dataframe extracted {path}')
    except:
        print(f'Cannot process {path}')
if len(earning_prediction_dfs) > 0:
    merged_df = pd.concat(earning_prediction_dfs)
print(f'Processed {success_counter} files out of {len(PATHS)} raw files')

merged_df

Processing ./data_es\LikeFolioSundayEarningsSheet-02-21-21.pdf ...
(34, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet20200120.pdf ...
(11, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet20210313.pdf ...
(8, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200112_2.pdf ...
(3, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200124.pdf ...
(28, 5)
Processing ./data_es\LikefolioSundayEarningsSheet_20200202.pdf ...
(35, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200209.pdf ...
(23, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200216.pdf ...
(16, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200223.pdf ...
(34, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200301.pdf ...
(17, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200308.pdf ...
(12, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200315.pdf ...
(12, 5)
Processing ./data_es\LikeFolioSundayEarningsSheet_20200412.pdf ...
(5, 5)
Processing ./data_es\LikeFol

Unnamed: 0,Company,Ticker,Earning Time,Earning Score,Starting Week
0,Macy's,M,Tue before market,+29,2021-02-22
1,IHG Hotels,IHG,Tue before market,+20,2021-02-22
2,Home Depot,HD,Tue before market,-7,2021-02-22
3,Crocs,CROX,Tue before market,-34,2021-02-22
4,Arlo Technologies,ARLO,Tue after market,+36,2021-02-22
...,...,...,...,...,...
7,Lands End,LE,Thu before market,-54,2021-08-30
8,Chewy,CHWY,Wed after market,-59,2021-08-30
9,Tilly's,TLYS,Thu after market,+32,2021-08-30
10,Oxford,OXM,Thu after market,-2,2021-08-30


In [28]:
# Further filter to remove data of unconfirmed date
print(f'Before trimming there are {merged_df.shape[0]} rows')
df_trim_unconfirmed = merged_df[merged_df['Earning Time'].apply(len) > 5] 
final_df = df_trim_unconfirmed.drop_duplicates()
print(f'After trimming there are {final_df.shape[0]} rows')

Before trimming there are 1305 rows
After trimming there are 1299 rows


In [29]:
import datetime
from dateutil import parser

def er_date(starting_week_col, weekday_col):
    try:
        week_dt = parser.parse(starting_week_col)
        weekday = weekday_col.split(' ')[0]
        offset_map = {
            'Mon': 0,
            'Tue': 1,
            'Wed': 2,
            'Thu': 3,
            'Fri': 4
        }
        er_dt = week_dt + datetime.timedelta(offset_map[weekday])
        return str(er_dt).split(' ')[0]
    except:
        return ''

In [30]:
er_date('2021-02-22', 'Tue before market')

'2021-02-23'

In [31]:
final_df['Earning Date'] = final_df.apply(lambda x: er_date(x['Starting Week'], x['Earning Time']), axis=1)

In [32]:
PREMARKET = 'PREMARKET'
AFH = 'AFH'

def market_time(earning_time_col):
    if earning_time_col:
        if 'before' in earning_time_col:
            return PREMARKET
        if 'after' in earning_time_col:
            return AFH
    else:
        return ''

In [33]:
final_df['Market Time'] = final_df.apply(lambda x: market_time(x['Earning Time']), axis=1)

In [34]:
# final_df.to_csv('earning_prediction.csv', index=False)

### Install yahoo finance API

In [35]:
# pip install --user yfinance 

In [36]:
import yfinance as yf  

In [37]:
def get_quotes(row):
    ticker, earning_dt, market_time = row['Ticker'], row['Earning Date'], row['Market Time']
    if earning_dt and market_time:
        earning_dt = parser.parse(earning_dt)
        # actual is end_dt - 1, need +1 to offset API requirment
        if market_time == PREMARKET:
            start_dt = earning_dt + datetime.timedelta(-1)
            end_dt = earning_dt + datetime.timedelta(1)
        elif market_time == AFH:
            start_dt = earning_dt
            end_dt = earning_dt + datetime.timedelta(2)
        else:
            raise 
        start_dt = str(start_dt).split(' ')[0]
        end_dt = str(end_dt).split(' ')[0]
    #     print(start_dt, end_dt)
    
        try:
            stock = yf.Ticker(ticker)
            quotes = stock.history(start=start_dt, end=end_dt).round(2)
            start_quotes = quotes.iloc[0]
            end_quotes = quotes.iloc[1]
            row['Left-day Open'] = start_quotes.Open
            row['Right-day Open'] = end_quotes.Open
            row['Left-day Close'] = start_quotes.Close
            row['Right-day Close'] = end_quotes.Close
            row['Left-day High'] = start_quotes.High
            row['Right-day High'] = end_quotes.High
            row['Left-day Low'] = start_quotes.Low
            row['Right-day Low'] = end_quotes.Low
        except:
            pass

    return row

In [42]:
final_df_with_quotes = final_df.apply(lambda x: get_quotes(x), axis=1)
'Done'

- TUES: No data found, symbol may be delisted
- DNKN: No data found, symbol may be delisted
- GRUB: Data doesn't exist for startDate = 1580889600, endDate = 1581062400
- FIT: No data found, symbol may be delisted
- HTZ: No data found, symbol may be delisted
- RLH: No data found, symbol may be delisted
- ZAGG: No data found, symbol may be delisted
- MIK: No data found, symbol may be delisted
- VLKAY: No data found for this date range, symbol may be delisted
- TLRD: No data found, symbol may be delisted
- DNKN: No data found, symbol may be delisted
- MIK: No data found, symbol may be delisted
- VLKAY: No data found for this date range, symbol may be delisted
- DNKN: No data found, symbol may be delisted
- ZAGG: No data found, symbol may be delisted
- RLH: No data found, symbol may be delisted
- MIK: No data found, symbol may be delisted
- FRAN: No data found, symbol may be delisted
- DNKN: No data found, symbol may be delisted
- FIT: No data found, symbol may be delisted
- ZAGG: No data 

'Done'

In [44]:
# final_df_with_quotes.to_csv('earning_prediction_quotes.csv', index=False)

In [45]:
final_df_with_quotes

Unnamed: 0,Company,Earning Date,Earning Score,Earning Time,Left-day Close,Left-day High,Left-day Low,Left-day Open,Market Time,Right-day Close,Right-day High,Right-day Low,Right-day Open,Starting Week,Ticker
0,Macy's,2021-02-23,+29,Tue before market,15.28,15.72,15.15,15.22,PREMARKET,15.88,16.06,13.73,15.31,2021-02-22,M
1,IHG Hotels,2021-02-23,+20,Tue before market,74.12,75.20,72.05,72.33,PREMARKET,74.05,74.99,73.04,74.99,2021-02-22,IHG
2,Home Depot,2021-02-23,-7,Tue before market,271.32,273.92,269.14,273.64,PREMARKET,262.85,265.24,253.35,264.96,2021-02-22,HD
3,Crocs,2021-02-23,-34,Tue before market,83.14,83.98,81.52,82.32,PREMARKET,80.01,82.03,76.00,78.04,2021-02-22,CROX
4,Arlo Technologies,2021-02-23,+36,Tue after market,6.68,6.96,6.08,6.91,AFH,7.52,8.65,7.26,8.32,2021-02-22,ARLO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,Lands End,2021-09-02,-54,Thu before market,34.23,35.12,33.37,34.18,PREMARKET,31.10,32.91,30.12,30.12,2021-08-30,LE
8,Chewy,2021-09-01,-59,Wed after market,87.43,89.05,86.60,88.29,AFH,79.31,81.37,78.50,79.28,2021-08-30,CHWY
9,Tilly's,2021-09-02,+32,Thu after market,15.74,16.12,15.58,15.80,AFH,14.78,15.67,14.55,15.67,2021-08-30,TLYS
10,Oxford,2021-09-02,-2,Thu after market,93.54,94.31,90.51,93.38,AFH,94.15,102.09,93.96,96.35,2021-08-30,OXM
