# Create Combined All Data with NLP Features

In [1]:
# Packages
import pandas as pd
import os
import shutil

## Load NLP Features

In [2]:
# Load the call-level NLP features from
# '~/Box/STAT 222 Capstone/Intermediate Data/Calls/all_NLP_features.parquet'.
# Only the key columns (ticker, earnings_call_date) and the listed feature columns are read.
# NOTE(review): in the preview printed by this cell, gf_score and readability show identical
# values on every displayed row -- confirm upstream that these are two distinct features.
nlp_feature_columns = [
    'ticker', 'earnings_call_date',
    'num_transparency', 'gf_score', 'readability',
    'word_count', 'num_questions', 'pos_score',
]
all_NLP_features = pd.read_parquet(
    '~/Box/STAT 222 Capstone/Intermediate Data/Calls/all_NLP_features.parquet',
    columns=nlp_feature_columns,
)
print(all_NLP_features.head())
print('length of all_NLP_features:', len(all_NLP_features))

  ticker earnings_call_date  num_transparency   gf_score  readability  \
0   AAPL         2014-04-23              0.11  12.369779    12.369779   
1   AAPL         2014-07-22              0.10  12.781526    12.781526   
2   AAPL         2014-10-20              0.12  12.537288    12.537288   
3   AAPL         2015-01-27              0.11  11.759278    11.759278   
4   AAPL         2015-04-27              0.11  12.013334    12.013334   

   word_count  num_questions  pos_score  
0        8869             21   0.413608  
1        7587             29   0.340539  
2        9399             26   0.435410  
3        9075             42   0.340825  
4        8548             33   0.482874  
length of all_NLP_features: 7316


In [3]:
# Load the word-tone table. The preview printed by this cell shows only tone columns
# (no ticker / earnings_call_date), so rows can only be matched to calls by position.
word_tone_path = '~/Box/STAT 222 Capstone/Intermediate Data/Calls/word_tone.parquet'
word_tone = pd.read_parquet(word_tone_path)
print(word_tone.head())
print('length of word_tone:', len(word_tone))

   Positiv  Negativ  Strong  Weak  Active  Passive  Ovrst  Undrst        PN  \
0      373       48     701    55     573      189    458     179  7.770833   
1      298       54     641    42     495      186    364     131  5.518519   
2      353       66     733    46     656      199    465     152  5.348485   
3      326       83     714    88     591      208    468     151  3.927711   
4      315       60     640    70     565      214    415     135  5.250000   

          SW        AP        OU     TONE1  
0  12.745455  3.031746  2.558659  3.452048  
1  15.261905  2.661290  2.778626  3.188264  
2  15.934783  3.296482  3.059211  3.681858  
3   8.113636  2.841346  3.099338  1.307366  
4   9.142857  2.640187  3.074074  2.025933  
length of word_tone: 7316


In [4]:
# Concatenate all_NLP_features and word_tone side by side.
# pd.concat with axis=1 pairs rows by index label, and word_tone (as previewed) has no
# ticker / earnings_call_date columns to join on. If the two tables ever differ in length
# or index, concat would pad with NaN rather than fail, so check both explicitly first.
assert len(all_NLP_features) == len(word_tone), \
    'all_NLP_features and word_tone have different numbers of rows'
assert all_NLP_features.index.equals(word_tone.index), \
    'all_NLP_features and word_tone are indexed differently'
# NOTE(review): this still assumes both files were written in the same row order -- confirm upstream.
nlp_features_df = pd.concat([all_NLP_features, word_tone], axis = 1)
nlp_features_df

Unnamed: 0,ticker,earnings_call_date,num_transparency,gf_score,readability,word_count,num_questions,pos_score,Positiv,Negativ,...,Weak,Active,Passive,Ovrst,Undrst,PN,SW,AP,OU,TONE1
0,AAPL,2014-04-23,0.11,12.369779,12.369779,8869,21,0.413608,373,48,...,55,573,189,458,179,7.770833,12.745455,3.031746,2.558659,3.452048
1,AAPL,2014-07-22,0.10,12.781526,12.781526,7587,29,0.340539,298,54,...,42,495,186,364,131,5.518519,15.261905,2.661290,2.778626,3.188264
2,AAPL,2014-10-20,0.12,12.537288,12.537288,9399,26,0.435410,353,66,...,46,656,199,465,152,5.348485,15.934783,3.296482,3.059211,3.681858
3,AAPL,2015-01-27,0.11,11.759278,11.759278,9075,42,0.340825,326,83,...,88,591,208,468,151,3.927711,8.113636,2.841346,3.099338,1.307366
4,AAPL,2015-04-27,0.11,12.013334,12.013334,8548,33,0.482874,315,60,...,70,565,214,415,135,5.250000,9.142857,2.640187,3.074074,2.025933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7311,ZTS,2015-08-04,0.09,15.912200,15.912200,8228,12,0.192155,372,103,...,52,623,214,298,148,3.611650,15.634615,2.911215,2.013514,1.744657
7312,ZTS,2015-11-03,0.14,13.175559,13.175559,10625,41,0.110071,501,133,...,79,737,264,395,222,3.766917,15.848101,2.791667,1.779279,1.596294
7313,ZTS,2016-02-16,0.16,12.896152,12.896152,11455,45,0.176927,517,145,...,73,840,287,469,217,3.565517,17.506849,2.926829,2.161290,2.287146
7314,ZTS,2016-05-04,0.12,12.342619,12.342619,10370,33,0.200915,418,117,...,68,765,253,449,215,3.572650,15.235294,3.023715,2.088372,1.739992


In [5]:
# Confirm that no (ticker, earnings_call_date) pair appears on more than one row
print('unique on ticker and earnings_call_date:')
has_repeated_key = nlp_features_df.duplicated(subset=['ticker', 'earnings_call_date']).any()
print(not has_repeated_key)

unique on ticker and earnings_call_date:
True


## Merge with all data file

In [6]:
# Directory holding the parquet pieces of the fixed-quarter-date data
all_data_dir = r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
# os.listdir returns entries in arbitrary order, so sort the names to make the row order
# of the concatenated frame reproducible across machines and runs.
# The sort is lexicographic: a name ending in '_10' sorts before one ending in '_2'.
file_list = sorted(f for f in os.listdir(all_data_dir) if f.endswith('.parquet'))
# read in all parquet files and stack them (each file's own index labels are kept)
all_data_fixed_quarter_dates = pd.concat([pd.read_parquet(os.path.join(all_data_dir, f)) for f in file_list])
all_data_fixed_quarter_dates

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,...,Ratio_E,Altman_Z,filingDate,rating_on_previous_fixed_quarter_date,Investment_Grade,rating_on_previous_fixed_quarter_date AAA is 10,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Sector,train_test_80_20
0,AAPL,2014-07-01,2014-04-23,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,0.480288,4.551851,2014-04-24,,True,,,,Information Technology,train
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,0.443623,4.324703,2014-07-23,AA,True,9.0,Same As Last Fixed Quarter Date,0.0,Information Technology,train
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,0.375916,5.727053,2014-10-27,AA,True,9.0,Same As Last Fixed Quarter Date,0.0,Information Technology,train
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,9,AA,2015-05-28,AA,2014-05-27,...,0.371059,3.992323,2015-01-28,AA,True,9.0,Same As Last Fixed Quarter Date,0.0,Information Technology,train
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,9,AA,2015-08-25,AA,2015-05-28,...,0.386379,5.869492,2015-04-28,AA,True,9.0,Same As Last Fixed Quarter Date,0.0,Information Technology,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
831,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,7,BBB,2015-11-03,BBB,2014-01-31,...,0.120491,3.500267,2015-08-06,BBB,True,7.0,Same As Last Fixed Quarter Date,0.0,Health Care,train
832,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,7,BBB,2016-01-22,BBB,2015-01-30,...,0.141041,3.198497,2015-11-05,BBB,True,7.0,Same As Last Fixed Quarter Date,0.0,Health Care,train
833,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,0.110704,2.782747,2016-02-24,BBB,True,7.0,Same As Last Fixed Quarter Date,0.0,Health Care,test
834,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,0.136321,3.000131,2016-05-06,BBB,True,7.0,Same As Last Fixed Quarter Date,0.0,Health Care,train


In [7]:
# Merge the NLP features onto the fixed-quarter-date data.
# Convert earnings_call_date to datetime in both frames so the join key has the same dtype on each side
all_data_fixed_quarter_dates['earnings_call_date'] = pd.to_datetime(all_data_fixed_quarter_dates['earnings_call_date'])
nlp_features_df['earnings_call_date'] = pd.to_datetime(nlp_features_df['earnings_call_date'])
# Left join: every row of all_data_fixed_quarter_dates is kept; rows without a matching call get NaN features.
# validate='many_to_one' makes pandas raise a MergeError if (ticker, earnings_call_date) is not unique
# in nlp_features_df, which would otherwise silently duplicate rows of the left table.
all_data_with_NLP_features = pd.merge(
    all_data_fixed_quarter_dates,
    nlp_features_df,
    how = 'left',
    on = ['ticker', 'earnings_call_date'],
    validate = 'many_to_one',
)
all_data_with_NLP_features

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,...,Weak,Active,Passive,Ovrst,Undrst,PN,SW,AP,OU,TONE1
0,AAPL,2014-07-01,2014-04-23,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,55.0,573.0,189.0,458.0,179.0,7.770833,12.745455,3.031746,2.558659,3.452048
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,42.0,495.0,186.0,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,46.0,656.0,199.0,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,9,AA,2015-05-28,AA,2014-05-27,...,88.0,591.0,208.0,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,9,AA,2015-08-25,AA,2015-05-28,...,70.0,565.0,214.0,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6678,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,7,BBB,2015-11-03,BBB,2014-01-31,...,52.0,623.0,214.0,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657
6679,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,7,BBB,2016-01-22,BBB,2015-01-30,...,79.0,737.0,264.0,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294
6680,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,73.0,840.0,287.0,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146
6681,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,68.0,765.0,253.0,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992


## Drop Items Missing Any Required Variables

In [9]:
# Show every column name of the merged frame, one per line
merged_columns = all_data_with_NLP_features.columns
for column_name in merged_columns:
    print(column_name)
# Save the same list as a single-column Excel sheet named 'column_name'
all_cols = pd.DataFrame({'column_name': merged_columns})
all_cols.to_excel('all_data_with_NLP_features_columns_before_drop_of_missing.xlsx', index = False)

ticker
fixed_quarter_date
earnings_call_date
Rating
rating_date
Rating Rank AAA is 10
Next Rating
Next Rating Date
Previous Rating
Previous Rating Date
next_rating_date_or_end_of_data
credit_rating_year
previous_fixed_quarter_date
days_since_call_on_fixed_quarter
days_since_rating
for_quarter
for_year
transcript
reportedCurrency
acceptedDate_balance_sheet
cashAndCashEquivalents
shortTermInvestments
cashAndShortTermInvestments
netReceivables
inventory_balance_sheet
otherCurrentAssets
totalCurrentAssets
propertyPlantEquipmentNet
goodwill
intangibleAssets
goodwillAndIntangibleAssets
longTermInvestments
taxAssets
otherNonCurrentAssets
totalNonCurrentAssets
otherAssets
totalAssets
accountPayables
shortTermDebt
taxPayables
deferredRevenue
otherCurrentLiabilities
totalCurrentLiabilities
longTermDebt
deferredRevenueNonCurrent
deferredTaxLiabilitiesNonCurrent
otherNonCurrentLiabilities
totalNonCurrentLiabilities
otherLiabilities
capitalLeaseObligations
totalLiabilities
preferredStock
commonStoc

In [78]:
# Columns that are not used and may therefore be missing without dropping the row
# Potentially don't use 'Previous Rating', 'Previous Rating Date', 'previous_rating_date_or_start_of_data'
# either, since requiring them leads to a lot of loss
cols_allowed_missing = ('Next Rating', 'Next Rating Date', 'next_rating_date_or_end_of_data')
# Walk through the remaining columns in order and drop the rows where that column is null.
# The filter is applied cumulatively, so each printed count is the number of additional rows
# lost to that column after all earlier columns have already been filtered.
for col in all_data_with_NLP_features.columns:
    if col in cols_allowed_missing:
        continue
    rows_before = len(all_data_with_NLP_features)
    all_data_with_NLP_features_drop = all_data_with_NLP_features[all_data_with_NLP_features[col].notnull()]
    rows_lost = rows_before - len(all_data_with_NLP_features_drop)
    if rows_lost > 0:
        print(col, 'missing - dropped ', rows_lost)
    all_data_with_NLP_features = all_data_with_NLP_features_drop
all_data_with_NLP_features

Previous Rating missing - dropped  1393
totalCurrentAssets missing - dropped  45
totalCurrentLiabilities missing - dropped  5
revenue missing - dropped  93
marketCap missing - dropped  275
rating_on_previous_fixed_quarter_date missing - dropped  116
num_transparency missing - dropped  32


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,...,Weak,Active,Passive,Ovrst,Undrst,PN,SW,AP,OU,TONE1
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,42.0,495.0,186.0,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,46.0,656.0,199.0,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,9,AA,2015-05-28,AA,2014-05-27,...,88.0,591.0,208.0,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,9,AA,2015-08-25,AA,2015-05-28,...,70.0,565.0,214.0,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933
5,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,9,AA,2016-05-20,AA,2015-06-02,...,70.0,565.0,219.0,449.0,148.0,4.209877,10.442857,2.579909,3.033784,1.815531
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6678,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,7,BBB,2015-11-03,BBB,2014-01-31,...,52.0,623.0,214.0,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657
6679,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,7,BBB,2016-01-22,BBB,2015-01-30,...,79.0,737.0,264.0,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294
6680,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,73.0,840.0,287.0,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146
6681,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,68.0,765.0,253.0,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992


In [79]:
# Report the earliest and latest fixed_quarter_date among the remaining rows
fixed_quarter_dates = all_data_with_NLP_features['fixed_quarter_date']
print('min of fixed_quarter_date:', min(fixed_quarter_dates))
print('max of fixed_quarter_date:', max(fixed_quarter_dates))

min of fixed_quarter_date: 2011-01-01
max of fixed_quarter_date: 2016-10-01


## Fragment and store on GitHub

In [80]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits a dataframe row-wise into num_pieces consecutive chunks and saves each chunk as a
    parquet file in out_folder. Reduces file size to comply with GitHub limits.

    Parameters:
        df: dataframe to split; rows are taken by position, the index is not written out.
        dataset_name: prefix of the output files, which are named
            '<dataset_name>_piece_<i>.parquet' for i in 0..num_pieces-1.
        out_folder: directory to write to. It is created if missing.
            WARNING: everything already inside it (files, links, sub-directories) is deleted first.
        num_pieces: number of pieces to write; must be an integer >= 1.

    Raises:
        TypeError: if num_pieces is not an integer.
        ValueError: if num_pieces is less than 1.
    Both are raised before out_folder is touched, so a bad call cannot wipe previous pieces.
    '''
    # Validate up front: range() rejects non-integers, and zero or negative piece counts
    # would otherwise delete the old pieces and then write nothing.
    piece_numbers = range(num_pieces)
    if num_pieces < 1:
        raise ValueError('num_pieces must be at least 1, got %s' % (num_pieces,))
    # Tracking total length of pieces
    total_len_pieces = 0
    # Create out_folder if it does not exist
    os.makedirs(out_folder, exist_ok=True)
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report and continue so one undeletable entry does not abort the export
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    # Save pieces
    num_rows = len(df)
    for i in piece_numbers:
        # Integer arithmetic makes consecutive pieces share boundaries: piece i ends exactly
        # where piece i + 1 starts, the first starts at 0 and the last ends at num_rows.
        start_index = i * num_rows // num_pieces
        end_index = (i + 1) * num_rows // num_pieces
        # iloc makes the positional slicing explicit regardless of the index type
        piece = df.iloc[start_index:end_index]
        piece.to_parquet(os.path.join(out_folder, dataset_name + '_piece_' + str(i) + '.parquet'), index=False)
        total_len_pieces += len(piece)
    # Check that the pieces together contain as many rows as df, and report the actual outcome
    lengths_match = total_len_pieces == num_rows
    print('length check passed' if lengths_match else 'length check FAILED')
    print(lengths_match)

In [81]:
# Write the merged dataset as 10 parquet pieces
split_df(
    df=all_data_with_NLP_features,
    dataset_name='All_Data_with_NLP_Features',
    out_folder='../../../Data/All_Data/All_Data_with_NLP_Features',
    num_pieces=10,
)

length check passed
True
