# Create Combined All Data with NLP Features

In [1]:
# Packages
import pandas as pd
import os
import shutil

## Load NLP Features

In [2]:
# Read '~/Box/STAT 222 Capstone/Intermediate Data/Calls/all_NLP_features.parquet'
# Limit columsn to ticker, earnings_call_date, num_transparency, gf_score, readability, word_count, num_questions, pos_score
all_NLP_features = pd.read_parquet('~/Box/STAT 222 Capstone/Intermediate Data/Calls/all_NLP_features.parquet', columns = ['ticker', 'earnings_call_date', 'num_transparency', 'gf_score', 'readability', 'word_count', 'num_questions', 'pos_score'])
print(all_NLP_features.head())
print('length of all_NLP_features:', len(all_NLP_features))

  ticker earnings_call_date  num_transparency   gf_score  readability  \
0   AAPL         2014-04-23              0.11  12.369779    12.369779   
1   AAPL         2014-07-22              0.10  12.781526    12.781526   
2   AAPL         2014-10-20              0.12  12.537288    12.537288   
3   AAPL         2015-01-27              0.11  11.759278    11.759278   
4   AAPL         2015-04-27              0.11  12.013334    12.013334   

   word_count  num_questions  pos_score  
0        8869             21   0.413608  
1        7587             29   0.340539  
2        9399             26   0.435410  
3        9075             42   0.340825  
4        8548             33   0.482874  
length of all_NLP_features: 7316


In [3]:
# Read '~/Box/STAT 222 Capstone/Intermediate Data/Calls/word_tone.parquet'
word_tone = pd.read_parquet('~/Box/STAT 222 Capstone/Intermediate Data/Calls/word_tone.parquet')
print(word_tone.head())
print('length of word_tone:', len(word_tone))

   Positiv  Negativ  Strong  Weak  Active  Passive  Ovrst  Undrst        PN  \
0      373       48     701    55     573      189    458     179  7.770833   
1      298       54     641    42     495      186    364     131  5.518519   
2      353       66     733    46     656      199    465     152  5.348485   
3      326       83     714    88     591      208    468     151  3.927711   
4      315       60     640    70     565      214    415     135  5.250000   

          SW        AP        OU     TONE1  
0  12.745455  3.031746  2.558659  3.452048  
1  15.261905  2.661290  2.778626  3.188264  
2  15.934783  3.296482  3.059211  3.681858  
3   8.113636  2.841346  3.099338  1.307366  
4   9.142857  2.640187  3.074074  2.025933  
length of word_tone: 7316


In [4]:
# Concatenate all_NLP_features and word_tone
nlp_features_df = pd.concat([all_NLP_features, word_tone], axis = 1)
nlp_features_df

Unnamed: 0,ticker,earnings_call_date,num_transparency,gf_score,readability,word_count,num_questions,pos_score,Positiv,Negativ,...,Weak,Active,Passive,Ovrst,Undrst,PN,SW,AP,OU,TONE1
0,AAPL,2014-04-23,0.11,12.369779,12.369779,8869,21,0.413608,373,48,...,55,573,189,458,179,7.770833,12.745455,3.031746,2.558659,3.452048
1,AAPL,2014-07-22,0.10,12.781526,12.781526,7587,29,0.340539,298,54,...,42,495,186,364,131,5.518519,15.261905,2.661290,2.778626,3.188264
2,AAPL,2014-10-20,0.12,12.537288,12.537288,9399,26,0.435410,353,66,...,46,656,199,465,152,5.348485,15.934783,3.296482,3.059211,3.681858
3,AAPL,2015-01-27,0.11,11.759278,11.759278,9075,42,0.340825,326,83,...,88,591,208,468,151,3.927711,8.113636,2.841346,3.099338,1.307366
4,AAPL,2015-04-27,0.11,12.013334,12.013334,8548,33,0.482874,315,60,...,70,565,214,415,135,5.250000,9.142857,2.640187,3.074074,2.025933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7311,ZTS,2015-08-04,0.09,15.912200,15.912200,8228,12,0.192155,372,103,...,52,623,214,298,148,3.611650,15.634615,2.911215,2.013514,1.744657
7312,ZTS,2015-11-03,0.14,13.175559,13.175559,10625,41,0.110071,501,133,...,79,737,264,395,222,3.766917,15.848101,2.791667,1.779279,1.596294
7313,ZTS,2016-02-16,0.16,12.896152,12.896152,11455,45,0.176927,517,145,...,73,840,287,469,217,3.565517,17.506849,2.926829,2.161290,2.287146
7314,ZTS,2016-05-04,0.12,12.342619,12.342619,10370,33,0.200915,418,117,...,68,765,253,449,215,3.572650,15.235294,3.023715,2.088372,1.739992


In [5]:
# Check uniqueness on ticker and earnings_call_date
print('unique on ticker and earnings_call_date:')
print(len(nlp_features_df) == len(nlp_features_df[['ticker', 'earnings_call_date']].drop_duplicates()))

unique on ticker and earnings_call_date:
True


### More Features - Questions by Call Length, FINBERT Positivity

In [11]:
# Add number of questions divided by call length
nlp_features_df['num_q_by_len'] = nlp_features_df['num_questions'] / nlp_features_df['word_count']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nlp_features_df['num_q_by_len'] = nlp_features_df['num_questions'] / nlp_features_df['word_count']


In [22]:
# Load finbert positivity
finbert_pos = pd.read_parquet('../../../Data/All_Data/All_Data_with_NLP_Finbert.parquet')
print(finbert_pos.columns)
finbert_pos = finbert_pos[['ticker', 'earnings_call_date', 'pos_score_finbert']]
# type of earnings_call_date
print(type(finbert_pos['earnings_call_date'].iloc[0]))
# convert to string
finbert_pos['earnings_call_date'] = finbert_pos['earnings_call_date'].astype(str)
finbert_pos

Index(['ticker', 'fixed_quarter_date', 'earnings_call_date', 'Rating',
       'rating_date', 'num_transparency', 'gf_score', 'word_count',
       'num_questions', 'pos_score', 'Positiv', 'Negativ', 'Strong', 'Weak',
       'Active', 'Passive', 'Ovrst', 'Undrst', 'PN', 'SW', 'AP', 'OU', 'TONE1',
       'pos_score_finbert'],
      dtype='object')
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,ticker,earnings_call_date,pos_score_finbert
0,ROST,2015-08-20,0.613959
1,ROST,2015-11-19,0.577236
2,ROST,2016-05-19,0.323893
3,ROST,2016-08-18,0.612279
4,SAVE,2016-07-29,0.385351
...,...,...,...
562,MTW,2013-07-30,0.571026
563,MTW,2013-10-25,0.449093
564,MTW,2014-01-31,0.645526
565,MTW,2014-05-02,0.645359


In [23]:
# Add finbert positivity to nlp_features_df
nlp_features_df = pd.merge(nlp_features_df, finbert_pos, on = ['ticker', 'earnings_call_date'], how = 'left')
nlp_features_df

Unnamed: 0,ticker,earnings_call_date,num_transparency,gf_score,readability,word_count,num_questions,pos_score,Positiv,Negativ,...,Passive,Ovrst,Undrst,PN,SW,AP,OU,TONE1,num_q_by_len,pos_score_finbert
0,AAPL,2014-04-23,0.11,12.369779,12.369779,8869,21,0.413608,373,48,...,189,458,179,7.770833,12.745455,3.031746,2.558659,3.452048,0.002368,
1,AAPL,2014-07-22,0.10,12.781526,12.781526,7587,29,0.340539,298,54,...,186,364,131,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,0.765917
2,AAPL,2014-10-20,0.12,12.537288,12.537288,9399,26,0.435410,353,66,...,199,465,152,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,0.731819
3,AAPL,2015-01-27,0.11,11.759278,11.759278,9075,42,0.340825,326,83,...,208,468,151,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,0.690750
4,AAPL,2015-04-27,0.11,12.013334,12.013334,8548,33,0.482874,315,60,...,214,415,135,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,0.822168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7310,ZTS,2015-08-04,0.09,15.912200,15.912200,8228,12,0.192155,372,103,...,214,298,148,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458,0.895791
7311,ZTS,2015-11-03,0.14,13.175559,13.175559,10625,41,0.110071,501,133,...,264,395,222,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859,0.929419
7312,ZTS,2016-02-16,0.16,12.896152,12.896152,11455,45,0.176927,517,145,...,287,469,217,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928,0.585873
7313,ZTS,2016-05-04,0.12,12.342619,12.342619,10370,33,0.200915,418,117,...,253,449,215,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182,0.666177


### Sum Stats

In [24]:
# Summary statistics all variables, no scientific notation
pd.options.display.float_format = '{:.2f}'.format
print(nlp_features_df.describe())
# Drop if TONE1 > 25
nlp_features_df = nlp_features_df[nlp_features_df['TONE1'] <= 25]
# Redo sum stats
print(nlp_features_df.describe())

# Check num_q_by_len
# Turn scientific notation back on
pd.options.display.float_format = '{:.2e}'.format
print(nlp_features_df['num_q_by_len'].describe())

# Back to default settings
pd.options.display.float_format = None

       num_transparency  gf_score  readability  word_count  num_questions  \
count           7315.00   7315.00      7315.00     7315.00        7315.00   
mean               0.12     12.55        12.55     8769.63          36.22   
std                0.05      1.31         1.31     2500.11          16.65   
min                0.01      8.55         8.55      525.00           0.00   
25%                0.09     11.64        11.64     7307.00          25.00   
50%                0.11     12.46        12.46     9037.00          35.00   
75%                0.15     13.33        13.33    10285.00          46.00   
max                0.40     19.29        19.29    22147.00         111.00   

       pos_score  Positiv  Negativ  Strong    Weak  ...  Passive   Ovrst  \
count    7315.00  7315.00  7315.00 7315.00 7315.00  ...  7315.00 7315.00   
mean        0.19   332.31   105.35  709.76   96.23  ...   210.74  378.56   
std         0.10   105.56    41.38  226.94   37.46  ...    71.89  118.00   
mi

## Merge with all data file

In [25]:
# list of files in '../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
file_list = [f for f in os.listdir(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates') if f.endswith('.parquet')]
# read in all parquet files
all_data_fixed_quarter_dates = pd.concat([pd.read_parquet(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates/' + f) for f in file_list])
all_data_fixed_quarter_dates

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,...,Ratio_A_diff,Ratio_B_diff,Ratio_C_diff,Ratio_D_diff,Ratio_E_diff,grossProfitRatio_diff,ebitdaratio_diff,operatingIncomeRatio_diff,incomeBeforeTaxRatio_diff,netIncomeRatio_diff
0,AAPL,2014-07-01,2014-04-23,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,,,,,,,,,,
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,-0.019782,-0.053376,-0.026209,-0.034974,-0.036665,0.000469,-0.012459,-0.023107,-0.022640,-0.016974
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,0.001951,0.013472,2.613857,-0.075792,-0.067707,-0.013593,-0.014871,-0.009628,-0.007736,-0.005982
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,9,AA,2015-05-28,AA,2014-05-27,...,0.031136,0.103154,-3.252263,0.015464,-0.004858,0.018625,0.047734,0.059961,0.054951,0.040605
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,9,AA,2015-08-25,AA,2015-05-28,...,-0.009316,-0.062749,3.252263,-0.002312,0.015321,0.009113,-0.001718,-0.009934,-0.007283,-0.007704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,7,BBB,2015-11-03,BBB,2014-01-31,...,0.000338,0.006700,-0.012856,-0.006284,-0.008902,-0.005872,-0.007788,-0.005240,-0.231690,-0.181217
831,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,7,BBB,2016-01-22,BBB,2015-01-30,...,0.006495,0.003489,-0.599780,0.003700,0.020550,0.016617,0.030978,0.028038,0.247855,0.187173
832,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,-0.028536,-0.020573,-0.413072,-0.009082,-0.030337,-0.042537,-0.143885,-0.078861,-0.170716,-0.138415
833,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,0.032007,-0.005091,0.077438,0.028728,0.025617,0.054557,0.214569,0.119659,0.231554,0.158291


In [26]:
# Merge
# Convert earnings_call_date to datetime in both
all_data_fixed_quarter_dates['earnings_call_date'] = pd.to_datetime(all_data_fixed_quarter_dates['earnings_call_date'])
nlp_features_df['earnings_call_date'] = pd.to_datetime(nlp_features_df['earnings_call_date'])
all_data_with_NLP_features = pd.merge(all_data_fixed_quarter_dates, nlp_features_df, how = 'left', on = ['ticker', 'earnings_call_date'])
all_data_with_NLP_features

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,...,Passive,Ovrst,Undrst,PN,SW,AP,OU,TONE1,num_q_by_len,pos_score_finbert
0,AAPL,2014-07-01,2014-04-23,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,189.0,458.0,179.0,7.770833,12.745455,3.031746,2.558659,3.452048,0.002368,
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,186.0,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,0.765917
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,199.0,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,0.731819
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,9,AA,2015-05-28,AA,2014-05-27,...,208.0,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,0.690750
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,9,AA,2015-08-25,AA,2015-05-28,...,214.0,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,0.822168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6673,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,7,BBB,2015-11-03,BBB,2014-01-31,...,214.0,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458,0.895791
6674,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,7,BBB,2016-01-22,BBB,2015-01-30,...,264.0,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859,0.929419
6675,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,287.0,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928,0.585873
6676,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,253.0,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182,0.666177


## Remove extraneous variables

In [27]:
# Remove extraneous variables 'Rating Rank AAA is 10', 'rating_on_previous_fixed_quarter_date AAA is 10', 'readability'
all_data_with_NLP_features = all_data_with_NLP_features.drop(['Rating Rank AAA is 10', 'rating_on_previous_fixed_quarter_date AAA is 10', 'readability'], axis = 1)

## Drop Items Missing Any Required Variables

In [28]:
# Print all columns
for col in all_data_with_NLP_features.columns:
    print(col)
# Output to Excel
all_cols = pd.DataFrame(all_data_with_NLP_features.columns)
all_cols.columns = ['column_name']
all_cols.to_excel('all_data_with_NLP_features_columns_before_drop_of_missing.xlsx', index = False)

ticker
fixed_quarter_date
earnings_call_date
Rating
rating_date
Next Rating
Next Rating Date
Previous Rating
Previous Rating Date
next_rating_date_or_end_of_data
credit_rating_year
previous_fixed_quarter_date
days_since_call_on_fixed_quarter
days_since_rating
for_quarter
for_year
transcript
reportedCurrency
acceptedDate_balance_sheet
cashAndCashEquivalents
shortTermInvestments
cashAndShortTermInvestments
netReceivables
inventory_balance_sheet
otherCurrentAssets
totalCurrentAssets
propertyPlantEquipmentNet
goodwill
intangibleAssets
goodwillAndIntangibleAssets
longTermInvestments
taxAssets
otherNonCurrentAssets
totalNonCurrentAssets
otherAssets
totalAssets
accountPayables
shortTermDebt
taxPayables
deferredRevenue
otherCurrentLiabilities
totalCurrentLiabilities
longTermDebt
deferredRevenueNonCurrent
deferredTaxLiabilitiesNonCurrent
otherNonCurrentLiabilities
totalNonCurrentLiabilities
otherLiabilities
capitalLeaseObligations
totalLiabilities
preferredStock
commonStock
retainedEarnings
acc

The above output was used to construct the Variable Index file.

In [29]:
# Load variable index
variable_index = pd.read_excel('../../../Variable Index.xlsx')
variable_index

Unnamed: 0,column_name,Clean Column Name,Variable Type,Data Type,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
0,Altman_Z,Altman's Z Score,Altman's Z Score,Numeric,,X,,,X,,
1,EBIT,EBIT,Constructed for Altman's Z,Numeric,,,X,X,,X,X
2,common_plus_preferred_stock,Common Plus Preferred Stock,Constructed for Altman's Z,Numeric,,,X,X,,X,X
3,workingCapital,Working Capital,Constructed for Altman's Z,Numeric,,,X,X,,X,X
4,Ratio_A,Ratio A,Constructed for Altman's Z,Numeric,,,X,X,,X,X
...,...,...,...,...,...,...,...,...,...,...,...
165,grossProfitRatio_diff,Difference in Gross Profit Ratio from prior fi...,Change Ratios,Numeric,"Primarily for changes models, but can be used ...",,,,,X,X
166,ebitdaratio_diff,Difference in EBITDA Ratio from prior fixed qu...,Change Ratios,Numeric,"Primarily for changes models, but can be used ...",,,,,X,X
167,operatingIncomeRatio_diff,Difference in Operating Income Ratio from prio...,Change Ratios,Numeric,"Primarily for changes models, but can be used ...",,,,,X,X
168,incomeBeforeTaxRatio_diff,Difference in Income Before Tax Ratio from pri...,Change Ratios,Numeric,"Primarily for changes models, but can be used ...",,,,,X,X


In [30]:
# Get disallowed variables
disallowed_variables = list(variable_index[variable_index['Variable Type'] == 'Disallowed']['column_name'])

# Drop observations missing anything else
for col in all_data_with_NLP_features.columns:
    if col not in disallowed_variables:
        all_data_with_NLP_features_drop = all_data_with_NLP_features[all_data_with_NLP_features[col].notnull()]
        if len(all_data_with_NLP_features_drop) < len(all_data_with_NLP_features):
            print(col, 'missing - dropped ', len(all_data_with_NLP_features) - len(all_data_with_NLP_features_drop))
        all_data_with_NLP_features = all_data_with_NLP_features_drop
all_data_with_NLP_features

totalCurrentAssets missing - dropped  52
totalCurrentLiabilities missing - dropped  5
revenue missing - dropped  118
marketCap missing - dropped  360
rating_on_previous_fixed_quarter_date missing - dropped  442
Altman_Z_diff missing - dropped  177
num_transparency missing - dropped  35


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Passive,Ovrst,Undrst,PN,SW,AP,OU,TONE1,num_q_by_len,pos_score_finbert
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,186.0,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,0.765917
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,199.0,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,0.731819
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,208.0,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,0.690750
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,214.0,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,0.822168
5,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,219.0,449.0,148.0,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915,0.808114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6673,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,BBB,2015-11-03,BBB,2014-01-31,2015-11-03,...,214.0,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458,0.895791
6674,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,BBB,2016-01-22,BBB,2015-01-30,2016-01-22,...,264.0,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859,0.929419
6675,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,287.0,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928,0.585873
6676,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,253.0,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182,0.666177


In [31]:
# min and max of fixed_quarter_date
print('min of fixed_quarter_date:', min(all_data_with_NLP_features['fixed_quarter_date']))
print('max of fixed_quarter_date:', max(all_data_with_NLP_features['fixed_quarter_date']))

min of fixed_quarter_date: 2010-10-01
max of fixed_quarter_date: 2016-10-01


## Stratified Random Sampling on Rating

In [32]:
# Set seed to 2
import random
random.seed(2)
# If column 'train_test_80_20' exists, remove it
if 'train_test_80_20' in all_data_with_NLP_features.columns:
    all_data_with_NLP_features = all_data_with_NLP_features.drop('train_test_80_20', axis = 1)
# Perform stratified random sampling to create new column train_test_80_20
# Split into dataframes by rating
list_of_rating_dfs = [all_data_with_NLP_features[all_data_with_NLP_features['Rating'] == rating] for rating in all_data_with_NLP_features['Rating'].unique()]
rating_dfs_with_split = []
# Iterate over dfs
for rating_df in list_of_rating_dfs:
    # Create column train_test_80_20 with 80% probability of train
    # Create list of random uniforms of length of rating_df
    rand_uniforms = [random.uniform(0, 1) for _ in range(rating_df.shape[0])]
    # Train and test based off of uniforms
    rating_df['train_test_80_20'] = ['train' if x < 0.8 else 'test' for x in rand_uniforms]
    rating_dfs_with_split.append(rating_df)
# Concatenate the dataframes
all_data_with_NLP_features = pd.concat(rating_dfs_with_split).copy()
print('value counts of train_test_80_20 by rating')
print(all_data_with_NLP_features[['Rating', 'train_test_80_20']].value_counts().sort_index())
all_data_with_NLP_features


value counts of train_test_80_20 by rating
Rating  train_test_80_20
A       test                 214
        train                826
AA      test                  42
        train                174
AAA     test                  27
        train                 85
B       test                 138
        train                629
BB      test                 298
        train               1149
BBB     test                 329
        train               1421
C       test                   7
        train                  9
CC      train                  6
CCC     test                  34
        train                 93
D       test                   3
        train                  5
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df['train_test_80_20'] = ['train' if x < 0.8 else 'test' for x in rand_uniforms]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df['train_test_80_20'] = ['train' if x < 0.8 else 'test' for x in rand_uniforms]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df['train_test_80_20']

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Ovrst,Undrst,PN,SW,AP,OU,TONE1,num_q_by_len,pos_score_finbert,train_test_80_20
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,0.765917,test
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,0.731819,test
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,0.690750,train
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,0.822168,train
5,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,449.0,148.0,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915,0.808114,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3739,MHO,2013-10-01,2013-07-25,CC,2013-05-07,CC,2013-11-06,CCC,2012-11-29,2013-11-06,...,180.0,82.0,7.153846,10.225000,2.418033,2.195122,2.140598,0.002420,0.875061,train
3740,MHO,2014-01-01,2013-10-24,CC,2013-11-06,CCC,2014-07-31,CC,2013-05-07,2014-07-31,...,294.0,166.0,4.516129,6.771084,3.039474,1.771084,-0.029287,0.003039,0.472842,train
3741,MHO,2014-04-01,2014-01-29,CC,2013-11-06,CCC,2014-07-31,CC,2013-05-07,2014-07-31,...,325.0,158.0,3.835821,7.078947,3.264706,2.056962,0.084888,0.002848,0.615996,train
3742,MHO,2014-07-01,2014-04-24,CC,2013-11-06,CCC,2014-07-31,CC,2013-05-07,2014-07-31,...,194.0,103.0,3.703704,6.944444,2.590551,1.883495,-0.319794,0.001551,0.382700,train


## Fragment and store on GitHub

In [33]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits dataframes into num_pieces and saves them as parquet files in out_folder. Reduces file size to comply with GitHub limits.
    '''
    # Tracking total length of pieces
    total_len_pieces = 0
    # Create out_folder if it does not exist
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    # Save pieces
    for i in range(num_pieces):
        # start index for piece rows
        start_index = i * len(df) // num_pieces
        # end index for piece rows
        end_index = (i + 1) * len(df) // num_pieces
        # get piece
        piece = df[start_index:end_index]
        piece.to_parquet(out_folder + '/' + dataset_name + '_piece_' + str(i) + '.parquet', index=False)
        #print(len(piece))
        total_len_pieces += len(piece)
    # check total piece length and length of vit_embeddings_df
    print('length check passed')
    print(total_len_pieces == len(df))

In [34]:
split_df(all_data_with_NLP_features, 'All_Data_with_NLP_Features', '../../../Data/All_Data/All_Data_with_NLP_Features', 10)

length check passed
True
