# Create Combined All Data with NLP Features

In [37]:
# Packages
import pandas as pd
import os
import shutil

## Load NLP Features

In [38]:
# Read '../../../Data/NLP Only/NLP_Finbert_only.parquet'
nlp_features_df = pd.read_parquet('../../../Data/NLP Only/NLP_Finbert_only.parquet')
print('col names: ', nlp_features_df.columns)
nlp_features_df

col names:  Index(['ticker', 'Rating', 'Sector', 'pos_score_finbert', 'num_transparency',
       'gf_score', 'word_count', 'num_questions', 'Positiv', 'Negativ',
       'Strong', 'Weak', 'Active', 'Passive', 'Ovrst', 'Undrst', 'PN', 'SW',
       'AP', 'OU', 'tone', 'num_q_by_len', 'len_by_q_count',
       'fixed_quarter_date'],
      dtype='object')


Unnamed: 0,ticker,Rating,Sector,pos_score_finbert,num_transparency,gf_score,word_count,num_questions,Positiv,Negativ,...,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len,len_by_q_count,fixed_quarter_date
0,CENX,BB,Materials,0.326735,0.08,11.057234,7426.0,19.0,278.0,111.0,...,313.0,163.0,2.504505,5.372340,2.778947,1.920245,-1.096219,0.002559,390.842105,2016-07-01
1,CENX,BB,Materials,0.395129,0.06,11.795560,6035.0,10.0,225.0,86.0,...,299.0,125.0,2.616279,6.731343,2.759740,2.392000,-0.280967,0.001657,603.500000,2016-10-01
2,CHDN,BB,Consumer Discretionary,0.515874,0.09,13.912787,5535.0,8.0,231.0,64.0,...,271.0,121.0,3.609375,7.885246,2.196532,2.239669,0.119644,0.001445,691.875000,2015-04-01
3,CHDN,BB,Consumer Discretionary,0.430664,0.06,17.300669,5453.0,8.0,232.0,57.0,...,244.0,114.0,4.070175,10.541667,2.951049,2.140351,0.959579,0.001467,681.625000,2015-07-01
4,CHDN,BB,Consumer Discretionary,0.624336,0.06,12.834639,5464.0,12.0,224.0,52.0,...,212.0,125.0,4.307692,8.466667,2.686275,1.696000,0.102382,0.002196,455.333333,2015-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6649,BW,BBB,Industrials,0.301030,0.09,12.995092,7631.0,37.0,310.0,88.0,...,327.0,210.0,3.522727,4.974359,2.347826,1.557143,-1.365582,0.004849,206.243243,2016-07-01
6650,BWA,BBB,Consumer Discretionary,0.552842,0.19,12.232941,10669.0,37.0,311.0,124.0,...,447.0,233.0,2.508065,9.256098,3.015748,1.918455,-0.290637,0.003468,288.351351,2011-10-01
6651,BWA,A,Consumer Discretionary,0.118580,0.16,9.472823,10093.0,57.0,348.0,93.0,...,505.0,242.0,3.741935,5.542857,2.583333,2.086777,-0.607383,0.005647,177.070175,2015-10-01
6652,BYD,B,Consumer Discretionary,1.082187,0.07,13.309999,7655.0,19.0,338.0,48.0,...,345.0,149.0,7.041667,13.053571,2.254464,2.315436,1.895212,0.002482,402.894737,2016-07-01


In [39]:
# Check uniqueness on ticker and fixed_quarter_date
print('unique on ticker and fixed_quarter_date:')
print(len(nlp_features_df) == len(nlp_features_df[['ticker', 'fixed_quarter_date']].drop_duplicates()))

unique on ticker and fixed_quarter_date:
True


In [40]:
# Drop unneeded variables Rating, Sector (in base data), len_by_q_count (has too many Inf/NaN due to zero questions, and we actually want to keep these calls for analysis)
nlp_features_df = nlp_features_df.drop(columns=['Rating', 'Sector', 'len_by_q_count'])

### Sum Stats

In [41]:
# Summary statistics all variables, no scientific notation
pd.options.display.float_format = '{:.2f}'.format
print(nlp_features_df.describe())
# Drop if tone > 25
nlp_features_df = nlp_features_df[nlp_features_df['tone'] <= 25]
# Redo sum stats
print(nlp_features_df.describe())

# Check num_q_by_len
# Turn scientific notation back on
print('formatting q by len')
pd.options.display.float_format = '{:.2e}'.format
print(nlp_features_df['num_q_by_len'].describe())

# Back to default settings
pd.options.display.float_format = None

       pos_score_finbert  num_transparency  gf_score  word_count  \
count            6654.00           6654.00   6654.00     6654.00   
mean                0.54              0.12     12.53     8797.48   
std                 0.26              0.05      1.32     2504.74   
min                -0.29              0.01      8.55      525.00   
25%                 0.36              0.09     11.61     7328.25   
50%                 0.53              0.12     12.44     9046.50   
75%                 0.71              0.15     13.33    10319.75   
max                 1.61              0.40     19.29    22147.00   

       num_questions  Positiv  Negativ  Strong    Weak  Active  Passive  \
count        6654.00  6654.00  6654.00 6654.00 6654.00 6654.00  6654.00   
mean           36.49   332.34   105.58  710.24   96.48  612.07   210.77   
std            16.62   105.62    41.47  227.12   37.68  181.84    72.15   
min             0.00    21.00     7.00   40.00    6.00   29.00     8.00   
25%         

## Merge with all data file

In [42]:
# list of files in '../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
file_list = [f for f in os.listdir(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates') if f.endswith('.parquet')]
# read in all parquet files
all_data_fixed_quarter_dates = pd.concat([pd.read_parquet(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates/' + f) for f in file_list])
all_data_fixed_quarter_dates

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,...,debtRatio_diff,debtRatioAlt_diff,debtEquityRatio_diff,equityMultiplier_diff,enterpriseValueMultiplier_diff,operatingCashFlowPerShare_diff,freeCashFlowPerShare_diff,cashPerShare_diff,operatingCashFlowToSales_diff,freeCashFlowToOperatingCashFlow_diff
0,AAPL,2014-07-01,2014-04-23,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,,,,,,,,,,
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,0.000000,-0.759365,-24.558873,0.125902,0.000000,-0.126331,-0.166740,-0.234071,-0.022623,-0.127290
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,0.000000,0.062362,0.238477,0.238477,0.000000,0.131845,0.070634,0.043692,0.040592,-0.053662
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,9,AA,2015-05-28,AA,2014-05-27,...,0.000000,0.010232,0.045159,0.045159,0.000000,0.884579,0.907173,0.250114,0.137489,0.193896
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,9,AA,2015-08-25,AA,2015-05-28,...,-0.769818,-0.023001,-0.098891,-0.098891,0.000000,-0.619480,-0.586623,-0.208184,-0.123118,-0.032942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,7,BBB,2015-11-03,BBB,2014-01-31,...,-0.014131,0.020778,0.578263,0.579245,1.096292,0.124190,0.122016,-0.027820,0.049383,0.372951
830,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,7,BBB,2016-01-22,BBB,2015-01-30,...,-0.009219,-0.005380,-0.168582,-0.170349,-23.366204,0.164706,0.152516,0.098181,0.064210,0.122147
831,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,0.071666,0.049682,1.967787,1.968981,95.485237,0.149307,0.090904,1.130204,0.050171,-0.036465
832,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,-0.015700,-0.021390,-1.009100,-1.011751,-103.258245,-0.455396,-0.383304,-0.958949,-0.174321,-0.590986


In [43]:
# Merge
all_data_with_NLP_features = pd.merge(all_data_fixed_quarter_dates, nlp_features_df, how = 'left', on = ['ticker', 'fixed_quarter_date'])
all_data_with_NLP_features

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,...,Active,Passive,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len
0,AAPL,2014-07-01,2014-04-23,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,573.0,189.0,458.0,179.0,7.770833,12.745455,3.031746,2.558659,2.629566,0.002368
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,495.0,186.0,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,9,AA,2015-02-18,AAA,2014-04-24,...,656.0,199.0,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,9,AA,2015-05-28,AA,2014-05-27,...,591.0,208.0,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,9,AA,2015-08-25,AA,2015-05-28,...,565.0,214.0,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6665,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,7,BBB,2015-11-03,BBB,2014-01-31,...,623.0,214.0,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458
6666,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,7,BBB,2016-01-22,BBB,2015-01-30,...,737.0,264.0,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859
6667,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,840.0,287.0,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928
6668,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,7,BBB,2016-12-23,BBB,2015-11-03,...,765.0,253.0,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182


## Remove extraneous variables

In [44]:
# Remove extraneous variables 'Rating Rank AAA is 10', 'rating_on_previous_fixed_quarter_date AAA is 10', 'readability'
all_data_with_NLP_features = all_data_with_NLP_features.drop(['Rating Rank AAA is 10', 'rating_on_previous_fixed_quarter_date AAA is 10'], axis = 1)

## Drop Items Missing Any Required Variables

In [45]:
# Print all columns
for col in all_data_with_NLP_features.columns:
    print(col)
# Output to Excel
all_cols = pd.DataFrame(all_data_with_NLP_features.columns)
all_cols.columns = ['column_name']
all_cols.to_excel('all_data_with_NLP_features_columns_before_drop_of_missing.xlsx', index = False)

ticker
fixed_quarter_date
earnings_call_date
Rating
rating_date
Next Rating
Next Rating Date
Previous Rating
Previous Rating Date
next_rating_date_or_end_of_data
credit_rating_year
previous_fixed_quarter_date
days_since_call_on_fixed_quarter
days_since_rating
for_quarter
for_year
transcript
reportedCurrency
acceptedDate_balance_sheet
cashAndCashEquivalents
shortTermInvestments
cashAndShortTermInvestments
netReceivables
inventory_balance_sheet
otherCurrentAssets
totalCurrentAssets
propertyPlantEquipmentNet
goodwill
intangibleAssets
goodwillAndIntangibleAssets
longTermInvestments
taxAssets
otherNonCurrentAssets
totalNonCurrentAssets
otherAssets
totalAssets
accountPayables
shortTermDebt
taxPayables
deferredRevenue
otherCurrentLiabilities
totalCurrentLiabilities
longTermDebt
deferredRevenueNonCurrent
deferredTaxLiabilitiesNonCurrent
otherNonCurrentLiabilities
totalNonCurrentLiabilities
otherLiabilities
capitalLeaseObligations
totalLiabilities
preferredStock
commonStock
retainedEarnings
acc

The above output was used to construct the Variable Index file.

In [46]:
# Load variable index
variable_index = pd.read_excel('../../../Variable Index.xlsx')
variable_index

Unnamed: 0,column_name,Clean Column Name,Variable Type,Data Type,Ratio?,Notes,Rating Model 1,Rating Model 2,Rating Model 3,Change Model 1,Change Model 2,Change Model 3
0,Altman_Z,Altman's Z Score,Altman's Z Score,Numeric,Y,,X,,,X,,
1,EBIT,EBIT,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
2,common_plus_preferred_stock,Common Plus Preferred Stock,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
3,workingCapital,Working Capital,Constructed for Altman's Z,Numeric,,,,X,X,,X,X
4,Ratio_A,Ratio A,Constructed for Altman's Z,Numeric,Y,,,X,X,,X,X
...,...,...,...,...,...,...,...,...,...,...,...,...
201,operatingCashFlowPerShare_diff,Difference in Operating Cash Flow Per Share fr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
202,freeCashFlowPerShare_diff,Difference in Free Cash Flow Per Share from pr...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
203,cashPerShare_diff,Difference in Cash Per Share from prior fixed ...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X
204,operatingCashFlowToSales_diff,Difference in Operating Cash Flow to Sales fro...,Additional Change Ratios,Numeric,,"Primarily for changes models, but can be used ...",,,,,X,X


In [47]:
# Get disallowed variables
disallowed_variables = list(variable_index[variable_index['Variable Type'] == 'Disallowed']['column_name'])

# Drop observations missing anything else
for col in all_data_with_NLP_features.columns:
    if col not in disallowed_variables:
        all_data_with_NLP_features_drop = all_data_with_NLP_features[all_data_with_NLP_features[col].notnull()]
        if len(all_data_with_NLP_features_drop) < len(all_data_with_NLP_features):
            print(col, 'missing - dropped ', len(all_data_with_NLP_features) - len(all_data_with_NLP_features_drop))
        all_data_with_NLP_features = all_data_with_NLP_features_drop
all_data_with_NLP_features

totalCurrentAssets missing - dropped  52
totalCurrentLiabilities missing - dropped  5
revenue missing - dropped  118
marketCap missing - dropped  354
Altman_Z_diff missing - dropped  619
pos_score_finbert missing - dropped  13


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Active,Passive,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,495.0,186.0,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,656.0,199.0,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,591.0,208.0,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,565.0,214.0,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861
5,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,565.0,219.0,449.0,148.0,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6665,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,BBB,2015-11-03,BBB,2014-01-31,2015-11-03,...,623.0,214.0,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458
6666,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,BBB,2016-01-22,BBB,2015-01-30,2016-01-22,...,737.0,264.0,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859
6667,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,840.0,287.0,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928
6668,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,765.0,253.0,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182


In [48]:
# min and max of fixed_quarter_date
print('min of fixed_quarter_date:', min(all_data_with_NLP_features['fixed_quarter_date']))
print('max of fixed_quarter_date:', max(all_data_with_NLP_features['fixed_quarter_date']))

min of fixed_quarter_date: 2010-10-01
max of fixed_quarter_date: 2016-10-01


## Stratified Random Sampling on Rating for Train-Test Split

In [49]:
# Set seed
import random
random.seed(3)
# If column 'train_test_80_20' exists, remove it
if 'train_test_80_20' in all_data_with_NLP_features.columns:
    all_data_with_NLP_features = all_data_with_NLP_features.drop('train_test_80_20', axis = 1)
# Perform stratified random sampling to create new column train_test_80_20
# Split into dataframes by rating
list_of_rating_dfs = [all_data_with_NLP_features[all_data_with_NLP_features['Rating'] == rating] for rating in all_data_with_NLP_features['Rating'].unique()]
rating_dfs_with_split = []
# Iterate over dfs
for rating_df in list_of_rating_dfs:
    # Create column train_test_80_20 with 80% probability of train
    # Create list of random uniforms of length of rating_df
    rand_uniforms = [random.uniform(0, 1) for _ in range(rating_df.shape[0])]
    # Train and test based off of uniforms
    rating_df['train_test_80_20'] = ['train' if x < 0.8 else 'test' for x in rand_uniforms]
    rating_dfs_with_split.append(rating_df)
# Concatenate the dataframes
all_data_with_NLP_features = pd.concat(rating_dfs_with_split).copy()
print('value counts of train_test_80_20 by rating')
print(all_data_with_NLP_features[['Rating', 'train_test_80_20']].value_counts().sort_index())
all_data_with_NLP_features


value counts of train_test_80_20 by rating
Rating  train_test_80_20
A       test                 208
        train                833
AA      test                  52
        train                164
AAA     test                  24
        train                 88
B       test                 154
        train                613
BB      test                 284
        train               1173
BBB     test                 363
        train               1396
C       test                   4
        train                 12
CC      test                   2
        train                  4
CCC     test                  26
        train                101
D       test                   1
        train                  7
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df['train_test_80_20'] = ['train' if x < 0.8 else 'test' for x in rand_uniforms]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df['train_test_80_20'] = ['train' if x < 0.8 else 'test' for x in rand_uniforms]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df['train_test_80_20']

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Passive,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len,train_test_80_20
1,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,186.0,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,train
2,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,199.0,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,train
3,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,208.0,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,train
4,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,214.0,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,train
5,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,219.0,449.0,148.0,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3737,MHO,2013-10-01,2013-07-25,CC,2013-05-07,CC,2013-11-06,CCC,2012-11-29,2013-11-06,...,122.0,180.0,82.0,7.153846,10.225000,2.418033,2.195122,2.140598,0.002420,train
3738,MHO,2014-01-01,2013-10-24,CC,2013-11-06,CCC,2014-07-31,CC,2013-05-07,2014-07-31,...,152.0,294.0,166.0,4.516129,6.771084,3.039474,1.771084,-0.029287,0.003039,train
3739,MHO,2014-04-01,2014-01-29,CC,2013-11-06,CCC,2014-07-31,CC,2013-05-07,2014-07-31,...,136.0,325.0,158.0,3.835821,7.078947,3.264706,2.056962,0.084888,0.002848,test
3740,MHO,2014-07-01,2014-04-24,CC,2013-11-06,CCC,2014-07-31,CC,2013-05-07,2014-07-31,...,127.0,194.0,103.0,3.703704,6.944444,2.590551,1.883495,-0.319794,0.001551,test


## Fragment and store on GitHub

In [50]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits dataframes into num_pieces and saves them as parquet files in out_folder. Reduces file size to comply with GitHub limits.
    '''
    # Tracking total length of pieces
    total_len_pieces = 0
    # Create out_folder if it does not exist
    if not os.path.exists(out_folder):
        os.makedirs(out_folder)
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    # Save pieces
    for i in range(num_pieces):
        # start index for piece rows
        start_index = i * len(df) // num_pieces
        # end index for piece rows
        end_index = (i + 1) * len(df) // num_pieces
        # get piece
        piece = df[start_index:end_index]
        piece.to_parquet(out_folder + '/' + dataset_name + '_piece_' + str(i) + '.parquet', index=False)
        #print(len(piece))
        total_len_pieces += len(piece)
    # check total piece length and length of vit_embeddings_df
    print('length check passed')
    print(total_len_pieces == len(df))

In [51]:
split_df(all_data_with_NLP_features, 'All_Data_with_NLP_Features', '../../../Data/All_Data/All_Data_with_NLP_Features', 10)

length check passed
True
