# Create Combined All Data - Fixed Quarter Dates

1. credit_ratings_on_fixed_quarter_dates_with_earnings_call_date.csv  
        * Columns: rating, symbol, rating agency, rating_date, fixed_quarter_date, ...  
        * Size: 7981 × 16  
        * 587 unique companies
2. combined_calls.parquet  
        * Columns: symbol, quarter, year, earnings_call_datetime, content, source, web, earnings_call_date  
        * Size: 31067 x 8  
        * 1646 unique companies  
3. combined_corrected_tabular_financial_statements_data.parquet  
        * Columns: date, symbol, reportedCurrency, period, filing_date, financial variables, Altman_Z ...  
        * Size: 20825 x 134  
        * 796 unique companies  

In [73]:
# Whether this is a sample/debug run or not.
# When True, the earnings-call load is meant to read only a small sample, and the
# sample exports and GitHub piece exports further down are skipped.
DEBUG = False

In [74]:
# Packages
import pandas as pd
import openpyxl

In [75]:
# Read the credit ratings aligned to fixed quarter dates (with the matched earnings call date)
cr_and_dates_path = r'~\Box\STAT 222 Capstone\Intermediate Data\Credit_Rating\credit_ratings_on_fixed_quarter_dates_with_earnings_call_date.csv'
cr_and_dates = pd.read_csv(cr_and_dates_path)

In [76]:
# Remove the ratings file's `source` column; the calls data joined later brings its own `source`
cr_and_dates = cr_and_dates.drop(columns='source')

In [77]:
# Show every row whose (ticker, fixed_quarter_date) key occurs more than once;
# an empty frame means the key is unique
key_is_repeated = cr_and_dates.duplicated(subset=['ticker', 'fixed_quarter_date'], keep=False)
print(cr_and_dates.loc[key_is_repeated])

Empty DataFrame
Columns: [ticker, fixed_quarter_date, earnings_call_date, Rating, Rating Agency Name, rating_date, Source, Rating Rank AAA is 10, Next Rating, Next Rating Date, Previous Rating, Previous Rating Date, next_rating_date_or_end_of_data, credit_rating_year, previous_fixed_quarter_date, days_since_call]
Index: []


In [78]:
# Load the combined earnings-call transcripts from
# ~\Box\STAT 222 Capstone\Intermediate Data\Calls\combined_calls.parquet
calls = pd.read_parquet(r'~\Box\STAT 222 Capstone\Intermediate Data\Calls\combined_calls.parquet')
if DEBUG:
    # pd.read_parquet has no `nrows` option (unknown keyword arguments are forwarded to
    # the parquet engine and rejected there), so take the debug sample after loading.
    calls = calls.head(10)

In [79]:
# Show every call whose (symbol, earnings_call_date) key occurs more than once;
# an empty frame means each company has at most one call per date
key_is_repeated = calls.duplicated(subset=['symbol', 'earnings_call_date'], keep=False)
print(calls.loc[key_is_repeated])

Empty DataFrame
Columns: [symbol, quarter, year, earnings_call_datetime, content, source, web, earnings_call_date]
Index: []


# Earnings Call - Credit Score Merge

In [80]:
# Column names of both inputs to the upcoming merge (calls first, then ratings)
display(calls.columns, cr_and_dates.columns)

Index(['symbol', 'quarter', 'year', 'earnings_call_datetime', 'content',
       'source', 'web', 'earnings_call_date'],
      dtype='object')

Index(['ticker', 'fixed_quarter_date', 'earnings_call_date', 'Rating',
       'Rating Agency Name', 'rating_date', 'Source', 'Rating Rank AAA is 10',
       'Next Rating', 'Next Rating Date', 'Previous Rating',
       'Previous Rating Date', 'next_rating_date_or_end_of_data',
       'credit_rating_year', 'previous_fixed_quarter_date', 'days_since_call'],
      dtype='object')

In [81]:
# Ratings side: reduce earnings_call_date to a plain date (no time component)
call_timestamps = pd.to_datetime(cr_and_dates['earnings_call_date'])
cr_and_dates['earnings_call_date'] = call_timestamps.dt.date
# Ratings side: give days_since_call and Source names that stay unambiguous after the merge
cr_and_dates = cr_and_dates.rename(
    columns={'days_since_call': 'days_since_call_on_fixed_quarter', 'Source': 'CR_source'}
)
# Calls side: drop earnings_call_datetime and web (noted by the authors as duplicating
# earnings_call_date and source), then rename symbol, source, and content
calls = (
    calls
    .drop(columns=["earnings_call_datetime", "web"])
    .rename(columns={'symbol': 'ticker', "source": "Calls_source", "content": "transcript"})
)

In [82]:
# Remaining columns and shape of each frame, calls first and ratings second
for frame in (calls, cr_and_dates):
    print(frame.columns)
    print(frame.shape)

Index(['ticker', 'quarter', 'year', 'transcript', 'Calls_source',
       'earnings_call_date'],
      dtype='object')
(31067, 6)
Index(['ticker', 'fixed_quarter_date', 'earnings_call_date', 'Rating',
       'Rating Agency Name', 'rating_date', 'CR_source',
       'Rating Rank AAA is 10', 'Next Rating', 'Next Rating Date',
       'Previous Rating', 'Previous Rating Date',
       'next_rating_date_or_end_of_data', 'credit_rating_year',
       'previous_fixed_quarter_date', 'days_since_call_on_fixed_quarter'],
      dtype='object')
(7981, 16)


In [83]:
# Inner join ratings and calls on the shared (ticker, earnings_call_date) key;
# rows without a match on both sides are dropped
merged = cr_and_dates.merge(calls, on=['ticker', 'earnings_call_date'], how='inner')

# Report the covered range (min, then max) of the call date, call year and credit rating year
for variable in ['earnings_call_date', 'year', 'credit_rating_year']:
    print(variable, merged[variable].min(), merged[variable].max(), '', sep='\n')

# The inputs are no longer needed once merged; release their memory
del calls, cr_and_dates
merged.head()

earnings_call_date
2010-05-03
2016-09-30

year
2010
2016

credit_rating_year
2010
2016



Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,credit_rating_year,previous_fixed_quarter_date,days_since_call_on_fixed_quarter,quarter,year,transcript,Calls_source
0,AAPL,2014-07-01 00:00:00.000000,2014-04-23,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,AAA,2014-04-24,2015-02-18,2014,2014-04-01 00:00:00.000000,69,2,2014,"Operator: Good day, everyone, and welcome to t...",web
1,AAPL,2014-10-01 00:00:00.000000,2014-07-22,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,AAA,2014-04-24,2015-02-18,2014,2014-07-01 00:00:00.000000,71,3,2014,"Operator: Good day, everyone, and welcome to t...",web
2,AAPL,2015-01-01 00:00:00.000000,2014-10-20,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,AAA,2014-04-24,2015-02-18,2014,2014-10-01 00:00:00.000000,73,4,2014,"Operator: Good day, everyone, and welcome to t...",web
3,AAPL,2015-04-01 00:00:00.000000,2015-01-27,AA,Standard & Poor's Ratings Services,2015-02-18,Supplementary,9,AA,2015-05-28,AA,2014-05-27,2015-05-28,2015,2015-01-01 00:00:00.000000,64,1,2015,Operator: Good day ladies and gentlemen and we...,web
4,AAPL,2015-07-01 00:00:00.000000,2015-04-27,AA,Standard & Poor's Ratings Services,2015-06-02,Supplementary,9,AA,2015-08-25,AA,2015-05-28,2015-08-25,2015,2015-04-01 00:00:00.000000,65,2,2015,"Operator: Good day everyone, and welcome to th...",web


In [84]:
# Strip the time component from each date column, leaving plain dates
for date_column in ("fixed_quarter_date", "previous_fixed_quarter_date", "rating_date"):
    merged[date_column] = pd.to_datetime(merged[date_column]).dt.date

In [85]:
# Show every merged row whose (ticker, fixed_quarter_date) key occurs more than once;
# an empty frame means the calls merge did not duplicate any rating row
key_is_repeated = merged.duplicated(subset=['ticker', 'fixed_quarter_date'], keep=False)
print(merged.loc[key_is_repeated])

Empty DataFrame
Columns: [ticker, fixed_quarter_date, earnings_call_date, Rating, Rating Agency Name, rating_date, CR_source, Rating Rank AAA is 10, Next Rating, Next Rating Date, Previous Rating, Previous Rating Date, next_rating_date_or_end_of_data, credit_rating_year, previous_fixed_quarter_date, days_since_call_on_fixed_quarter, quarter, year, transcript, Calls_source]
Index: []


In [86]:
# check values of source: row counts per distinct Calls_source value
# (Calls_source is the calls data's `source` column, renamed before the merge)
print(merged['Calls_source'].value_counts())

Calls_source
web    7981
Name: count, dtype: int64


In [87]:
# Shape and column names of the ratings/calls merge result
print(merged.shape)
print(merged.columns)

(7981, 20)
Index(['ticker', 'fixed_quarter_date', 'earnings_call_date', 'Rating',
       'Rating Agency Name', 'rating_date', 'CR_source',
       'Rating Rank AAA is 10', 'Next Rating', 'Next Rating Date',
       'Previous Rating', 'Previous Rating Date',
       'next_rating_date_or_end_of_data', 'credit_rating_year',
       'previous_fixed_quarter_date', 'days_since_call_on_fixed_quarter',
       'quarter', 'year', 'transcript', 'Calls_source'],
      dtype='object')


In [88]:
# Sanity counts on how each date relates to the rating date.
# `x < rating_date` counts rows where x falls BEFORE the rating date, so the labels say
# "before" (they previously said "after", which contradicted the comparison).
print("number of fixed_quarter_date = rating_date:", (merged.fixed_quarter_date == merged.rating_date).sum())
print("number of fixed_quarter_date before rating_date:", (merged.fixed_quarter_date < merged.rating_date).sum())
print("number of earnings_call_date = rating_date:", (merged.earnings_call_date == merged.rating_date).sum())
print("number of earnings_call_date before rating_date:", (merged.earnings_call_date < merged.rating_date).sum())

number of fix_quarter_date = rating_date: 26
number of fix_quarter_date after rating_date: 0
number of earnings_call_date = rating_date: 47
number of earnings_call_date after rating_date: 1615


In [89]:
# Summary statistics of days_since_call_on_fixed_quarter (renamed earlier from days_since_call)
merged.days_since_call_on_fixed_quarter.describe()

count    7981.000000
mean       57.050996
std        14.749327
min         0.000000
25%        54.000000
50%        61.000000
75%        66.000000
max        91.000000
Name: days_since_call_on_fixed_quarter, dtype: float64

In [90]:
# Make explicit that `year` comes from the calls data (credit_rating_year is the ratings-side year)
merged = merged.rename(columns={"year": "calls_year"})

In [91]:
# Preview the first 10 rows where the call's year differs from the credit rating year
merged[merged.calls_year != merged.credit_rating_year].head(10)

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,credit_rating_year,previous_fixed_quarter_date,days_since_call_on_fixed_quarter,quarter,calls_year,transcript,Calls_source
7,AAPL,2016-04-01,2016-01-26,AA,Standard & Poor's Ratings Services,2015-08-25,Supplementary,9,AA,2016-05-20,AA,2015-06-02,2016-05-20,2015,2016-01-01,66,1,2016,"Operator: Good day, ladies and gentlemen, and ...",web
13,ABB,2013-07-01,2013-04-24,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2013-04-01,68,1,2013,"Operator: Ladies and gentlemen, good morning o...",web
14,ABB,2013-10-01,2013-07-25,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2013-07-01,68,2,2013,"Operator: Ladies and gentlemen, good morning o...",web
15,ABB,2014-01-01,2013-10-24,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2013-10-01,69,3,2013,"Operator: Ladies and gentlemen, good morning o...",web
16,ABB,2014-04-01,2014-02-13,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2014-01-01,47,4,2013,operator and the maintenance leader wants he c...,web
17,ABB,2014-07-01,2014-04-29,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2014-04-01,63,1,2014,"Operator: Ladies and gentlemen, good morning, ...",web
18,ABB,2014-10-01,2014-07-23,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2014-07-01,70,2,2014,"Operator: Ladies and gentlemen, good morning o...",web
19,ABB,2015-01-01,2014-10-22,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2014-10-01,71,3,2014,"Operator: Ladies and gentlemen, good morning o...",web
20,ABB,2015-04-01,2015-02-05,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2015-01-01,55,4,2014,Operator: First question is from Mr. Ben Uglow...,web
21,ABB,2015-07-01,2015-04-29,A,Standard & Poor's Ratings Services,2012-05-17,Supplementary,8,,,BBB,2012-02-01,2016-12-31,2012,2015-04-01,63,1,2015,"Operator: Ladies and gentlemen, good morning o...",web


In [92]:
# Columns available before the financial statement data is joined
merged.columns

Index(['ticker', 'fixed_quarter_date', 'earnings_call_date', 'Rating',
       'Rating Agency Name', 'rating_date', 'CR_source',
       'Rating Rank AAA is 10', 'Next Rating', 'Next Rating Date',
       'Previous Rating', 'Previous Rating Date',
       'next_rating_date_or_end_of_data', 'credit_rating_year',
       'previous_fixed_quarter_date', 'days_since_call_on_fixed_quarter',
       'quarter', 'calls_year', 'transcript', 'Calls_source'],
      dtype='object')

### Merge Financial Statements Data to CR-Calls data

In [93]:
# Read the tabular financial statement data
df = pd.read_parquet(r'~\Box\STAT 222 Capstone\Intermediate Data\Tabular_Fin\combined_corrected_tabular_financial_statements_data.parquet')

# Attach the statements for the same company, year and quarter as the call.
# Inner join: rating/call rows without a matching statement are dropped.
merged = merged.merge(
    df,
    how='inner',
    left_on=['ticker', 'calls_year', 'quarter'],
    right_on=['symbol', 'calendarYear', 'period'],
)
display(merged.head())

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,marketCap,EBIT,common_plus_preferred_stock,workingCapital,Ratio_A,Ratio_B,Ratio_C,Ratio_D,Ratio_E,Altman_Z
0,AAPL,2014-07-01,2014-04-23,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,469534600000.0,13593000.0,21496000.0,27333000.0,0.065989,0.221594,8.770191,0.132692,0.480288,6.530889
1,AAPL,2014-10-01,2014-07-22,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,553162400.0,10282000.0,22139000.0,21744000.0,0.046207,0.168219,5.445584,0.097717,0.443623,4.324703
2,AAPL,2015-01-01,2014-10-20,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,597894200000.0,11165000.0,23313000.0,5083000.0,0.048158,0.181691,8.770191,0.021925,0.375916,6.153503
3,AAPL,2015-04-01,2015-01-27,AA,Standard & Poor's Ratings Services,2015-02-18,Supplementary,9,AA,2015-05-28,...,666111300.0,24246000.0,24187000.0,9792000.0,0.077155,0.284844,4.807177,0.037389,0.371059,3.985264
4,AAPL,2015-07-01,2015-04-27,AA,Standard & Poor's Ratings Services,2015-06-02,Supplementary,9,AA,2015-08-25,...,714027800000.0,18278000.0,25376000.0,9162000.0,0.069979,0.222095,8.770191,0.035077,0.386379,6.295942


In [94]:
# Show every row whose (ticker, fixed_quarter_date) key occurs more than once;
# an empty frame means the financial statements merge did not duplicate any row
key_is_repeated = merged.duplicated(subset=['ticker', 'fixed_quarter_date'], keep=False)
print(merged.loc[key_is_repeated])

Empty DataFrame
Columns: [ticker, fixed_quarter_date, earnings_call_date, Rating, Rating Agency Name, rating_date, CR_source, Rating Rank AAA is 10, Next Rating, Next Rating Date, Previous Rating, Previous Rating Date, next_rating_date_or_end_of_data, credit_rating_year, previous_fixed_quarter_date, days_since_call_on_fixed_quarter, quarter, calls_year, transcript, Calls_source, date, symbol, reportedCurrency, cik, fillingDate, acceptedDate, calendarYear, period, cashAndCashEquivalents, shortTermInvestments, cashAndShortTermInvestments, netReceivables, inventory, otherCurrentAssets, totalCurrentAssets, propertyPlantEquipmentNet, goodwill, intangibleAssets, goodwillAndIntangibleAssets, longTermInvestments, taxAssets, otherNonCurrentAssets, totalNonCurrentAssets, otherAssets, totalAssets, accountPayables, shortTermDebt, taxPayables, deferredRevenue, otherCurrentLiabilities, totalCurrentLiabilities, longTermDebt, deferredRevenueNonCurrent, deferredTaxLiabilitiesNonCurrent, otherNonCurrent

### New Columns: Rating Changes

In [95]:
# Variable for rating on previous fixed_quarter_date
# Sort by ticker, fixed_quarter_date so each company's rows are in chronological order
merged = merged.sort_values(by=['ticker', 'fixed_quarter_date'])
# Rating and fixed_quarter_date of the preceding row within each ticker
rows_by_ticker = merged.groupby('ticker')
preceding_rating = rows_by_ticker['Rating'].shift(1)
preceding_quarter = rows_by_ticker['fixed_quarter_date'].shift(1)
# The inner joins above can drop quarters, so the preceding row is not guaranteed to be the
# previous fixed quarter date. Keep the shifted rating only when it really is; otherwise
# leave the value missing rather than silently comparing across a gap.
is_previous_quarter = preceding_quarter == merged['previous_fixed_quarter_date']
merged['rating_on_previous_fixed_quarter_date'] = preceding_rating.where(is_previous_quarter)
merged[['ticker', 'fixed_quarter_date', 'Rating', 'rating_on_previous_fixed_quarter_date']].head(10)

Unnamed: 0,ticker,fixed_quarter_date,Rating,rating_on_previous_fixed_quarter_date
0,AAPL,2014-07-01,AA,
1,AAPL,2014-10-01,AA,AA
2,AAPL,2015-01-01,AA,AA
3,AAPL,2015-04-01,AA,AA
4,AAPL,2015-07-01,AA,AA
5,AAPL,2015-10-01,AA,AA
6,AAPL,2016-01-01,AA,AA
7,AAPL,2016-04-01,AA,AA
8,AAPL,2016-07-01,AA,AA
9,AAPL,2016-10-01,AA,AA


In [96]:
# Ordinal encoding of the letter ratings for the rating-change variables.
# Ratings ordered from worst to best, so D maps to 1 and AAA maps to 10
rating_order = list(reversed(['AAA', 'AA', 'A', 'BBB', 'BB', 'B', 'CCC', 'CC', 'C', 'D']))
rating_dict = dict(zip(rating_order, range(1, len(rating_order) + 1)))

# Numeric rank of the current rating
merged['Rating Rank AAA is 10'] = merged['Rating'].map(rating_dict)
# Flag for investment grade - Rating Rank >= 7.0 (BBB and better)
merged['Investment_Grade'] = merged['Rating Rank AAA is 10'] >= 7.0
# Numeric rank of the rating on the previous fixed quarter date
merged['rating_on_previous_fixed_quarter_date AAA is 10'] = merged['rating_on_previous_fixed_quarter_date'].map(rating_dict)

# Cross tab each letter-rating column against its numeric rank to verify the mapping
for letter_column, rank_column in [
    ('Rating', 'Rating Rank AAA is 10'),
    ('rating_on_previous_fixed_quarter_date', 'rating_on_previous_fixed_quarter_date AAA is 10'),
]:
    display(pd.crosstab(merged[letter_column], merged[rank_column]))

Rating Rank AAA is 10,1,2,3,4,5,6,7,8,9,10
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,0,0,0,0,0,0,0,1389,0,0
AA,0,0,0,0,0,0,0,0,339,0
AAA,0,0,0,0,0,0,0,0,0,137
B,0,0,0,0,1142,0,0,0,0,0
BB,0,0,0,0,0,1870,0,0,0,0
BBB,0,0,0,0,0,0,2233,0,0,0
C,0,16,0,0,0,0,0,0,0,0
CC,0,0,14,0,0,0,0,0,0,0
CCC,0,0,0,185,0,0,0,0,0,0
D,9,0,0,0,0,0,0,0,0,0


rating_on_previous_fixed_quarter_date AAA is 10,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0
rating_on_previous_fixed_quarter_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,0,0,0,0,0,0,0,1303,0,0
AA,0,0,0,0,0,0,0,0,313,0
AAA,0,0,0,0,0,0,0,0,0,128
B,0,0,0,0,1044,0,0,0,0,0
BB,0,0,0,0,0,1719,0,0,0,0
BBB,0,0,0,0,0,0,2083,0,0,0
C,0,15,0,0,0,0,0,0,0,0
CC,0,0,13,0,0,0,0,0,0,0
CCC,0,0,0,172,0,0,0,0,0,0
D,8,0,0,0,0,0,0,0,0,0


In [97]:
# Direction of the rating change relative to the previous fixed quarter date
def change(row):
    """Label a row as an upgrade, a downgrade, or unchanged.

    Compares row['Rating Rank AAA is 10'] with
    row['rating_on_previous_fixed_quarter_date AAA is 10'].
    Returns None when none of the comparisons holds (e.g. a rank is NaN).
    """
    current_rank = row['Rating Rank AAA is 10']
    previous_rank = row['rating_on_previous_fixed_quarter_date AAA is 10']
    if current_rank == previous_rank:
        return 'Same As Last Fixed Quarter Date'
    if current_rank > previous_rank:
        return 'Upgrade Since Last Fixed Quarter Date'
    if current_rank < previous_rank:
        return 'Downgrade Since Last Fixed Quarter Date'
    # Reached only when the ranks are not comparable (NaN compares False to everything)
    return None

# Rank columns compared by both change variables
rank_now = merged['Rating Rank AAA is 10']
rank_before = merged['rating_on_previous_fixed_quarter_date AAA is 10']

# Text label per row (upgrade / downgrade / same), computed row by row
merged['Change Direction Since Last Fixed Quarter Date'] = merged.apply(change, axis=1)

# Numeric counterpart: signed difference in rank (positive = upgrade)
merged['Change Since Last Fixed Quarter Date'] = rank_now - rank_before

# Cross tab the label against the numeric change to confirm they agree
display(pd.crosstab(
    merged['Change Direction Since Last Fixed Quarter Date'],
    merged['Change Since Last Fixed Quarter Date'],
))

Change Since Last Fixed Quarter Date,-2.0,-1.0,0.0,1.0,2.0
Change Direction Since Last Fixed Quarter Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Downgrade Since Last Fixed Quarter Date,17,149,0,0,0
Same As Last Fixed Quarter Date,0,0,6433,0,0
Upgrade Since Last Fixed Quarter Date,0,0,0,180,19


### Sector and Other Textual Company Information

From Kaggle: https://www.kaggle.com/datasets/aramacus/usa-public-companies

Supplemented with manually looked up sector data for missing sectors

In [98]:
# Load the combined sector lookup (Kaggle company data supplemented with manually
# looked-up sectors) from the shared Box folder
combined_sector_data = pd.read_csv(r'~\Box\STAT 222 Capstone\Intermediate Data\Sectors\combined_sector_data.csv')
# Preview the first rows
combined_sector_data.head()

Unnamed: 0,Ticker,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment
0,CTVA,"Corteva, Inc. is a global provider of seed and...",CORTEVA INC,Materials,Materials,Chemicals,Fertilizers & Agricultural Chemicals,"Producers of fertilizers, pesticides, potash o..."
1,ALCO,"Alico, Inc. is an agribusiness and land manage...",ALICO INC,Consumer Staples,"Food, Beverage & Tobacco",Food Products,Agricultural Products,Producers of agricultural products. Includes c...
2,LMNR,Limoneira Company is primarily an agribusiness...,LIMONEIRA CO,Consumer Staples,"Food, Beverage & Tobacco",Food Products,Agricultural Products,Producers of agricultural products. Includes c...
3,SANW,S&W Seed Company (S&W) is a multi-crop and mid...,S&W SEED CO,Consumer Staples,"Food, Beverage & Tobacco",Food Products,Agricultural Products,Producers of agricultural products. Includes c...
4,TRC,Tejon Ranch Co. is a diversified real estate d...,TEJON RANCH CO,Real Estate,Real Estate,Real Estate Management & Development,Diversified Real Estate Activities,Companies engaged in a diverse spectrum of rea...


In [99]:
# (rows, columns) of the sector lookup table
combined_sector_data.shape

(3389, 8)

In [100]:
# Left join the sector lookup onto merged: every merged row is kept, and sector
# columns are filled where the ticker is found in combined_sector_data
merged = merged.merge(combined_sector_data, how='left', left_on='ticker', right_on='Ticker')
merged

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Ticker,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment
0,AAPL,2014-07-01,2014-04-23,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,,,AAPL,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com..."
1,AAPL,2014-10-01,2014-07-22,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,Same As Last Fixed Quarter Date,0.0,AAPL,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com..."
2,AAPL,2015-01-01,2014-10-20,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,Same As Last Fixed Quarter Date,0.0,AAPL,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com..."
3,AAPL,2015-04-01,2015-01-27,AA,Standard & Poor's Ratings Services,2015-02-18,Supplementary,9,AA,2015-05-28,...,Same As Last Fixed Quarter Date,0.0,AAPL,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com..."
4,AAPL,2015-07-01,2015-04-27,AA,Standard & Poor's Ratings Services,2015-06-02,Supplementary,9,AA,2015-08-25,...,Same As Last Fixed Quarter Date,0.0,AAPL,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7329,ZTS,2015-10-01,2015-08-04,BBB,Standard & Poor's Ratings Services,2015-01-30,Both,7,BBB,2015-11-03,...,Same As Last Fixed Quarter Date,0.0,ZTS,"Zoetis Inc. is focused on the discovery, devel...",ZOETIS INC,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Pharmaceuticals,Pharmaceuticals,"Companies engaged in the research, development..."
7330,ZTS,2016-01-01,2015-11-03,BBB,Standard & Poor's Ratings Services,2015-11-03,Supplementary,7,BBB,2016-01-22,...,Same As Last Fixed Quarter Date,0.0,ZTS,"Zoetis Inc. is focused on the discovery, devel...",ZOETIS INC,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Pharmaceuticals,Pharmaceuticals,"Companies engaged in the research, development..."
7331,ZTS,2016-04-01,2016-02-16,BBB,Standard & Poor's Ratings Services,2016-01-22,Both,7,BBB,2016-12-23,...,Same As Last Fixed Quarter Date,0.0,ZTS,"Zoetis Inc. is focused on the discovery, devel...",ZOETIS INC,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Pharmaceuticals,Pharmaceuticals,"Companies engaged in the research, development..."
7332,ZTS,2016-07-01,2016-05-04,BBB,Standard & Poor's Ratings Services,2016-01-22,Both,7,BBB,2016-12-23,...,Same As Last Fixed Quarter Date,0.0,ZTS,"Zoetis Inc. is focused on the discovery, devel...",ZOETIS INC,Health Care,"Pharmaceuticals, Biotechnology & Life Sciences",Pharmaceuticals,Pharmaceuticals,"Companies engaged in the research, development..."


In [101]:
# Show every row whose (ticker, fixed_quarter_date) key occurs more than once;
# an empty frame means the sector join did not duplicate any row
key_is_repeated = merged.duplicated(subset=['ticker', 'fixed_quarter_date'], keep=False)
print(merged.loc[key_is_repeated])

Empty DataFrame
Columns: [ticker, fixed_quarter_date, earnings_call_date, Rating, Rating Agency Name, rating_date, CR_source, Rating Rank AAA is 10, Next Rating, Next Rating Date, Previous Rating, Previous Rating Date, next_rating_date_or_end_of_data, credit_rating_year, previous_fixed_quarter_date, days_since_call_on_fixed_quarter, quarter, calls_year, transcript, Calls_source, date, symbol, reportedCurrency, cik, fillingDate, acceptedDate, calendarYear, period, cashAndCashEquivalents, shortTermInvestments, cashAndShortTermInvestments, netReceivables, inventory, otherCurrentAssets, totalCurrentAssets, propertyPlantEquipmentNet, goodwill, intangibleAssets, goodwillAndIntangibleAssets, longTermInvestments, taxAssets, otherNonCurrentAssets, totalNonCurrentAssets, otherAssets, totalAssets, accountPayables, shortTermDebt, taxPayables, deferredRevenue, otherCurrentLiabilities, totalCurrentLiabilities, longTermDebt, deferredRevenueNonCurrent, deferredTaxLiabilitiesNonCurrent, otherNonCurrent

In [102]:
# Number of rows left without a Sector after the left join
merged.Sector.isnull().sum()

0

### Some Basic Stats

In [103]:
# Summarize columns. DataFrame.info() writes its report itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None" line.
merged.info()
# print column names, one per line
for col in merged.columns:
    print(col)
# Overall shape and number of distinct companies
print(merged.shape)
print(merged.ticker.nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7334 entries, 0 to 7333
Columns: 167 entries, ticker to Comment
dtypes: bool(1), datetime64[ns](1), float64(114), int32(1), int64(9), object(41)
memory usage: 9.3+ MB
None
ticker
fixed_quarter_date
earnings_call_date
Rating
Rating Agency Name
rating_date
CR_source
Rating Rank AAA is 10
Next Rating
Next Rating Date
Previous Rating
Previous Rating Date
next_rating_date_or_end_of_data
credit_rating_year
previous_fixed_quarter_date
days_since_call_on_fixed_quarter
quarter
calls_year
transcript
Calls_source
date
symbol
reportedCurrency
cik
fillingDate
acceptedDate
calendarYear
period
cashAndCashEquivalents
shortTermInvestments
cashAndShortTermInvestments
netReceivables
inventory
otherCurrentAssets
totalCurrentAssets
propertyPlantEquipmentNet
goodwill
intangibleAssets
goodwillAndIntangibleAssets
longTermInvestments
taxAssets
otherNonCurrentAssets
totalNonCurrentAssets
otherAssets
totalAssets
accountPayables
shortTermDebt
taxPayables
deferredR

In [104]:
# Report whether the joined `Ticker` column agrees with `ticker` on every row,
# then remove it as redundant
tickers_agree = (merged.ticker == merged.Ticker).all()
print(tickers_agree)
merged = merged.drop(columns="Ticker")
print(merged.shape)
print(merged.columns)

True
(7334, 166)
Index(['ticker', 'fixed_quarter_date', 'earnings_call_date', 'Rating',
       'Rating Agency Name', 'rating_date', 'CR_source',
       'Rating Rank AAA is 10', 'Next Rating', 'Next Rating Date',
       ...
       'rating_on_previous_fixed_quarter_date AAA is 10',
       'Change Direction Since Last Fixed Quarter Date',
       'Change Since Last Fixed Quarter Date', 'Description', 'Company Name',
       'Sector', 'Industry Group', 'Industry', 'Sub-Industry', 'Comment'],
      dtype='object', length=166)


## 80-20 Train Test Split

In [105]:
# Define share of data to be used for training:
# each row is labelled 'train' with this probability, otherwise 'test'
share_train = 0.8

In [106]:
# Seed Python's random module with 222 so the split is reproducible
import random
random.seed(222)
# One uniform(0, 1) draw per row of merged, in row order
rand_uniforms = [random.uniform(0, 1) for _ in range(len(merged))]
# Column train_test_80_20: a row is 'train' when its draw falls below share_train, else 'test'
merged['train_test_80_20'] = ['test' if draw >= share_train else 'train' for draw in rand_uniforms]
# Show ticker and fixed_quarter_date of the first 10 training rows
is_train = merged['train_test_80_20'] == 'train'
print(merged.loc[is_train, ['ticker', 'fixed_quarter_date']].head(10))
# Size of each split
print(merged['train_test_80_20'].value_counts())

   ticker fixed_quarter_date
0    AAPL         2014-07-01
1    AAPL         2014-10-01
2    AAPL         2015-01-01
3    AAPL         2015-04-01
4    AAPL         2015-07-01
5    AAPL         2015-10-01
6    AAPL         2016-01-01
7    AAPL         2016-04-01
8    AAPL         2016-07-01
10    ABB         2012-10-01
train_test_80_20
train    5933
test     1401
Name: count, dtype: int64


## Exporting Data

In [107]:
# Save the full combined dataset as a parquet file (the row index is not written)
merged.to_parquet(r'~\Box\STAT 222 Capstone\Intermediate Data\All_Data\all_data_fixed_quarter_dates.parquet', index=False)

In [108]:
# Sample up to 100 rows to create all_data_fixed_quarter_dates_sample.csv / .xlsx
if not DEBUG:
    # Fixed random_state (same value as the train/test split seed) so re-running the
    # notebook regenerates the same sample instead of a different one each time;
    # the size is capped so a frame with fewer than 100 rows does not raise.
    all_data_sample = merged.sample(min(100, len(merged)), random_state=222)
    all_data_sample.to_csv(r'~\Box\STAT 222 Capstone\Intermediate Data\All_Data\all_data_fixed_quarter_dates_sample.csv', index=False)
    # also save to xlsx
    all_data_sample.to_excel(r'~\Box\STAT 222 Capstone\Intermediate Data\All_Data\all_data_fixed_quarter_dates_sample.xlsx', index=False)

## Fragmented Version to Store on GitHub

In [109]:
# Split dataset into pieces small enough to store on GitHub
import os
import shutil

num_pieces = 8
total_len_pieces = 0
folder = r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
sample_folder = r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates_Sample'

# Everything below runs only for a full (non-debug) run. Previously the old pieces were
# deleted even in DEBUG mode, where no new pieces are written, leaving the folder empty.
if not DEBUG:
    # Make sure both output folders exist before listing / writing
    os.makedirs(folder, exist_ok=True)
    os.makedirs(sample_folder, exist_ok=True)

    # Delete previous pieces, all contents of '../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report the failure and continue with the remaining entries
            print('Failed to delete %s. Reason: %s' % (file_path, e))

    # Save pieces
    for i in range(num_pieces):
        # start (inclusive) and end (exclusive) row positions of this piece;
        # integer division spreads any remainder across the pieces
        start_index = i * len(merged) // num_pieces
        end_index = (i + 1) * len(merged) // num_pieces
        # get piece by row position
        piece = merged.iloc[start_index:end_index]
        piece.to_parquet(f'{folder}/all_data_fixed_quarter_dates_piece_{i}.parquet', index=False)
        display(piece.head(3))
        print(len(piece))
        total_len_pieces += len(piece)

    # The pieces must cover every row exactly once
    assert total_len_pieces == len(merged), (
        'pieces hold %d rows but merged has %d' % (total_len_pieces, len(merged))
    )

    # save a sample of up to 100 rows; fixed random_state keeps the stored sample stable across runs
    merged.sample(min(100, len(merged)), random_state=222).to_parquet(
        f'{sample_folder}/all_data_fixed_quarter_dates_sample.parquet', index=False
    )


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
0,AAPL,2014-07-01,2014-04-23,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,,,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com...",train
1,AAPL,2014-10-01,2014-07-22,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,Same As Last Fixed Quarter Date,0.0,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com...",train
2,AAPL,2015-01-01,2014-10-20,AA,Standard & Poor's Ratings Services,2014-05-27,Supplementary,9,AA,2015-02-18,...,Same As Last Fixed Quarter Date,0.0,"Apple Inc. (Apple) designs, manufactures and m...",APPLE INC,Information Technology,Technology Hardware & Equipment,"Technology Hardware, Storage & Peripherals","Technology Hardware, Storage & Peripherals","Manufacturers of cellular phones, personal com...",train


916


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
916,BWA,2011-10-01,2011-07-28,BBB,Standard & Poor's Ratings Services,2011-03-29,Supplementary,7,BBB,2012-04-10,...,,,BorgWarner Inc. is a provider of technology so...,BORGWARNER INC,Consumer Discretionary,Automobiles & Components,Auto Components,Auto Parts & Equipment,Manufacturers of parts and accessories for au...,train
917,BWA,2012-01-01,2011-10-28,BBB,Standard & Poor's Ratings Services,2011-03-29,Supplementary,7,BBB,2012-04-10,...,Same As Last Fixed Quarter Date,0.0,BorgWarner Inc. is a provider of technology so...,BORGWARNER INC,Consumer Discretionary,Automobiles & Components,Auto Components,Auto Parts & Equipment,Manufacturers of parts and accessories for au...,train
918,BWA,2012-04-01,2012-02-14,BBB,Standard & Poor's Ratings Services,2011-03-29,Supplementary,7,BBB,2012-04-10,...,Same As Last Fixed Quarter Date,0.0,BorgWarner Inc. is a provider of technology so...,BORGWARNER INC,Consumer Discretionary,Automobiles & Components,Auto Components,Auto Parts & Equipment,Manufacturers of parts and accessories for au...,train


917


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
1833,DUK,2014-04-01,2014-02-18,A,Standard & Poor's Ratings Services,2012-07-25,Supplementary,8,,,...,Same As Last Fixed Quarter Date,0.0,Duke Energy Corporation is an energy company. ...,DUKE ENERGY CORP,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,test
1834,DUK,2014-07-01,2014-05-07,A,Standard & Poor's Ratings Services,2012-07-25,Supplementary,8,,,...,Same As Last Fixed Quarter Date,0.0,Duke Energy Corporation is an energy company. ...,DUKE ENERGY CORP,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,train
1835,DUK,2014-10-01,2014-08-07,A,Standard & Poor's Ratings Services,2012-07-25,Supplementary,8,,,...,Same As Last Fixed Quarter Date,0.0,Duke Energy Corporation is an energy company. ...,DUKE ENERGY CORP,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,train


917


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
2750,GOLD,2011-10-01,2011-08-08,A,Standard & Poor's Ratings Services,2011-05-19,Supplementary,8,A,2012-03-30,...,Same As Last Fixed Quarter Date,0.0,,,Materials,,,,,train
2751,GOLD,2012-01-01,2011-11-02,A,Standard & Poor's Ratings Services,2011-05-19,Supplementary,8,A,2012-03-30,...,Same As Last Fixed Quarter Date,0.0,,,Materials,,,,,train
2752,GOLD,2012-04-01,2012-02-16,A,Standard & Poor's Ratings Services,2012-03-30,Supplementary,8,A,2012-05-31,...,Same As Last Fixed Quarter Date,0.0,,,Materials,,,,,test


917


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
3667,KOP,2016-07-01,2016-05-06,B,Standard & Poor's Ratings Services,2015-01-26,Supplementary,5,,,...,Same As Last Fixed Quarter Date,0.0,Koppers Holdings Inc. is an integrated global ...,KOPPERS HOLDINGS INC,Materials,Materials,Chemicals,Commodity Chemicals,Companies that primarily produce industrial ch...,train
3668,KOP,2016-10-01,2016-08-04,B,Standard & Poor's Ratings Services,2015-01-26,Supplementary,5,,,...,Same As Last Fixed Quarter Date,0.0,Koppers Holdings Inc. is an integrated global ...,KOPPERS HOLDINGS INC,Materials,Materials,Chemicals,Commodity Chemicals,Companies that primarily produce industrial ch...,train
3669,KOS,2014-10-01,2014-08-04,CCC,Standard & Poor's Ratings Services,2014-07-21,Supplementary,4,CCC,2015-04-08,...,,,Kosmos Energy Ltd. is a full cycle deepwater i...,KOSMOS ENERGY LTD,Energy,Energy,"Oil, Gas & Consumable Fuels",Oil & Gas Exploration & Production,Companies engaged in the exploration and produ...,train


916


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
4583,NEE,2011-04-01,2011-01-25,BBB,Standard & Poor's Ratings Services,2011-03-25,Supplementary,7,BBB,2012-03-27,...,,,"NextEra Energy, Inc. is an electric power and ...",NEXTERA ENERGY INC,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,train
4584,NEE,2011-07-01,2011-04-29,BBB,Standard & Poor's Ratings Services,2011-03-25,Supplementary,7,BBB,2012-03-27,...,Same As Last Fixed Quarter Date,0.0,"NextEra Energy, Inc. is an electric power and ...",NEXTERA ENERGY INC,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,test
4585,NEE,2011-10-01,2011-07-27,BBB,Standard & Poor's Ratings Services,2011-03-25,Supplementary,7,BBB,2012-03-27,...,Same As Last Fixed Quarter Date,0.0,"NextEra Energy, Inc. is an electric power and ...",NEXTERA ENERGY INC,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,train


917


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
5500,POR,2015-07-01,2015-04-28,A,Standard & Poor's Ratings Services,2014-07-24,Supplementary,8,A,2015-07-22,...,Same As Last Fixed Quarter Date,0.0,Portland General Electric Company (PGE) is a v...,PORTLAND GENERAL ELECTRIC CO,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,train
5501,POR,2015-10-01,2015-07-28,A,Standard & Poor's Ratings Services,2015-07-22,Supplementary,8,A,2016-06-23,...,Same As Last Fixed Quarter Date,0.0,Portland General Electric Company (PGE) is a v...,PORTLAND GENERAL ELECTRIC CO,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,train
5502,POR,2016-01-01,2015-10-27,A,Standard & Poor's Ratings Services,2015-07-22,Supplementary,8,A,2016-06-23,...,Same As Last Fixed Quarter Date,0.0,Portland General Electric Company (PGE) is a v...,PORTLAND GENERAL ELECTRIC CO,Utilities,Utilities,Electric Utilities,Electric Utilities,Companies that produce or distribute electrici...,train


917


Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,Rating Agency Name,rating_date,CR_source,Rating Rank AAA is 10,Next Rating,Next Rating Date,...,Change Direction Since Last Fixed Quarter Date,Change Since Last Fixed Quarter Date,Description,Company Name,Sector,Industry Group,Industry,Sub-Industry,Comment,train_test_80_20
6417,TDG,2015-10-01,2015-08-04,CCC,Standard & Poor's Ratings Services,2015-05-15,Supplementary,4,CCC,2016-04-06,...,Same As Last Fixed Quarter Date,0.0,"TransDigm Group Incorporated is a designer, pr...",TRANSDIGM GROUP INC,Industrials,Capital Goods,Aerospace & Defense,Aerospace & Defense,Manufacturers of civil or military aerospace a...,test
6418,TDG,2016-01-01,2015-11-12,CCC,Standard & Poor's Ratings Services,2015-05-15,Supplementary,4,CCC,2016-04-06,...,Same As Last Fixed Quarter Date,0.0,"TransDigm Group Incorporated is a designer, pr...",TRANSDIGM GROUP INC,Industrials,Capital Goods,Aerospace & Defense,Aerospace & Defense,Manufacturers of civil or military aerospace a...,train
6419,TDG,2016-04-01,2016-02-09,CCC,Standard & Poor's Ratings Services,2015-05-15,Supplementary,4,CCC,2016-04-06,...,Same As Last Fixed Quarter Date,0.0,"TransDigm Group Incorporated is a designer, pr...",TRANSDIGM GROUP INC,Industrials,Capital Goods,Aerospace & Defense,Aerospace & Defense,Manufacturers of civil or military aerospace a...,train


917


In [110]:
# Compare the number of rows written across all pieces (first line)
# with the number of rows in merged (second line); they should match
print(total_len_pieces, len(merged), sep='\n')

7334
7334
