# Create Learnable Network

Limit the data to calls that can be used for training and testing the graph neural network.

In [15]:
# Packages
import pandas as pd
import os
target_column = 'Rating'

## Load Feature and Class Data

In [16]:
# Read the Parquet file into a DataFrame
# list of files in '../../../../Data/All_Data/All_Data_with_NLP_Features'
file_list = [f for f in os.listdir(r'../../../Data/All_Data/All_Data_with_NLP_Features') if f.endswith('.parquet')]
# read in all parquet files
feature_and_class_df = pd.concat([pd.read_parquet(r'../../../Data/All_Data/All_Data_with_NLP_Features/' + f) for f in file_list])
# Sort by ticker and fixed_quarter_date
feature_and_class_df = feature_and_class_df.sort_values(['ticker', 'fixed_quarter_date']).reset_index(drop=True)
# Create node as ticker + ' : ' + fixed_quarter_date
feature_and_class_df['node'] = feature_and_class_df['ticker'] + ' : ' + feature_and_class_df['fixed_quarter_date'].astype(str)
# Drop transcript, not needed
feature_and_class_df = feature_and_class_df.drop(columns=['transcript'])

feature_and_class_df

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len,train_test_80_20,node
0,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,train,AAPL : 2014-10-01
1,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,train,AAPL : 2015-01-01
2,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,train,AAPL : 2015-04-01
3,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,train,AAPL : 2015-07-01
4,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,449.0,148.0,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915,train,AAPL : 2015-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5504,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,BBB,2015-11-03,BBB,2014-01-31,2015-11-03,...,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458,train,ZTS : 2015-10-01
5505,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,BBB,2016-01-22,BBB,2015-01-30,2016-01-22,...,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859,train,ZTS : 2016-01-01
5506,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928,train,ZTS : 2016-04-01
5507,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182,train,ZTS : 2016-07-01


## Load Mentions Data

In [17]:
# Company mentions file
company_mentions_with_ticker = pd.read_excel('../../../Data/Company_Mentions/Company_Mentions_With_Ticker.xlsx')

# Get tickers and counts
src_dst_df = (company_mentions_with_ticker[['ticker', 'matched_ticker', 'fixed_quarter_date']]
                                            .drop_duplicates()
                                            .rename(columns={'ticker': 'src_ticker', 'matched_ticker': 'dst_ticker'}))

# Note: don't need to do anything to handle symmetry (don't need to drop half of the pairs)

src_dst_df

Unnamed: 0,src_ticker,dst_ticker,fixed_quarter_date
0,LPX,MCO,2016-04-01
1,LYV,AAPL,2015-07-01
2,LYV,AAPL,2016-01-01
3,LYV,AAPL,2016-04-01
4,LYV,CSCO,2016-07-01
...,...,...,...
3330,HOV,MTH,2015-10-01
3331,HOV,CNSL,2015-10-01
3332,KODK,AMZN,2015-10-01
3333,KODK,EPD,2016-04-01


In [18]:
# Add node identifier

# Convert fixed_quarter_date to a string for joining
src_dst_df['fixed_quarter_date'] = src_dst_df['fixed_quarter_date'].astype(str)
feature_and_class_df['fixed_quarter_date'] = feature_and_class_df['fixed_quarter_date'].astype(str)

# Join with feature_and_class_df to get node for src_ticker and dst_ticker
src_dst_df = src_dst_df.merge(feature_and_class_df[['ticker', 'fixed_quarter_date', 'node']], left_on=['src_ticker', 'fixed_quarter_date'], right_on=['ticker', 'fixed_quarter_date'], how='inner').rename(columns={'node': 'src_node'})
print('num obs')
print(len(src_dst_df))
src_dst_df = src_dst_df.merge(feature_and_class_df[['ticker', 'fixed_quarter_date', 'node']], left_on=['dst_ticker', 'fixed_quarter_date'], right_on=['ticker', 'fixed_quarter_date'], how='inner').rename(columns={'node': 'dst_node'})
print('num obs')
print(len(src_dst_df))

# Limit columns to just src_node and dst_node, rename to src and dst
src_dst_df = src_dst_df[['src_node', 'dst_node']].rename(columns={'src_node': 'src', 'dst_node': 'dst'})

src_dst_df

num obs
2750
num obs
1807


Unnamed: 0,src,dst
0,LPX : 2016-04-01,MCO : 2016-04-01
1,SBAC : 2016-04-01,MCO : 2016-04-01
2,AR : 2016-04-01,MCO : 2016-04-01
3,ATVI : 2016-04-01,MCO : 2016-04-01
4,DVN : 2016-04-01,MCO : 2016-04-01
...,...,...
1802,THC : 2014-10-01,ACIW : 2014-10-01
1803,HOV : 2013-07-01,CNSL : 2013-07-01
1804,HOV : 2014-04-01,TOL : 2014-04-01
1805,HOV : 2015-10-01,MTH : 2015-10-01


## Jointly Limit Feature and Class and Source - Destination Data

In [19]:
# In feature_and_class_df, value counts by train_test_80_20 and Rating
feature_and_class_df[['train_test_80_20', 'Rating']].value_counts().reset_index().sort_values(['Rating', 'train_test_80_20'])

Unnamed: 0,train_test_80_20,Rating,count
6,test,A,208
2,train,A,833
11,test,AA,52
7,train,AA,164
13,test,AAA,24
10,train,AAA,88
8,test,B,154
3,train,B,613
5,test,BB,284
1,train,BB,1173


In [20]:
# Limit feature_and_class_df to just items with a node in one of the src or dst columns
print('keeping only items that are connected/have a node in src or dst in src_dst_df')
print('length of feature_and_class_df')
print(len(feature_and_class_df))
feature_and_class_df = feature_and_class_df[feature_and_class_df['node'].isin(src_dst_df['src']) | feature_and_class_df['node'].isin(src_dst_df['dst'])]
print('new length of feature_and_class_df')
print(len(feature_and_class_df))

# Drop any items that belong to target_column values with only one training node
print('drop items in classes with only one training node')
print('length of feature_and_class_df')
print(len(feature_and_class_df))
feature_and_class_df['train_indicator'] = feature_and_class_df['train_test_80_20'].apply(lambda x: 1 if x == 'train' else 0)
feature_and_class_df['train_count'] = feature_and_class_df.groupby(target_column)['train_indicator'].transform('sum')
print('train_count value counts by rating')
print(feature_and_class_df[['Rating', 'train_count']].value_counts())
feature_and_class_df = feature_and_class_df[(feature_and_class_df['train_count'] > 1)]
feature_and_class_df = feature_and_class_df.drop(columns=['train_indicator', 'train_count'])
print('new length of feature_and_class_df')
print(len(feature_and_class_df))
print('new train test split')
print(feature_and_class_df['train_test_80_20'].value_counts())

# Limit src and dst to just nodes in feature_and_class_df
print('keeping only src and dst that are in feature_and_class_df')
print('length of src_dst_df')
print(len(src_dst_df))
src_dst_df = src_dst_df[src_dst_df['src'].isin(feature_and_class_df['node'])]
src_dst_df = src_dst_df[src_dst_df['dst'].isin(feature_and_class_df['node'])]
print('new length of src_dst_df')
print(len(src_dst_df))

# Again limit feature_and_class_df to just items with a node in one of the src or dst columns
print('keeping only items that are connected/have a node in src or dst in src_dst_df')
print('length of feature_and_class_df')
print(len(feature_and_class_df))
feature_and_class_df = feature_and_class_df[feature_and_class_df['node'].isin(src_dst_df['src']) | feature_and_class_df['node'].isin(src_dst_df['dst'])]
print('new length of feature_and_class_df')
print(len(feature_and_class_df))
print('new train test split')
print(feature_and_class_df['train_test_80_20'].value_counts())

keeping only items that are connected/have a node in src or dst in src_dst_df
length of feature_and_class_df
5509
new length of feature_and_class_df
1965
drop items in classes with only one training node
length of feature_and_class_df
1965
train_count value counts by rating
Rating  train_count
BBB     502            622
BB      365            455
A       350            437
B       190            241
AA      86             116
AAA     46              63
CCC     21              25
C       2                4
D       1                2
Name: count, dtype: int64
new length of feature_and_class_df
1963
new train test split
train_test_80_20
train    1562
test      401
Name: count, dtype: int64
keeping only src and dst that are in feature_and_class_df
length of src_dst_df
1807
new length of src_dst_df
1805
keeping only items that are connected/have a node in src or dst in src_dst_df
length of feature_and_class_df
1963
new length of feature_and_class_df
1963
new train test split
train_test_80_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_and_class_df['train_indicator'] = feature_and_class_df['train_test_80_20'].apply(lambda x: 1 if x == 'train' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feature_and_class_df['train_count'] = feature_and_class_df.groupby(target_column)['train_indicator'].transform('sum')


In [21]:
# In feature_and_class_df, value counts by train_test_80_20 and Rating
feature_and_class_df[['train_test_80_20', 'Rating']].value_counts().reset_index().sort_values(['Rating', 'train_test_80_20'])

Unnamed: 0,train_test_80_20,Rating,count
6,test,A,87
2,train,A,350
10,test,AA,30
7,train,AA,86
12,test,AAA,17
9,train,AAA,46
8,test,B,51
3,train,B,190
5,test,BB,90
1,train,BB,365


## Encode Node as Number

In [22]:
# List of all values of node in train_and_val_df and test_df
all_nodes = list(set(feature_and_class_df['node']))
# Encode as integers, create a mapping
node_to_int = {node: i for i, node in enumerate(all_nodes)}
# Add to feature_and_class_df as a replacement of node
feature_and_class_df['node'] = feature_and_class_df['node'].map(node_to_int)
# Same for src and dst
src_dst_df['src'] = src_dst_df['src'].map(node_to_int)
src_dst_df['dst'] = src_dst_df['dst'].map(node_to_int)
# Convert dictionary of node_to_int to df
node_to_int_df = pd.DataFrame(list(node_to_int.items()), columns=['node', 'node_int'])
# Save to disk
node_to_int_df.to_excel('../../../Data/Learnable Network/node_to_int.xlsx', index=False)

## Save Data

In [23]:
# Feature and class df
feature_and_class_df.to_excel('../../../Data/Learnable Network/feature_and_class_df.xlsx', index=False)

# src_dst_df
src_dst_df.to_excel('../../../Data/Learnable Network/src_dst_df.xlsx', index=False)

## Create special custom_mapping due to limited rating

In [24]:
uniq_ratings = feature_and_class_df['Rating'].unique()
ordering = ['AAA', 'AA', 'A', 'BBB', 'BB', 'B', 'CCC', 'CC', 'C', 'D']
# sorted uniq_ratings
sorted_uniq_ratings = sorted(uniq_ratings, key=lambda x: ordering.index(x))
print('sorted_uniq_ratings')
print(sorted_uniq_ratings)
# convert to enumerated dictionary
rating_to_int = {rating: i for i, rating in enumerate(sorted_uniq_ratings)}
print('rating_to_int')
print(rating_to_int)
# save to dictionary to disk
rating_to_int_df = pd.DataFrame(list(rating_to_int.items()), columns=['Rating', 'Rating_int'])
print('rating_to_int_df')
print(rating_to_int_df)
rating_to_int_df.to_excel('../../../Data/Learnable Network/rating_to_int.xlsx', index=False)

sorted_uniq_ratings
['AAA', 'AA', 'A', 'BBB', 'BB', 'B', 'CCC', 'C']
rating_to_int
{'AAA': 0, 'AA': 1, 'A': 2, 'BBB': 3, 'BB': 4, 'B': 5, 'CCC': 6, 'C': 7}
rating_to_int_df
  Rating  Rating_int
0    AAA           0
1     AA           1
2      A           2
3    BBB           3
4     BB           4
5      B           5
6    CCC           6
7      C           7
