# Train Graph NN on Call Mentions

Considering direct mentions of companies in calls, construct a network of calls

In [19]:
# Packages
from Graph_NN_Functions import *
model_name = 'exclude_previous_rating_model_1'

## Load Feature and Class Data

In [20]:
# Load feature and class data
feature_and_class_df = load_feature_and_class_data()
feature_and_class_df

Unnamed: 0,ticker,fixed_quarter_date,earnings_call_date,Rating,rating_date,Next Rating,Next Rating Date,Previous Rating,Previous Rating Date,next_rating_date_or_end_of_data,...,Ovrst,Undrst,PN,SW,AP,OU,tone,num_q_by_len,train_test_80_20,node
0,AAPL,2014-10-01,2014-07-22,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,364.0,131.0,5.518519,15.261905,2.661290,2.778626,3.188264,0.003822,train,AAPL : 2014-10-01
1,AAPL,2015-01-01,2014-10-20,AA,2014-05-27,AA,2015-02-18,AAA,2014-04-24,2015-02-18,...,465.0,152.0,5.348485,15.934783,3.296482,3.059211,3.681858,0.002766,train,AAPL : 2015-01-01
2,AAPL,2015-04-01,2015-01-27,AA,2015-02-18,AA,2015-05-28,AA,2014-05-27,2015-05-28,...,468.0,151.0,3.927711,8.113636,2.841346,3.099338,1.307366,0.004628,train,AAPL : 2015-04-01
3,AAPL,2015-07-01,2015-04-27,AA,2015-06-02,AA,2015-08-25,AA,2015-05-28,2015-08-25,...,415.0,135.0,5.250000,9.142857,2.640187,3.074074,2.025933,0.003861,train,AAPL : 2015-07-01
4,AAPL,2015-10-01,2015-07-21,AA,2015-08-25,AA,2016-05-20,AA,2015-06-02,2016-05-20,...,449.0,148.0,4.209877,10.442857,2.579909,3.033784,1.815531,0.003915,train,AAPL : 2015-10-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5504,ZTS,2015-10-01,2015-08-04,BBB,2015-01-30,BBB,2015-11-03,BBB,2014-01-31,2015-11-03,...,298.0,148.0,3.611650,15.634615,2.911215,2.013514,1.744657,0.001458,train,ZTS : 2015-10-01
5505,ZTS,2016-01-01,2015-11-03,BBB,2015-11-03,BBB,2016-01-22,BBB,2015-01-30,2016-01-22,...,395.0,222.0,3.766917,15.848101,2.791667,1.779279,1.596294,0.003859,train,ZTS : 2016-01-01
5506,ZTS,2016-04-01,2016-02-16,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,469.0,217.0,3.565517,17.506849,2.926829,2.161290,2.287146,0.003928,train,ZTS : 2016-04-01
5507,ZTS,2016-07-01,2016-05-04,BBB,2016-01-22,BBB,2016-12-23,BBB,2015-11-03,2016-12-23,...,449.0,215.0,3.572650,15.235294,3.023715,2.088372,1.739992,0.003182,train,ZTS : 2016-07-01


In [21]:
# Load column names
numeric_feature_columns, cat_feature_columns, target_column, custom_mapping = get_column_names_and_mapping(model_name)

In [22]:
# Prepare matrices
X_train_scaled, X_test_scaled, y_train, y_test, feature_names, train_ticker_by_fixed_quarter_date, test_ticker_by_fixed_quarter_date = prepare_matrices(feature_and_class_df, numeric_feature_columns, cat_feature_columns, target_column, custom_mapping)

feature names: 
['num__Altman_Z']


In [23]:
# Assemble back into dataframes

# Train and val
train_and_val_df = pd.DataFrame(X_train_scaled, columns=feature_names)
print(train_and_val_df)
# Add y_train
train_and_val_df[target_column] = y_train.reset_index(drop=True)
# Add ticker by fixed quarter date
train_and_val_df = pd.concat([train_ticker_by_fixed_quarter_date.reset_index(drop=True).sort_values(['ticker', 'fixed_quarter_date']), train_and_val_df], axis=1)
# Add node by merging with feature_and_class_df (inner join)
train_and_val_df = train_and_val_df.merge(feature_and_class_df[['ticker', 'fixed_quarter_date', 'node']], on=['ticker', 'fixed_quarter_date'], how='inner')
# Drop ticker and fixed_quarter_date
train_and_val_df = train_and_val_df.drop(['ticker', 'fixed_quarter_date'], axis=1)

# Test
test_df = pd.DataFrame(X_test_scaled, columns=feature_names)
# Add y_test
test_df[target_column] = y_test.reset_index(drop=True)
# Add ticker by fixed quarter date
test_df = pd.concat([test_ticker_by_fixed_quarter_date.reset_index(drop=True).sort_values(['ticker', 'fixed_quarter_date']), test_df], axis=1)
# Add node by merging with feature_and_class_df (inner join)
test_df = test_df.merge(feature_and_class_df[['ticker', 'fixed_quarter_date', 'node']], on=['ticker', 'fixed_quarter_date'], how='inner')
# Drop ticker and fixed_quarter_date
test_df = test_df.drop(['ticker', 'fixed_quarter_date'], axis=1)

print('finalized dfs')
print(train_and_val_df)
print(test_df)
print('missing values of target_column in train_and_val_df or test_df?')
print(train_and_val_df[target_column].isnull().sum() > 0)
print(test_df[target_column].isnull().sum() > 0)

      num__Altman_Z
0          1.901993
1          2.992192
2          1.643599
3          3.102926
4          2.982099
...             ...
4386       1.261070
4387       1.026472
4388       0.703264
4389       0.872260
4390       1.026999

[4391 rows x 1 columns]
finalized dfs
      num__Altman_Z  Rating               node
0          1.901993       1  AAPL : 2014-10-01
1          2.992192       1  AAPL : 2015-01-01
2          1.643599       1  AAPL : 2015-04-01
3          3.102926       1  AAPL : 2015-07-01
4          2.982099       1  AAPL : 2015-10-01
...             ...     ...                ...
4386       1.261070       3   ZTS : 2015-10-01
4387       1.026472       3   ZTS : 2016-01-01
4388       0.703264       3   ZTS : 2016-04-01
4389       0.872260       3   ZTS : 2016-07-01
4390       1.026999       3   ZTS : 2016-10-01

[4391 rows x 3 columns]
      num__Altman_Z  Rating               node
0          2.967006       1  AAPL : 2016-07-01
1          0.740850       2  ABBV : 20

## Load Pairwise Mentions Data

Note: it's OK if we lose observations here, because on some fixed quarter dates we don't have data for both companies in a mention link.

In [24]:
src_dst_df = load_src_dst_data()
print('num obs')
print(len(src_dst_df))
# Convert fixed_quarter_date to a string
src_dst_df['fixed_quarter_date'] = src_dst_df['fixed_quarter_date'].astype(str)
feature_and_class_df['fixed_quarter_date'] = feature_and_class_df['fixed_quarter_date'].astype(str)
# Join with feature_and_class_df to get node for src_ticker and dst_ticker
src_dst_df = src_dst_df.merge(feature_and_class_df[['ticker', 'fixed_quarter_date', 'node']], left_on=['src_ticker', 'fixed_quarter_date'], right_on=['ticker', 'fixed_quarter_date'], how='inner').rename(columns={'node': 'src_node'})
print('num obs')
print(len(src_dst_df))
src_dst_df = src_dst_df.merge(feature_and_class_df[['ticker', 'fixed_quarter_date', 'node']], left_on=['dst_ticker', 'fixed_quarter_date'], right_on=['ticker', 'fixed_quarter_date'], how='inner').rename(columns={'node': 'dst_node'})
print('num obs')
print(len(src_dst_df))
# Limit columns to just src_node and dst_node, rename to src and dst
src_dst_df = src_dst_df[['src_node', 'dst_node']].rename(columns={'src_node': 'src', 'dst_node': 'dst'})
src_dst_df

num obs
2750
num obs
2750
num obs
1807


Unnamed: 0,src,dst
0,LPX : 2016-04-01,MCO : 2016-04-01
1,SBAC : 2016-04-01,MCO : 2016-04-01
2,AR : 2016-04-01,MCO : 2016-04-01
3,ATVI : 2016-04-01,MCO : 2016-04-01
4,DVN : 2016-04-01,MCO : 2016-04-01
...,...,...
1802,THC : 2014-10-01,ACIW : 2014-10-01
1803,HOV : 2013-07-01,CNSL : 2013-07-01
1804,HOV : 2014-04-01,TOL : 2014-04-01
1805,HOV : 2015-10-01,MTH : 2015-10-01


## Edits to train and val and test dfs

In [25]:
# Limit train_and_val_df and test_df to just items with a node in one of the src or dst columns
print('keeping only items that are connected/have a node in src or dst in src_dst_df')
train_and_val_df = train_and_val_df[train_and_val_df['node'].isin(src_dst_df['src']) | train_and_val_df['node'].isin(src_dst_df['dst'])]
test_df = test_df[test_df['node'].isin(src_dst_df['src']) | test_df['node'].isin(src_dst_df['dst'])]

# Drop any items that belong to target_column values with only one node
print('drop items in classes with only one node')
print('length of train_and_val_df')
print(len(train_and_val_df))
train_and_val_df = train_and_val_df.groupby(target_column).filter(lambda x: len(x) > 1).reset_index(drop=True)
print('new length of train_and_val_df')
print(len(train_and_val_df))

# Limit src and dst to just nodes in train_and_val_df or test_df
print('keeping only src and dst that are in train_and_val_df or test_df')
print('length of src_dst_df')
print(len(src_dst_df))
src_dst_df = src_dst_df[src_dst_df['src'].isin(train_and_val_df['node']) | src_dst_df['src'].isin(test_df['node'])]
src_dst_df = src_dst_df[src_dst_df['dst'].isin(train_and_val_df['node']) | src_dst_df['dst'].isin(test_df['node'])]
print('new length of src_dst_df')
print(len(src_dst_df))

# Limit train_and_val_df and test_df to just items with a node in one of the src or dst columns
print('keeping only items that are connected/have a node in src or dst in src_dst_df again')
print('length of train_and_val_df')
print(len(train_and_val_df))
print('length of test_df')
print(len(test_df))
train_and_val_df = train_and_val_df[train_and_val_df['node'].isin(src_dst_df['src']) | train_and_val_df['node'].isin(src_dst_df['dst'])]
test_df = test_df[test_df['node'].isin(src_dst_df['src']) | test_df['node'].isin(src_dst_df['dst'])]
print('new length of train_and_val_df')
print(len(train_and_val_df))
print('new length of test_df')
print(len(test_df))

keeping only items that are connected/have a node in src or dst in src_dst_df
drop items in classes with only one node
length of train_and_val_df
1563
new length of train_and_val_df
1562
keeping only src and dst that are in train_and_val_df or test_df
length of src_dst_df
1807
new length of src_dst_df
1806
keeping only items that are connected/have a node in src or dst in src_dst_df again
length of train_and_val_df
1562
length of test_df
402
new length of train_and_val_df
1562
new length of test_df
402


## Encode Node as Number

In [26]:
# List of all values of node in train_and_val_df and test_df
all_nodes = list(set(train_and_val_df['node']) | set(test_df['node']))
# Encode as integers, create a mapping
node_to_int = {node: i for i, node in enumerate(all_nodes)}
# Add to train_and_val_df and test_df as a replacement of node
train_and_val_df['node'] = train_and_val_df['node'].map(node_to_int)
test_df['node'] = test_df['node'].map(node_to_int)
# Same for src and dst
src_dst_df['src'] = src_dst_df['src'].map(node_to_int)
src_dst_df['dst'] = src_dst_df['dst'].map(node_to_int)
# Convert dictionary of node_to_int to df
node_to_int_df = pd.DataFrame(list(node_to_int.items()), columns=['node', 'node_int'])
# Save to disk
node_to_int_df.to_excel('../../../Output/Modelling/Graph Neural Network/' + model_name + '/' + 'node_to_int.xlsx', index=False)

## Run the Inductive Model

In [27]:
run_model(train_and_val_df = train_and_val_df,
         test_df = test_df,
         src_dst_df = src_dst_df,
         model_dir = '../../../Output/Modelling/Graph Neural Network/' + model_name + '/',
         prediction_file_path = '../../../Data/Predictions/Graph Neural Network/' + model_name + '_predictions.xlsx',
         target_column = target_column,
         custom_mapping = custom_mapping,
         node_to_int = node_to_int,
         n_hidden = 32,
         n_layers = 2,
         dropout = 0.0,
         weight_decay = 5e-4,
         n_epochs = 100,
         lr = 0.01,
         aggregator_type = "pool",
         inductive = True)

Further slice the train dataset into train and validation datasets.
The training data has shape: (1249, 3).
The validation data has shape: (313, 3).
The test data has shape: (402, 3).
Generate train, validation, and test masks.
sum of train mask
tensor(1249)
sum of val mask
tensor(313)
sum of test mask
tensor(402)
Number of nodes = 1964
Number of features for each node = 1
Number of classes = 9.
Initializing Model
Initialized Model
NodeClassification(
  (gconv_model): GraphSAGEModel(
    (layers): ModuleList(
      (0): SAGEConv(
        (feat_drop): Dropout(p=0.0, inplace=False)
        (fc_pool): Linear(in_features=1, out_features=1, bias=True)
        (fc_neigh): Linear(in_features=1, out_features=32, bias=False)
        (fc_self): Linear(in_features=1, out_features=32, bias=True)
      )
      (1): SAGEConv(
        (feat_drop): Dropout(p=0.0, inplace=False)
        (fc_pool): Linear(in_features=32, out_features=32, bias=True)
        (fc_neigh): Linear(in_features=32, out_features