# Visualize Call Nodes Edges

Visualize network of company mentions

This notebook leaves things at the call level

We only consider direct connections for a pair of companies in a quarter

In [1]:
# Packages
import pandas as pd
import os
# Network stuff
import networkx as nx
from pyvis.network import Network

## Load Data

In [2]:
# Load '../../../Data/Company_Mentions/Company_Mentions_With_Ticker.xlsx'
company_mentions_with_ticker = pd.read_excel('../../../Data/Company_Mentions/Company_Mentions_With_Ticker.xlsx')
company_mentions_with_ticker

Unnamed: 0,ticker,fixed_quarter_date,company_mentioned,count,matched_ticker,Corporation_clean
0,NEE,2012-01-01,PTC,1,PTC,PTC
1,NEE,2012-04-01,MOODY'S,1,MCO,MOODY'S
2,NEE,2012-04-01,PTC,1,PTC,PTC
3,NEE,2012-07-01,ALLIANT,2,LNT,ALLIANT
4,NEE,2012-10-01,PTC,2,PTC,PTC
...,...,...,...,...,...,...
3300,KTOS,2014-10-01,NORTHROP GRUMMAN,1,NOC,NORTHROP GRUMMAN
3301,KTOS,2015-01-01,NORTHROP GRUMMAN,2,NOC,NORTHROP GRUMMAN
3302,KTOS,2015-07-01,GOOGLE,1,GOOG,GOOGLE
3303,KTOS,2015-10-01,GOOGLE,1,GOOG,GOOGLE


## Get pairwise tickers

In [3]:
pairwise_df = (company_mentions_with_ticker[['ticker', 'matched_ticker', 'fixed_quarter_date']].rename(columns={'ticker': 'ticker1', 'matched_ticker': 'ticker2'})
                                                                         .value_counts()
                                                                         .reset_index()
                                                                         .rename(columns={0: 'count'}))

pairwise_df

Unnamed: 0,ticker1,ticker2,fixed_quarter_date,count
0,NTAP,MSFT,2014-10-01,2
1,EQIX,ORCL,2015-01-01,2
2,EQIX,NVDA,2014-04-01,2
3,SWK,TGT,2016-01-01,2
4,STX,MSFT,2014-07-01,2
...,...,...,...,...
2720,HST,MCO,2013-04-01,1
2721,HSY,AMZN,2016-04-01,1
2722,HSY,MKC,2012-04-01,1
2723,HSY,MKC,2012-07-01,1


In [4]:
# Order doesn't matter!

# Iterate over rows, create sorted list of tickers
pairwise_df['sorted_tickers'] = pairwise_df[['ticker1', 'ticker2']].apply(lambda x: sorted(x), axis=1)

# Sort the rows by the sorted_tickers column
pairwise_df = pairwise_df.sort_values('sorted_tickers')

# Duplicates on sorted_tickers
#print(pairwise_df[pairwise_df.duplicated('sorted_tickers')])

# Split sorted tickers into two columns again
pairwise_df[['ticker1', 'ticker2']] = pd.DataFrame(pairwise_df['sorted_tickers'].tolist(), index=pairwise_df.index)

# Print duplicates on ticker1 and ticker2
#print(pairwise_df[pairwise_df.duplicated(['ticker1', 'ticker2'])])

# Collapse to sums of count by ticker1 and ticker2
pairwise_df = pairwise_df.groupby(['ticker1', 'ticker2', 'fixed_quarter_date']).agg({'count': 'sum'}).reset_index()

pairwise_df

Unnamed: 0,ticker1,ticker2,fixed_quarter_date,count
0,AAPL,ACIW,2015-01-01,1
1,AAPL,ADP,2012-04-01,1
2,AAPL,ADSK,2014-07-01,1
3,AAPL,ADSK,2016-01-01,1
4,AAPL,ALGT,2015-01-01,1
...,...,...,...,...
2681,TMUS,VMI,2012-04-01,1
2682,TMUS,VMI,2012-10-01,1
2683,TOL,VGR,2014-01-01,1
2684,WEC,XEL,2013-01-01,1


## Add Company Name and Sector

In [5]:
# Load rating datasets for company name

# Dataset 1
rating_data_1 = pd.read_csv(os.path.expanduser('~/Box/STAT 222 Capstone/Raw Data/Supplementary Credit Rating Data From Kaggle/corporateCreditRatingWithFinancialRatios.csv'))
print(rating_data_1.columns)
# Keep Ticker and Corporation, rename Ticker to ticker, Corporation to company
rating_data_1 = rating_data_1[['Ticker', 'Corporation']].rename(columns={'Ticker': 'ticker', 'Corporation': 'company'})
# Drop duplicates
rating_data_1.drop_duplicates(inplace=True)

# Dataset 2
rating_data_2 = pd.read_csv(os.path.expanduser('~/Box/STAT 222 Capstone/Raw Data/Credit Rating Data From Kaggle/corporate_rating.csv'))
print(rating_data_2.columns)
# Keep Symbol and Name, rename Symbol to ticker, Name to company
rating_data_2 = rating_data_2[['Symbol', 'Name']].rename(columns={'Symbol': 'ticker', 'Name': 'company'})
# Drop duplicates
rating_data_2.drop_duplicates(inplace=True)

# Stack the two datasets and take a random item for each duplicate on ticker
rating_data_names = pd.concat([rating_data_1, rating_data_2], axis=0)
rating_data_names.drop_duplicates(subset='ticker', keep='first', inplace=True)
rating_data_names


Index(['Rating Agency', 'Corporation', 'Rating', 'Rating Date', 'CIK',
       'Binary Rating', 'SIC Code', 'Sector', 'Ticker', 'Current Ratio',
       'Long-term Debt / Capital', 'Debt/Equity Ratio', 'Gross Margin',
       'Operating Margin', 'EBIT Margin', 'EBITDA Margin',
       'Pre-Tax Profit Margin', 'Net Profit Margin', 'Asset Turnover',
       'ROE - Return On Equity', 'Return On Tangible Equity',
       'ROA - Return On Assets', 'ROI - Return On Investment',
       'Operating Cash Flow Per Share', 'Free Cash Flow Per Share'],
      dtype='object')
Index(['Rating', 'Name', 'Symbol', 'Rating Agency Name', 'Date', 'Sector',
       'currentRatio', 'quickRatio', 'cashRatio', 'daysOfSalesOutstanding',
       'netProfitMargin', 'pretaxProfitMargin', 'grossProfitMargin',
       'operatingProfitMargin', 'returnOnAssets', 'returnOnCapitalEmployed',
       'returnOnEquity', 'assetTurnover', 'fixedAssetTurnover',
       'debtEquityRatio', 'debtRatio', 'effectiveTaxRate',
       'freeCashFl

Unnamed: 0,ticker,company
0,AWR,American States Water Co.
1,ADP,Automatic Data Processing Inc.
2,AVT,Avnet Inc.
3,CWT,California Water Service Co.
4,CAH,Cardinal Health Inc.
...,...,...
2019,IRS,IRSA Inversiones Y Representaciones S.A.
2020,IT,"Gartner, Inc."
2022,XPER,Xperi Holding Corporation
2023,IMO,Imperial Oil Limited


In [6]:
# Need to reload all data, just ticker and sector columns
# list of files in '../../../Data/All_Data/All_Data_Fixed_Quarter_Dates'
file_list = [f for f in os.listdir(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates') if f.endswith('.parquet')]
# read in all parquet files
sector_df = pd.concat([pd.read_parquet(r'../../../Data/All_Data/All_Data_Fixed_Quarter_Dates/' + f, columns=['ticker', 'Sector']) for f in file_list]).drop_duplicates()
sector_df

Unnamed: 0,ticker,Sector
0,AAPL,Information Technology
10,ABB,Industrials
14,ABBV,Health Care
23,ABC,Health Care
46,ABG,Consumer Discretionary
...,...,...
772,XRAY,Health Care
794,XYL,Industrials
795,YUM,Consumer Discretionary
815,ZBRA,Information Technology


In [7]:
# Inner join rating_data_names and sector_df on ticker
rating_data_names_sector = rating_data_names.merge(sector_df, on='ticker', how='inner')
rating_data_names_sector

Unnamed: 0,ticker,company,Sector
0,AWR,American States Water Co.,Utilities
1,ADP,Automatic Data Processing Inc.,Information Technology
2,AVT,Avnet Inc.,Information Technology
3,CWT,California Water Service Co.,Utilities
4,CAH,Cardinal Health Inc.,Health Care
...,...,...,...
494,RFP,Resolute Forest Products Inc.,Materials
495,ALKS,Alkermes plc,Health Care
496,EA,Electronic Arts Inc.,Communication Services
497,MX,MagnaChip Semiconductor Corporation,Information Technology


In [8]:
# Left join on ticker1
pairwise_df = pairwise_df.merge(rating_data_names_sector, left_on='ticker1', right_on='ticker', how='left')
pairwise_df.drop(columns='ticker', inplace=True)
pairwise_df.rename(columns={'company': 'company1', 'Sector': 'Sector1'}, inplace=True)
# Left join on ticker2
pairwise_df = pairwise_df.merge(rating_data_names_sector, left_on='ticker2', right_on='ticker', how='left')
pairwise_df.drop(columns='ticker', inplace=True)
pairwise_df.rename(columns={'company': 'company2', 'Sector': 'Sector2'}, inplace=True)
# Put variable names containing 1 first, then 2, then fixed_quarter_date and count
pairwise_df = pairwise_df[['ticker1', 'company1', 'Sector1', 'ticker2', 'company2', 'Sector2', 'fixed_quarter_date', 'count']]
# Number of companies
print('number companies')
print(len(set(list(pairwise_df['ticker1'].unique()) + list(pairwise_df['ticker2'].unique()))))
# Number of calls (count of concatenated tickers and fixed_quarter_date)
print('number calls')
print(len(set(list(pairwise_df['ticker1'] + pairwise_df['fixed_quarter_date']) + list(pairwise_df['ticker2'] + pairwise_df['fixed_quarter_date']))))
pairwise_df

number companies
386
number calls
2783


Unnamed: 0,ticker1,company1,Sector1,ticker2,company2,Sector2,fixed_quarter_date,count
0,AAPL,Apple Inc.,Information Technology,ACIW,ACI Worldwide Inc.,Information Technology,2015-01-01,1
1,AAPL,Apple Inc.,Information Technology,ADP,Automatic Data Processing Inc.,Information Technology,2012-04-01,1
2,AAPL,Apple Inc.,Information Technology,ADSK,"Autodesk, Inc.",Information Technology,2014-07-01,1
3,AAPL,Apple Inc.,Information Technology,ADSK,"Autodesk, Inc.",Information Technology,2016-01-01,1
4,AAPL,Apple Inc.,Information Technology,ALGT,Allegiant Travel Company,Industrials,2015-01-01,1
...,...,...,...,...,...,...,...,...
2681,TMUS,T-Mobile US Inc.,Communication Services,VMI,Valmont Industries Inc.,Industrials,2012-04-01,1
2682,TMUS,T-Mobile US Inc.,Communication Services,VMI,Valmont Industries Inc.,Industrials,2012-10-01,1
2683,TOL,Toll Brothers Inc.,Consumer Discretionary,VGR,Vector Ltd.,Consumer Staples,2014-01-01,1
2684,WEC,WEC Energy Group Inc.,Utilities,XEL,Xcel Energy Inc.,Utilities,2013-01-01,1


## Plotting

In [9]:
# Keep company1, company2, fixed_quarter_date, cosine_similarity
nw_cols = pairwise_df[['company1', 'company2', 'ticker1', 'ticker2', 'fixed_quarter_date', 'count']]
# Where missing, fill company1 with ticker1 and company2 with ticker2
nw_cols['company1'].fillna(nw_cols['ticker1'], inplace=True)
nw_cols['company2'].fillna(nw_cols['ticker2'], inplace=True)
# Concatenate fixed_quarter_date with company1 and company2
nw_cols['company1'] = nw_cols['company1'] + ':' + nw_cols['fixed_quarter_date']
nw_cols['company2'] = nw_cols['company2'] + ':' + nw_cols['fixed_quarter_date']
# Rename to 'Source', 'Target', 'Weight'
nw_cols.rename(columns={'company1': 'Source', 'company2': 'Target', 'count': 'weight'}, inplace=True)
# Add column 'Type' with value 'Undirected'
nw_cols['Type'] = 'Undirected'
# Limit to just node cols
#nw_cols = nw_cols[['Source', 'Target']]
print(nw_cols)
# Load as graph
G = nx.from_pandas_edgelist(nw_cols, 
                            source='Source', 
                            target='Target', 
                            edge_attr='weight'
                            )

                                Source  \
0                Apple Inc.:2015-01-01   
1                Apple Inc.:2012-04-01   
2                Apple Inc.:2014-07-01   
3                Apple Inc.:2016-01-01   
4                Apple Inc.:2015-01-01   
...                                ...   
2681       T-Mobile US Inc.:2012-04-01   
2682       T-Mobile US Inc.:2012-10-01   
2683     Toll Brothers Inc.:2014-01-01   
2684  WEC Energy Group Inc.:2013-01-01   
2685  WEC Energy Group Inc.:2014-01-01   

                                         Target ticker1 ticker2  \
0                 ACI Worldwide Inc.:2015-01-01    AAPL    ACIW   
1     Automatic Data Processing Inc.:2012-04-01    AAPL     ADP   
2                     Autodesk, Inc.:2014-07-01    AAPL    ADSK   
3                     Autodesk, Inc.:2016-01-01    AAPL    ADSK   
4           Allegiant Travel Company:2015-01-01    AAPL    ALGT   
...                                         ...     ...     ...   
2681         Valmont Indus

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nw_cols['company1'].fillna(nw_cols['ticker1'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nw_cols['company2'].fillna(nw_cols['ticker2'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nw_cols['company1'] = nw_cols['company1'] + ':' + nw_cols['fixed_quarter_date']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

In [10]:
# Pyvis network
pvn = Network(notebook=True)
pvn.from_nx(G)
#pvn.toggle_physics(True) # turn off movement
# add physics controls
pvn.show_buttons(filter_=['physics'])
# use repulsion
pvn.repulsion(node_distance=100)
pvn.show('../../../Output/Company Mentions Network/Call_Company_Mentions_Network_Direct.html')

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
