In [47]:
import pandas as pd
from sqlalchemy import create_engine
import statsmodels.api as sm
from linearmodels.panel import PanelOLS

In [3]:
# Connect to the local SQLite database that holds the source tables.
engine = create_engine("sqlite:///GDS.db")

# Emissions table; 'Year' is cast to int so it can be used as an integer merge key.
df_emissions = (
    pd.read_sql_table('Emissions_Data_Joined', engine)
    .astype({'Year': int})
)

# Sector lookup and financial data tables, loaded as-is.
df_sectors = pd.read_sql_table('Sectors', engine)
df_fin = pd.read_sql_table('FinancialsTableFinal', engine)

In [4]:
# Build the integer 'Year' merge key from the first four characters of 'Date',
# shifted forward by one, then discard 'Date'.
# NOTE(review): the +1 presumably pairs each year's financials with the
# following year's emissions (a one-year lag) — confirm this is intended.
# The guard makes the cell safe to re-run: once 'Date' has been dropped, a
# second execution would otherwise raise KeyError.
if 'Date' in df_fin.columns:
    df_fin = (
        df_fin
        .assign(Year=df_fin['Date'].str[:4].astype(int) + 1)
        .drop(columns='Date')
    )

In [23]:
# Assemble the panel: attach each company's sector to its emissions rows, join
# the financials on company and year, and index by (Company Name, Year).
# Only observations with a non-missing 'Capital Expenditures' value are kept;
# other columns are not checked for missing values here.
df_panel = (
    df_emissions
    .merge(df_sectors, on='Company Name')
    .merge(df_fin, on=['Company Name', 'Year'])
    .set_index(['Company Name', 'Year'])
    .dropna(subset=['Capital Expenditures'])
)

In [24]:
# Collapse the sector labels into two groups: the sectors listed in `primary`
# become 'Primary', everything else becomes 'Tertiary'.
primary = ['Industrials', 'Materials', 'Utilities']
is_primary_sector = df_panel['Industry'].isin(primary)
df_panel['Industry'] = is_primary_sector.map({True: 'Primary', False: 'Tertiary'})

# Check how many observations fall in each group.
df_panel['Industry'].value_counts()

Tertiary    239
Primary     127
Name: Industry, dtype: int64

In [25]:
# Replace the two-level 'Industry' label with a single indicator column.
# drop_first=True removes the first level, so only 'Industry_Tertiary' remains
# and 'Primary' acts as the baseline group.
df_panel = pd.get_dummies(
    data=df_panel,
    columns=['Industry'],
    drop_first=True,
)

In [26]:
# Display the assembled panel (indexed by Company Name and Year) to inspect the merged columns.
df_panel

Unnamed: 0_level_0,Unnamed: 1_level_0,Scope_1_emissions_(tCO₂e),Scope_2_emissions_(tCO₂e)_Location,Scope_2_emissions_(tCO₂e)_Market,Revenue,"Net Profit Margin, (%)",Current Ratio,ROE,Debt to Equity,Capital Expenditures,Industry_Tertiary
Company Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
37 Interactive Entertainment,2018,-,-,-,9.512055e+08,0.261856,2.60977,0.230958,0.105192,0.111223,1
37 Interactive Entertainment,2019,129,2701,-,1.110127e+09,0.132130,2.09920,0.168853,0.113814,0.038649,1
37 Interactive Entertainment,2020,169,2395,-,1.899959e+09,0.159881,2.44392,0.300850,0.082679,0.036295,1
37 Interactive Entertainment,2021,152,1794,-,2.206851e+09,0.191737,1.28937,0.457309,0.212455,0.101443,1
Accor,2018,280706,1829278,-,3.327695e+09,0.123288,1.22390,0.062845,0.552187,0.041970,1
...,...,...,...,...,...,...,...,...,...,...,...
Xylem Inc,2021,47707,44569,18214,4.876000e+09,0.052092,1.80112,0.085580,1.039084,0.020914,0
"Yum! Brands, Inc.",2018,48047,155439,159403,5.878000e+09,0.301803,1.65807,-0.280076,-1.547837,0.059876,1
"Yum! Brands, Inc.",2019,23545,140341,144313,5.688000e+09,0.259494,0.92775,-0.186223,-1.270754,0.056659,1
"Yum! Brands, Inc.",2020,38907,128129,130953,5.597000e+09,0.231195,0.99091,-0.161427,-1.317615,0.037469,1


We want to separate out each of the emission scopes so that each model can use every observation that has a value for that scope.
i.e. a company that reports Scope 1 emissions but no Scope 2 location-based emissions should still be included in our model for Scope 1 emissions. This would not be possible if we simply dropped such rows from the main DataFrame.

In [72]:
# The three emissions columns are the dependent variables; every other column
# of df_panel is a regressor. Selecting by name instead of by position
# (columns[3:]) keeps x_vars correct even if the column order changes, and
# raises KeyError if an expected emissions column is missing.
emission_cols = [
    'Scope_1_emissions_(tCO₂e)',
    'Scope_2_emissions_(tCO₂e)_Location',
    'Scope_2_emissions_(tCO₂e)_Market',
]
x_vars = df_panel.columns.drop(emission_cols)

# One frame per scope: the regressors followed by that scope's emissions column.
# Plain column selection replaces the index self-merge, which yields the same
# frame for a unique index but would multiply rows if a (Company Name, Year)
# pair were ever duplicated.
df_scope1 = df_panel[[*x_vars, 'Scope_1_emissions_(tCO₂e)']]
df_scope2_loc = df_panel[[*x_vars, 'Scope_2_emissions_(tCO₂e)_Location']]
df_scope2_mkt = df_panel[[*x_vars, 'Scope_2_emissions_(tCO₂e)_Market']]

In [75]:
# For each scope, keep only the observations that report a value for that
# scope ('-' marks a missing figure), then cast every column to float.
has_scope1 = df_scope1['Scope_1_emissions_(tCO₂e)'].ne('-')
has_scope2_loc = df_scope2_loc['Scope_2_emissions_(tCO₂e)_Location'].ne('-')
has_scope2_mkt = df_scope2_mkt['Scope_2_emissions_(tCO₂e)_Market'].ne('-')

df_scope1 = df_scope1[has_scope1].astype(float)
df_scope2_loc = df_scope2_loc[has_scope2_loc].astype(float)
df_scope2_mkt = df_scope2_mkt[has_scope2_mkt].astype(float)

In [78]:
# Scope 1 model: regress Scope 1 emissions on the regressors in x_vars plus an
# intercept, with time effects enabled.
# NOTE(review): the fit uses the default (unadjusted) covariance estimator —
# confirm whether clustered standard errors are wanted for this panel.
y = df_scope1['Scope_1_emissions_(tCO₂e)'].to_frame()
x = sm.add_constant(df_scope1.loc[:, x_vars])

model1 = PanelOLS(y, x, time_effects=True)
res1 = model1.fit()
res1.summary

0,1,2,3
Dep. Variable:,Scope_1_emissions_(tCO₂e),R-squared:,0.1806
Estimator:,PanelOLS,R-squared (Between):,0.1943
No. Observations:,354,R-squared (Within):,-0.5604
Date:,"Wed, Jun 21 2023",R-squared (Overall):,0.1803
Time:,22:06:42,Log-likelihood,-6041.8
Cov. Estimator:,Unadjusted,,
,,F-statistic:,10.803
Entities:,92,P-value,0.0000
Avg Obs:,3.8478,Distribution:,"F(7,343)"
Min Obs:,2.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,4.759e+06,9.194e+05,5.1757,0.0000,2.95e+06,6.567e+06
Revenue,2.1e-05,7.81e-06,2.6889,0.0075,5.639e-06,3.636e-05
"Net Profit Margin, (%)",-2.255e+06,2.016e+06,-1.1186,0.2641,-6.22e+06,1.71e+06
Current Ratio,-3.947e+05,3.207e+05,-1.2306,0.2193,-1.025e+06,2.361e+05
ROE,-7.467e+05,3.805e+05,-1.9622,0.0505,-1.495e+06,1785.5
Debt to Equity,1.34e+05,6.746e+04,1.9861,0.0478,1293.2,2.667e+05
Capital Expenditures,3.027e+07,1.301e+07,2.3263,0.0206,4.676e+06,5.586e+07
Industry_Tertiary,-5.397e+06,7.344e+05,-7.3485,0.0000,-6.842e+06,-3.952e+06


In [81]:
# Scope 2 (location-based) model: regress location-based Scope 2 emissions on
# the regressors in x_vars plus an intercept, with time effects enabled.
# NOTE(review): the fit uses the default (unadjusted) covariance estimator —
# confirm whether clustered standard errors are wanted for this panel.
y = df_scope2_loc['Scope_2_emissions_(tCO₂e)_Location'].to_frame()
x = sm.add_constant(df_scope2_loc.loc[:, x_vars])

model2_loc = PanelOLS(y, x, time_effects=True)
res2_loc = model2_loc.fit()
res2_loc.summary

0,1,2,3
Dep. Variable:,Scope_2_emissions_(tCO₂e)_Location,R-squared:,0.1739
Estimator:,PanelOLS,R-squared (Between):,0.2022
No. Observations:,349,R-squared (Within):,-0.1434
Date:,"Wed, Jun 21 2023",R-squared (Overall):,0.1679
Time:,22:09:52,Log-likelihood,-5380.1
Cov. Estimator:,Unadjusted,,
,,F-statistic:,10.168
Entities:,92,P-value,0.0000
Avg Obs:,3.7935,Distribution:,"F(7,338)"
Min Obs:,2.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,5.68e+05,1.771e+05,3.2063,0.0015,2.195e+05,9.164e+05
Revenue,1.146e-05,1.561e-06,7.3395,0.0000,8.386e-06,1.453e-05
"Net Profit Margin, (%)",-5.711e+05,3.871e+05,-1.4754,0.1410,-1.333e+06,1.903e+05
Current Ratio,-6.073e+04,6.151e+04,-0.9873,0.3242,-1.817e+05,6.026e+04
ROE,-1.939e+05,7.415e+04,-2.6157,0.0093,-3.398e+05,-4.81e+04
Debt to Equity,3.723e+04,1.319e+04,2.8235,0.0050,1.129e+04,6.317e+04
Capital Expenditures,2.441e+06,2.486e+06,0.9820,0.3268,-2.449e+06,7.331e+06
Industry_Tertiary,-1.495e+05,1.42e+05,-1.0525,0.2933,-4.288e+05,1.299e+05


In [80]:
# Scope 2 (market-based) model: regress market-based Scope 2 emissions on the
# regressors in x_vars plus an intercept, with time effects enabled.
# NOTE(review): the fit uses the default (unadjusted) covariance estimator —
# confirm whether clustered standard errors are wanted for this panel.
y = df_scope2_mkt['Scope_2_emissions_(tCO₂e)_Market'].to_frame()
x = sm.add_constant(df_scope2_mkt.loc[:, x_vars])

model2_mkt = PanelOLS(y, x, time_effects=True)
res2_mkt = model2_mkt.fit()
res2_mkt.summary

0,1,2,3
Dep. Variable:,Scope_2_emissions_(tCO₂e)_Market,R-squared:,0.1281
Estimator:,PanelOLS,R-squared (Between):,0.1452
No. Observations:,322,R-squared (Within):,-0.7138
Date:,"Wed, Jun 21 2023",R-squared (Overall):,0.1267
Time:,22:09:00,Log-likelihood,-4921.5
Cov. Estimator:,Unadjusted,,
,,F-statistic:,6.5269
Entities:,91,P-value,0.0000
Avg Obs:,3.5385,Distribution:,"F(7,311)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,6.555e+05,1.624e+05,4.0355,0.0001,3.359e+05,9.751e+05
Revenue,7.256e-06,1.516e-06,4.7876,0.0000,4.274e-06,1.024e-05
"Net Profit Margin, (%)",-4.581e+05,3.444e+05,-1.3301,0.1845,-1.136e+06,2.196e+05
Current Ratio,-3.941e+04,5.503e+04,-0.7163,0.4743,-1.477e+05,6.886e+04
ROE,-1.749e+05,6.751e+04,-2.5909,0.0100,-3.077e+05,-4.207e+04
Debt to Equity,3.264e+04,1.201e+04,2.7168,0.0070,9000.7,5.628e+04
Capital Expenditures,1.914e+06,2.453e+06,0.7803,0.4358,-2.912e+06,6.74e+06
Industry_Tertiary,-4.09e+05,1.281e+05,-3.1942,0.0015,-6.61e+05,-1.571e+05
