# Financial Econometrics II: Final Project

*By Basri Satiroglu, Daniel Deutsch, Dimitry Budarin, and José Lucas Barretto*

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
from linearmodels import PanelOLS, PooledOLS, RandomEffects
from statsmodels.sandbox.regression.gmm import IV2SLS

In [2]:
# Ignore warnings
warnings.filterwarnings('ignore')

# Matplotlib styles
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (15, 7),
    'axes.prop_cycle': plt.cycler(color=['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']),
    'axes.facecolor': '#EAEAF2'
})

# Section 1

## Question 1 and 2

In [3]:
try:

    # Loads the processed dataset
    df = pd.read_csv("./datasets/Diane-data.csv.gz", index_col=0)

except:

    # Loads the raw dataset
    df = pd.read_stata("./datasets/Diane-data.dta")

    # Filters the dataframe to consider only our group's department
    df = df[df['départementcode'] == 33]

    # Selects only useful columns
    default_cols = [
        'marketable_sec', 'trans', 'nettangible', 'land', 'totalDebt', 'netsales', 'year', 'man',
        'taxes', 'netincome', 'totalfixedassets', 'elec_eau', 'plantAequip', 'cons', 'creditors', 'tradedebts',
        'totalassets', 'trade', 'siren', 'ict', 'companyname', 'cashNbank', 'otherdebts', 'real',
        'inventory', 'agr', 'buildings'
    ]
    df = df[default_cols]

    # Removes incomplete rows
    df.dropna(subset=default_cols, inplace=True)

    # Saves the processed dataset
    df.to_csv("./datasets/Diane-data.csv.gz")


## Question 3

In [4]:
# Filters the dataframe to samples that respects the mask's constraints
mask = (df['cashNbank'] + df['marketable_sec'] >= 0) & (df['totalassets'] >= 1)
df = df[mask]

# Computes the ratio and its lag
df['cash_mktsec_ratio'] = ((df['cashNbank'] + df['marketable_sec']) / df['totalassets'])
df['cash_mktsec_ratio_lag'] = df.groupby('siren')['cash_mktsec_ratio'].shift()

# Checks that 1st row in the lagged column is Nan
df[['cash_mktsec_ratio', 'cash_mktsec_ratio_lag']]

Unnamed: 0,cash_mktsec_ratio,cash_mktsec_ratio_lag
5630,0.023810,
5631,0.483010,0.023810
5632,0.252500,0.483010
5633,0.000000,0.252500
11510,0.020430,
...,...,...
4772870,0.077701,0.000558
4772871,0.080455,0.077701
4772872,0.278290,0.080455
4779584,0.217247,


## Question 4

### Computation of Control Variables

In [5]:
# Compute auxiliary variables
df['industry'] = df[['man','trade', 'elec_eau', 'agr', 'trans', 'ict', 'real', 'cons']].idxmax(axis=1)

# Compute main variables
df['CASH(i, t)'] = df['cashNbank']
df['GROWTH(i, t)'] = df['netsales'].pct_change()
df['SIZE(i, t)'] = np.log(df['totalassets'])
df['CFLOW(i, t)'] = df['netincome'] / df['totalassets']
df['NWC(i, t)'] = (df['inventory'] + df['creditors'] - df['tradedebts'] - df['taxes'] - df['otherdebts']) / df['totalassets']
df['LEV(i, t)'] = df['totalDebt'] / df['totalassets']
df['INDUSTRYRISK(i, t)'] = df.groupby('industry')['CFLOW(i, t)'].transform('std')
df['INCASSET(i, t)'] = (df['totalfixedassets'] - df['totalfixedassets'].shift()) / (df['totalassets'] - df['totalassets'].shift())
df['INDUSTRYDEVIATION(i, t)'] = np.abs(df['CASH(i, t)'] - df.groupby('year')['CASH(i, t)'].transform('mean'))

# Compute dummy variables
df['DHIGHGROWTH(i, t)'] = (df['GROWTH(i, t)'] > df.groupby('year')['GROWTH(i, t)'].transform('median')).astype(int)
df['DHIGHINCASSET(i, t)'] = (df['INCASSET(i, t)'] > df.groupby('year')['INCASSET(i, t)'].transform('median')).astype(int)
df['DSMALL(i, t)'] = (df['SIZE(i, t)'] < df.groupby('year')['SIZE(i, t)'].transform('median')).astype(int)
df['DLOWCFLOW(i, t)'] = (df['CFLOW(i, t)'] < df.groupby('year')['CFLOW(i, t)'].transform('median')).astype(int)
# df['DCOV(i, t)'] = 
# df['DZSCORE(i, t)'] = 

# Compute shifted variables
df['CASH(i, t-1)'] = df.groupby('siren')['CASH(i, t)'].shift()
df['GROWTH(i, t-1)'] = df.groupby('siren')['GROWTH(i, t)'].shift()
df['SIZE(i, t-1)'] = df.groupby('siren')['SIZE(i, t)'].shift()
df['CFLOW(i, t-1)'] = df.groupby('siren')['CFLOW(i, t)'].shift()
df['NWC(i, t-1)'] = df.groupby('siren')['NWC(i, t)'].shift()
df['LEV(i, t-1)'] = df.groupby('siren')['LEV(i, t)'].shift()
df['INDUSTRYRISK(i, t-1)'] = df.groupby('siren')['INDUSTRYRISK(i, t)'].shift()
df['INCASSET(i, t-1)'] = df.groupby('siren')['INCASSET(i, t)'].shift()
df['INDUSTRYDEVIATION(i, t-1)'] = df.groupby('siren')['INDUSTRYDEVIATION(i, t)'].shift()

# Compute first differences
df['CASH(i, t) - CASH(i, t-1)'] = df.groupby('siren')['CASH(i, t)'].diff()
df['GROWTH(i, t) - GROWTH(i, t-1)'] = df.groupby('siren')['GROWTH(i, t)'].diff()
df['SIZE(i, t) - SIZE(i, t-1)'] = df.groupby('siren')['SIZE(i, t)'].diff()
df['CFLOW(i, t) - CFLOW(i, t-1)'] = df.groupby('siren')['CFLOW(i, t)'].diff()
df['NWC(i, t) - NWC(i, t-1)'] = df.groupby('siren')['NWC(i, t)'].diff()
df['LEV(i, t) - LEV(i, t-1)'] = df.groupby('siren')['LEV(i, t)'].diff()
# df['INDUSTRYRISK(i, t) - INDUSTRYRISK(i, t-1)'] = df.groupby('siren')['INDUSTRYRISK(i, t)'].diff()
df['INCASSET(i, t) - INCASSET(i, t-1)'] = df.groupby('siren')['INCASSET(i, t)'].diff()
df['INDUSTRYDEVIATION(i, t) - INDUSTRYDEVIATION(i, t-1)'] = df.groupby('siren')['INDUSTRYDEVIATION(i, t)'].diff()

# Compute compound variables
df['CASH(i, t-1) * DHIGHGROWTH(i, t)'] = df['CASH(i, t-1)'] * df['DHIGHGROWTH(i, t)']
df['CASH(i, t-1) * DHIGHINCASSET(i, t)'] = df['CASH(i, t-1)'] * df['DHIGHINCASSET(i, t)']
df['CASH(i, t-1) * DSMALL(i, t)'] = df['CASH(i, t-1)'] * df['DSMALL(i, t)']
df['CASH(i, t-1) * DLOWCFLOW(i, t)'] = df['CASH(i, t-1)'] * df['DLOWCFLOW(i, t)']
# df['CASH(i, t-1) * DCOV(i, t)'] = df['CASH(i, t-1)'] * df['DCOV(i, t)']
# df['CASH(i, t-1) * DZSCORE(i, t)'] = df['CASH(i, t-1)'] * df['DZSCORE(i, t)']

# Creates arrays with column names
main_cols = ['CASH(i, t)', 'GROWTH(i, t)', 'SIZE(i, t)', 'CFLOW(i, t)', 'NWC(i, t)', 'LEV(i, t)', 'INDUSTRYRISK(i, t)', 'INCASSET(i, t)', 'INDUSTRYDEVIATION(i, t)']
dummy_cols = ['DHIGHGROWTH(i, t)', 'DHIGHINCASSET(i, t)', 'DSMALL(i, t)']# , 'DCOV(i, t)', 'DZSCORE(i, t)', 'DZSCORE(i, t)']
shifted_cols = ['CASH(i, t-1)', 'GROWTH(i, t-1)', 'SIZE(i, t-1)', 'CFLOW(i, t-1)', 'NWC(i, t-1)', 'LEV(i, t-1)', 'INCASSET(i, t-1)', 'INDUSTRYDEVIATION(i, t-1)'] # ,'INDUSTRYRISK(i, t-1)']
diff_cols = ['CASH(i, t) - CASH(i, t-1)', 'GROWTH(i, t) - GROWTH(i, t-1)', 'SIZE(i, t) - SIZE(i, t-1)', 'CFLOW(i, t) - CFLOW(i, t-1)', 'NWC(i, t) - NWC(i, t-1)', 'LEV(i, t) - LEV(i, t-1)', 'INCASSET(i, t) - INCASSET(i, t-1)', 'INDUSTRYDEVIATION(i, t) - INDUSTRYDEVIATION(i, t-1)'] # ,'INDUSTRYRISK(i, t) - INDUSTRYRISK(i, t-1)'
compound_cols = ['CASH(i, t-1) * DHIGHGROWTH(i, t)', 'CASH(i, t-1) * DHIGHINCASSET(i, t)', 'CASH(i, t-1) * DSMALL(i, t)', 'CASH(i, t-1) * DLOWCFLOW(i, t)'] # , 'CASH(i, t-1) * DCOV(i, t)', 'CASH(i, t-1) * DZSCORE(i, t)']

# Defines the indexes of the dataframe
df.set_index(['siren', 'year'], drop=True, inplace=True)

# Removes incomplete rows
df.dropna(subset=main_cols, inplace=True) 

### Remotion of Extreme Variables

In [6]:
# Removes samples that have outliers in the main_cols
for col in main_cols:
    df = df[(df[col] > df[col].quantile(0.1)) & (df[col] < df[col].quantile(0.9))]

### Computation of Between Transformed Variables

In [7]:
df_between = df.groupby('siren').mean()

### Computation of Within Transformed Variables

In [8]:
df_within = df.subtract(df.groupby('siren').mean(), level=0)

## Question 5

In [9]:
table = df.groupby('year')['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])
table.loc['2014-2018'] = df['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])

table.rename(
    columns={
        'count': 'Obs',
        'mean': 'Mean',
        'std': 'Std. Dev.',
        '10%': 'Perc. 10',
        '50%': 'Median',
        '90%': 'Perc. 90',
    },
    inplace=True
)
table.index.names = ['Year']
table['Obs'] = table['Obs'].astype('int')
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014,288,0.293157,0.216061,0.252151,0.042623,0.605042
2015,469,0.275916,0.217205,0.230321,0.036101,0.61248
2016,368,0.280992,0.205267,0.247047,0.051235,0.567341
2017,208,0.346032,0.205195,0.320289,0.111566,0.627127
2018,62,0.393094,0.202227,0.349461,0.127805,0.716152
2014-2018,1395,0.296477,0.213481,0.258438,0.047986,0.608643


# Section 2

## Question 6

### Cash Holdings per Year for Within Transform

In [10]:
table = df_within.groupby('year')['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])
table.loc['2014-2018'] = df_within['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])

table.rename(
    columns={
        'count': 'Obs',
        'mean': 'Mean',
        'std': 'Std. Dev.',
        '10%': 'Perc. 10',
        '50%': 'Median',
        '90%': 'Perc. 90',
        },
    inplace=True
)
table.index.names = ['Year']
table['Obs'] = table['Obs'].astype('int')
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014,288,-0.005648478,0.047739,0.0,-0.054305,0.033929
2015,469,-0.00332353,0.044175,0.0,-0.056699,0.045603
2016,368,0.001344324,0.048057,0.0,-0.053369,0.059827
2017,208,0.009771736,0.053078,0.0,-0.050367,0.079889
2018,62,0.01061718,0.057173,0.0,-0.046406,0.093943
2014-2018,1395,5.421788e-19,0.048206,0.0,-0.053066,0.053567


### Cash Holdings per Year for First Difference

In [11]:
table = (df.groupby('siren')['cash_mktsec_ratio'].diff()).groupby('year').describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])
table.loc['2014-2018'] = (df.groupby('siren')['cash_mktsec_ratio'].diff()).describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])

table.rename(
    columns={
        'count': 'Obs',
        'mean': 'Mean',
        'std': 'Std. Dev.',
        '10%': 'Perc. 10',
        '50%': 'Median',
        '90%': 'Perc. 90',
    },
    inplace=True
)
table.index.names = ['Year']
table['Obs'] = table['Obs'].astype('int')
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014,0,,,,,
2015,118,0.017735,0.114325,0.015825,-0.103905,0.154232
2016,189,0.017879,0.106731,0.009758,-0.112251,0.14012
2017,138,0.018878,0.113285,0.00962,-0.12461,0.180707
2018,44,0.018467,0.100064,0.003533,-0.078594,0.157616
2014-2018,489,0.018179,0.109579,0.009931,-0.109074,0.156413


## Question 7

In [12]:
stats = df[main_cols].replace([np.inf, -np.inf], np.nan).dropna()
table = stats.describe(percentiles=[0.1, 0.9]).T.drop(columns=['min', 'max'])
table.index.names = ['Year']
table.rename(
    columns={
        'count': 'Obs',
        'mean': 'Mean',
        'std': 'Std. Dev.',
        '10%': 'Perc. 10',
        '50%': 'Median',
        '90%': 'Perc. 90',
    },
    inplace=True
)
table.index.names = ['Year']
table['Obs'] = table['Obs'].astype('int')
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CASH(i, t)",1395,139.850179,122.118007,106.0,20.0,310.6
"GROWTH(i, t)",1395,0.179834,0.6858,0.034091,-0.463229,0.858737
"SIZE(i, t)",1395,6.298585,0.8602,6.291569,5.157889,7.498389
"CFLOW(i, t)",1395,0.073262,0.052648,0.068364,0.006566,0.148475
"NWC(i, t)",1395,-0.224012,0.21704,-0.241313,-0.49497,0.049416
"LEV(i, t)",1395,0.530644,0.175086,0.527888,0.294101,0.781403
"INDUSTRYRISK(i, t)",1395,0.341448,0.194675,0.241195,0.241195,0.828019
"INCASSET(i, t)",1395,0.234103,0.412191,0.071429,-0.166667,0.916998
"INDUSTRYDEVIATION(i, t)",1395,294.954013,100.720607,291.028589,157.028589,445.599472


## Question 8

### Descriptive Stats for Within Transform

In [13]:
stats = df_within[main_cols].replace([np.inf, -np.inf], np.nan).dropna()
table = stats.describe(percentiles=[0.1, 0.9]).T.drop(columns=['min', 'max'])
table.index.names = ['Year']
table.rename(
    columns={
        'count': 'Obs',
        'mean': 'Mean',
        'std': 'Std. Dev.',
        '10%': 'Perc. 10',
        '50%': 'Median',
        '90%': 'Perc. 90',
    },
    inplace=True
)
table.index.names = ['Year']
table['Obs'] = table['Obs'].astype('int')
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CASH(i, t)",1395,1.629919e-16,41.82737,0.0,-41.66667,42.933333
"GROWTH(i, t)",1395,2.347783e-18,0.3581817,0.0,-0.3267299,0.293929
"SIZE(i, t)",1395,1.273374e-18,0.1130667,0.0,-0.1122427,0.109944
"CFLOW(i, t)",1395,-3.7803289999999997e-19,0.02346534,0.0,-0.02561462,0.025615
"NWC(i, t)",1395,4.974117e-21,0.06736211,0.0,-0.07111062,0.070479
"LEV(i, t)",1395,-1.293271e-18,0.03848215,0.0,-0.04429721,0.044297
"INDUSTRYRISK(i, t)",1395,-2.208508e-18,1.793672e-17,0.0,-2.775558e-17,0.0
"INCASSET(i, t)",1395,-8.157552999999999e-19,0.2219395,0.0,-0.2361498,0.233433
"INDUSTRYDEVIATION(i, t)",1395,-9.575773e-16,51.26249,0.0,-68.14088,70.119441


### Descriptive Stats for Between Transform

In [14]:
stats = df_between[main_cols].replace([np.inf, -np.inf], np.nan).dropna()
table = stats.describe(percentiles=[0.1, 0.9]).T.drop(columns=['min', 'max'])
table.index.names = ['Year']
table.rename(
    columns={
        'count': 'Obs',
        'mean': 'Mean',
        'std': 'Std. Dev.',
        '10%': 'Perc. 10',
        '50%': 'Median',
        '90%': 'Perc. 90',
    },
    inplace=True
)
table.index.names = ['Year']
table['Obs'] = table['Obs'].astype('int')
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CASH(i, t)",906,135.574411,121.029585,101.333333,19.0,289.0
"GROWTH(i, t)",906,0.19901,0.669611,0.037802,-0.422815,1.004139
"SIZE(i, t)",906,6.25642,0.89321,6.235915,5.075174,7.528063
"CFLOW(i, t)",906,0.07384,0.051249,0.070018,0.008566,0.146823
"NWC(i, t)",906,-0.225339,0.215935,-0.252466,-0.493459,0.05059
"LEV(i, t)",906,0.533697,0.174928,0.529451,0.296761,0.784745
"INDUSTRYRISK(i, t)",906,0.346059,0.200809,0.241195,0.241195,0.828019
"INCASSET(i, t)",906,0.247416,0.382202,0.117988,-0.117621,0.802895
"INDUSTRYDEVIATION(i, t)",906,295.129329,92.754239,295.028589,165.528589,429.231213


### Descriptive Stats for First Difference

In [15]:
stats = df[main_cols].diff().replace([np.inf, -np.inf], np.nan).dropna()
table = stats.describe(percentiles=[0.1, 0.9]).T.drop(columns=['min', 'max'])
table.index.names = ['Year']
table.rename(
    columns={
        'count': 'Obs',
        'mean': 'Mean',
        'std': 'Std. Dev.',
        '10%': 'Perc. 10',
        '50%': 'Median',
        '90%': 'Perc. 90',
    },
    inplace=True
)
table.index.names = ['Year']
table['Obs'] = table['Obs'].astype('int')
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CASH(i, t)",1394,-0.223816,152.727098,5.0,-180.4,161.0
"GROWTH(i, t)",1394,-0.000352,0.949082,-0.003358,-0.862882,0.920253
"SIZE(i, t)",1394,-0.001072,0.993438,0.070889,-1.3659,1.159884
"CFLOW(i, t)",1394,-1.6e-05,0.067726,-0.001224,-0.087286,0.088748
"NWC(i, t)",1394,3.1e-05,0.274622,0.000557,-0.351626,0.356834
"LEV(i, t)",1394,-0.000191,0.202346,-0.009732,-0.242472,0.272287
"INDUSTRYRISK(i, t)",1394,4.7e-05,0.234255,0.0,-0.298191,0.298191
"INCASSET(i, t)",1394,0.000211,0.573713,0.0,-0.726226,0.766312
"INDUSTRYDEVIATION(i, t)",1394,0.107005,134.982348,10.940709,-187.881418,163.064044


## Question 9

In [16]:
print(f"CASH(i, t) variance: {df['CASH(i, t)'].var()}")
print(f"CASH(i, t) within variance: {df_within['CASH(i, t)'].var()}")
print(f"CASH(i, t) between variance: {df_between['CASH(i, t)'].var()}")

CASH(i, t) variance: 14912.807523282067
CASH(i, t) within variance: 1749.52887374462
CASH(i, t) between variance: 14648.160547995629


## Question 10

### Within Transform

In [17]:
df_within[main_cols].corr()

Unnamed: 0,"CASH(i, t)","GROWTH(i, t)","SIZE(i, t)","CFLOW(i, t)","NWC(i, t)","LEV(i, t)","INDUSTRYRISK(i, t)","INCASSET(i, t)","INDUSTRYDEVIATION(i, t)"
"CASH(i, t)",1.0,-0.0407029,0.4189484,0.1475875,0.2539828,-0.09077077,-1.4339110000000002e-17,-0.05450725,-0.1672885
"GROWTH(i, t)",-0.0407029,1.0,0.1002735,0.08227423,-0.1092396,0.1155973,-6.71072e-18,0.1165038,-0.03472017
"SIZE(i, t)",0.4189484,0.1002735,1.0,0.03610993,0.04634154,0.2415009,-2.17462e-16,-0.03755334,0.1441401
"CFLOW(i, t)",0.1475875,0.08227423,0.03610993,1.0,-0.0950821,-0.2318811,-2.589559e-17,0.04391827,-0.04247717
"NWC(i, t)",0.2539828,-0.1092396,0.04634154,-0.0950821,1.0,0.04101572,-8.261743e-18,-0.04688301,-0.08178744
"LEV(i, t)",-0.09077077,0.1155973,0.2415009,-0.2318811,0.04101572,1.0,1.224666e-17,0.06730261,-0.270954
"INDUSTRYRISK(i, t)",-1.4339110000000002e-17,-6.71072e-18,-2.17462e-16,-2.589559e-17,-8.261743e-18,1.224666e-17,1.0,-2.347923e-18,-3.798056e-17
"INCASSET(i, t)",-0.05450725,0.1165038,-0.03755334,0.04391827,-0.04688301,0.06730261,-2.347923e-18,1.0,-0.1481846
"INDUSTRYDEVIATION(i, t)",-0.1672885,-0.03472017,0.1441401,-0.04247717,-0.08178744,-0.270954,-3.798056e-17,-0.1481846,1.0


### Between Transform

In [18]:
df_between[main_cols].corr()

Unnamed: 0,"CASH(i, t)","GROWTH(i, t)","SIZE(i, t)","CFLOW(i, t)","NWC(i, t)","LEV(i, t)","INDUSTRYRISK(i, t)","INCASSET(i, t)","INDUSTRYDEVIATION(i, t)"
"CASH(i, t)",1.0,0.050293,0.471846,0.129997,-0.038801,-0.175274,-0.00118,-0.153887,-0.602522
"GROWTH(i, t)",0.050293,1.0,0.065839,0.023631,-0.008643,0.119795,0.040037,0.073479,-0.053332
"SIZE(i, t)",0.471846,0.065839,1.0,-0.191151,0.020169,0.095484,-0.007784,-0.042717,-0.312713
"CFLOW(i, t)",0.129997,0.023631,-0.191151,1.0,-0.118064,-0.118856,0.001265,-0.091656,-0.014516
"NWC(i, t)",-0.038801,-0.008643,0.020169,-0.118064,1.0,0.074431,0.139689,-0.016344,0.116931
"LEV(i, t)",-0.175274,0.119795,0.095484,-0.118856,0.074431,1.0,0.043927,0.012664,0.208459
"INDUSTRYRISK(i, t)",-0.00118,0.040037,-0.007784,0.001265,0.139689,0.043927,1.0,0.019219,0.008063
"INCASSET(i, t)",-0.153887,0.073479,-0.042717,-0.091656,-0.016344,0.012664,0.019219,1.0,-0.06008
"INDUSTRYDEVIATION(i, t)",-0.602522,-0.053332,-0.312713,-0.014516,0.116931,0.208459,0.008063,-0.06008,1.0


# Section 3

## Question 11

### Pooled OLS Regression

In [19]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['CASH(i, t-1)'])
endog = df['CASH(i, t)']

# Fits the OLS to the data
model = PooledOLS(endog, exog)
model_fit_pooled = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit_pooled.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:             CASH(i, t)   R-squared:                        0.5268
Estimator:                  PooledOLS   R-squared (Between):              0.6086
No. Observations:                1019   R-squared (Within):              -0.3500
Date:                Sun, May 15 2022   R-squared (Overall):              0.5268
Time:                        22:59:28   Log-likelihood                   -5977.8
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      1132.0
Entities:                         696   P-value                           0.0000
Avg Obs:                       1.4641   Distribution:                  F(1,1017)
Min Obs:                       1.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             394.45
                            

### Random Effects Regression 

In [20]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['CASH(i, t-1)'])
endog = df['CASH(i, t)']

# Fits the OLS to the data
model = RandomEffects(endog, exog)
model_fit_re = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit_re.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:             CASH(i, t)   R-squared:                        0.4755
Estimator:              RandomEffects   R-squared (Between):              0.6062
No. Observations:                1019   R-squared (Within):              -0.3209
Date:                Sun, May 15 2022   R-squared (Overall):              0.5256
Time:                        22:59:28   Log-likelihood                   -5868.6
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      921.84
Entities:                         696   P-value                           0.0000
Avg Obs:                       1.4641   Distribution:                  F(1,1017)
Min Obs:                       1.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             360.03
                            

### Fixed Effects Regression

In [21]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['CASH(i, t-1)'])
endog = df['CASH(i, t)']

# Fits the OLS to the data
model = PanelOLS(endog, exog)
model_fit_fe = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit_fe.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:             CASH(i, t)   R-squared:                        0.5268
Estimator:                   PanelOLS   R-squared (Between):              0.6086
No. Observations:                1019   R-squared (Within):              -0.3500
Date:                Sun, May 15 2022   R-squared (Overall):              0.5268
Time:                        22:59:28   Log-likelihood                   -5977.8
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      1132.0
Entities:                         696   P-value                           0.0000
Avg Obs:                       1.4641   Distribution:                  F(1,1017)
Min Obs:                       1.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             394.45
                            

### Hausman Test

In [22]:
def hausman(fe, re):
    """Hausman Test method to check whether to use FE or RE.
    """
    b = fe.params
    B = re.params
    v_b = fe.cov
    v_B = re.cov

    df_haus = b[np.abs(b) < 1e8].size
    chi2 = np.dot((b - B).T, np.linalg.inv(v_b - v_B).dot(b - B)) 
 
    pval = scipy.stats.chi2.sf(chi2, df_haus)

    return chi2, df_haus, pval

In [23]:
chi2, df_haus, pval = hausman(model_fit_fe, model_fit_re) 

print(f"Chi-Squared: {chi2}")
print(f"Degrees of Freedom: {df_haus}")
print(f"P-Value: {pval}")

Chi-Squared: 153.33072665886422
Degrees of Freedom: 2
P-Value: 5.065889971656552e-34


Fixed Effect results show significance with a positive coefficient, which suggests that the more cash the SME has had previously, the more it will have in the future. This could be a result of additional cash generated from the interest in the bank.

### Within Fixed Effects Regression

In [24]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df_within['CASH(i, t-1)'])
endog = df_within['CASH(i, t)']

# Fits the OLS to the data
model = PooledOLS(endog, exog)
model_fit_within = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit_within.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:             CASH(i, t)   R-squared:                        0.0007
Estimator:                  PooledOLS   R-squared (Between):          -2.155e-05
No. Observations:                1019   R-squared (Within):               0.0008
Date:                Sun, May 15 2022   R-squared (Overall):              0.0007
Time:                        22:59:28   Log-likelihood                   -5323.9
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      0.6618
Entities:                         696   P-value                           0.4161
Avg Obs:                       1.4641   Distribution:                  F(1,1017)
Min Obs:                       1.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             0.0969
                            

### Between OLS Regression

In [25]:
# Defines endogenous and exogenous variables
df_between_tmp = df_between[['CASH(i, t)', 'CASH(i, t-1)']].replace([np.inf, -np.inf], np.nan).dropna()
exog = sm.tools.tools.add_constant(df_between_tmp['CASH(i, t-1)'])
endog = df_between_tmp['CASH(i, t)']

# Fits the OLS to the data
model = sm.OLS(endog, exog)
model_fit_between = model.fit()

# Shows regression results
print(model_fit_between.summary())

                            OLS Regression Results                            
Dep. Variable:             CASH(i, t)   R-squared:                       0.642
Model:                            OLS   Adj. R-squared:                  0.641
Method:                 Least Squares   F-statistic:                     1245.
Date:                Sun, 15 May 2022   Prob (F-statistic):          5.98e-157
Time:                        22:59:28   Log-Likelihood:                -3981.6
No. Observations:                 696   AIC:                             7967.
Df Residuals:                     694   BIC:                             7976.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           54.6971      3.812     14.347   

### First Difference Regression

In [26]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['CASH(i, t-1)'])
endog = df['CASH(i, t-1)']

# Fits the OLS to the data
model = PooledOLS(endog, exog)
model_fit_diff = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit_diff.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:           CASH(i, t-1)   R-squared:                        1.0000
Estimator:                  PooledOLS   R-squared (Between):              1.0000
No. Observations:                1019   R-squared (Within):               1.0000
Date:                Sun, May 15 2022   R-squared (Overall):              1.0000
Time:                        22:59:28   Log-likelihood                 2.922e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                   2.653e+33
Entities:                         696   P-value                           0.0000
Avg Obs:                       1.4641   Distribution:                  F(1,1017)
Min Obs:                       1.0000                                           
Max Obs:                       4.0000   F-statistic (robust):          3.509e+32
                            

## Question 12

### Within OLS Results

In [27]:
table_params = model_fit_within.params
table_pvalues = round(model_fit_within.pvalues, 2)
within_table_params = pd.DataFrame(table_params)
within_table_pvalues = pd.DataFrame(table_pvalues)
within_table_params = pd.concat([within_table_params, within_table_pvalues], axis =1)
within_table_params

Unnamed: 0,parameter,pvalue
const,2.746974,0.0
"CASH(i, t-1)",-0.031899,0.76


### Between OLS Results

In [28]:
table_params = model_fit_between.params
table_pvalues = round(model_fit_between.pvalues, 2)
between_table_params = pd.DataFrame(table_params)
between_table_pvalues = pd.DataFrame(table_pvalues)
between_table_params = pd.concat([between_table_params, between_table_pvalues], axis =1)
between_table_params.columns = ("parameter", "pvalue")
between_table_params

Unnamed: 0,parameter,pvalue
const,54.697095,0.0
"CASH(i, t-1)",0.686025,0.0


## Question 13

### OLS

In [29]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df[main_cols + dummy_cols + compound_cols])
endog = df['GROWTH(i, t)']

# Fits the OLS to the data
model = PooledOLS(endog, exog)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:           GROWTH(i, t)   R-squared:                        1.0000
Estimator:                  PooledOLS   R-squared (Between):              1.0000
No. Observations:                1019   R-squared (Within):               1.0000
Date:                Sun, May 15 2022   R-squared (Overall):              1.0000
Time:                        22:59:29   Log-likelihood                 3.525e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                   1.363e+32
Entities:                         696   P-value                           0.0000
Avg Obs:                       1.4641   Distribution:                 F(16,1002)
Min Obs:                       1.0000                                           
Max Obs:                       4.0000   F-statistic (robust):          7.326e+31
                            

### OLS Within

In [30]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df_within[main_cols + dummy_cols + compound_cols])
endog = df_within['GROWTH(i, t)']

# Fits the OLS to the data
model = PooledOLS(endog, exog, check_rank=False)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:           GROWTH(i, t)   R-squared:                        1.0000
Estimator:                  PooledOLS   R-squared (Between):              1.0000
No. Observations:                1019   R-squared (Within):               1.0000
Date:                Sun, May 15 2022   R-squared (Overall):              1.0000
Time:                        22:59:29   Log-likelihood                 3.484e+04
Cov. Estimator:             Clustered                                           
                                        F-statistic:                   4.283e+31
Entities:                         696   P-value                           0.0000
Avg Obs:                       1.4641   Distribution:                 F(16,1002)
Min Obs:                       1.0000                                           
Max Obs:                       4.0000   F-statistic (robust):          9.465e+30
                            

### OLS Between

In [31]:
# Defines endogenous and exogenous variables
df_between_tmp = df_between[main_cols + dummy_cols + compound_cols].replace([np.inf, -np.inf], np.nan).dropna()
exog = sm.tools.tools.add_constant(df_between_tmp)
endog = df_between_tmp['GROWTH(i, t)']

# Fits the OLS to the data
model = sm.OLS(endog, exog)
model_fit = model.fit()

# Shows regression results
print(model_fit.summary())

                            OLS Regression Results                            
Dep. Variable:           GROWTH(i, t)   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 6.128e+27
Date:                Sun, 15 May 2022   Prob (F-statistic):               0.00
Time:                        22:59:29   Log-Likelihood:                 20588.
No. Observations:                 696   AIC:                        -4.114e+04
Df Residuals:                     679   BIC:                        -4.106e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

### OLS First Differences

In [32]:
# Defines endogenous and exogenous variables
df_diff = df[main_cols + dummy_cols + compound_cols].shift().replace([np.inf, -np.inf], np.nan).dropna()
exog = sm.tools.tools.add_constant(df_diff)
endog = df_diff['GROWTH(i, t)']

# Fits the OLS to the data
model = sm.OLS(endog, exog)
model_fit = model.fit()

# Shows regression results
print(model_fit.summary())

                            OLS Regression Results                            
Dep. Variable:           GROWTH(i, t)   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.455e+28
Date:                Sun, 15 May 2022   Prob (F-statistic):               0.00
Time:                        22:59:29   Log-Likelihood:                 30593.
No. Observations:                1019   AIC:                        -6.115e+04
Df Residuals:                    1002   BIC:                        -6.107e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

## Question 14

In [33]:
df_tmp

NameError: name 'df_tmp' is not defined

In [None]:
# Defines endogenous and exogenous variables
df_tmp = df[main_cols + diff_cols + ['GROWTH(i, t-1)']].replace([np.inf, -np.inf], np.nan).dropna()
exog = sm.tools.tools.add_constant(df_tmp[diff_cols])
instr = sm.tools.tools.add_constant(df_tmp[main_cols])
endog = df_tmp['GROWTH(i, t-1)']

# Fits the OLS to the data
model = IV2SLS(endog, exog, instrument=instr)
model_fit = model.fit()

# Shows regression results
print(model_fit.summary())

                          IV2SLS Regression Results                           
Dep. Variable:         GROWTH(i, t-1)   R-squared:                       0.998
Model:                         IV2SLS   Adj. R-squared:                  0.998
Method:                     Two Stage   F-statistic:                     1299.
                        Least Squares   Prob (F-statistic):               0.00
Date:                Sun, 15 May 2022                                         
Time:                        22:59:02                                         
No. Observations:                1003                                         
Df Residuals:                     994                                         
Df Model:                           8                                         
                                                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------

## Question 15

In [None]:
# Defines endogenous and exogenous variables
df_tmp = df[diff_cols + ['CASH(i, t)']].replace([np.inf, -np.inf], np.nan).dropna()
exog = sm.tools.tools.add_constant(df_tmp[diff_cols])
endog = df_tmp['CASH(i, t)']

# Fits the OLS to the data
model = PooledOLS(endog, exog)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:             CASH(i, t)   R-squared:                        0.1065
Estimator:                  PooledOLS   R-squared (Between):              0.0732
No. Observations:                1003   R-squared (Within):               0.4327
Date:                Sun, May 15 2022   R-squared (Overall):              0.1065
Time:                        22:35:25   Log-likelihood                   -6206.5
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      14.804
Entities:                       13590   P-value                           0.0000
Avg Obs:                       0.0738   Distribution:                   F(8,994)
Min Obs:                       0.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             15.931
                            

In [None]:
exog_instr = df[main_cols + diff_cols + ['CASH(i, t-1)']]
rho = exog_instr.corr()
pval = exog_instr.corr(method=lambda x, y: scipy.stats.pearsonr(x, y)[1]) - np.eye(*rho.shape)

rho

Unnamed: 0,"CASH(i, t)","GROWTH(i, t)","SIZE(i, t)","CFLOW(i, t)","NWC(i, t)","LEV(i, t)","INDUSTRYRISK(i, t)","INCASSET(i, t)","INDUSTRYDEVIATION(i, t)","CASH(i, t) - CASH(i, t-1)","GROWTH(i, t) - GROWTH(i, t-1)","SIZE(i, t) - SIZE(i, t-1)","CFLOW(i, t) - CFLOW(i, t-1)","NWC(i, t) - NWC(i, t-1)","LEV(i, t) - LEV(i, t-1)","INCASSET(i, t) - INCASSET(i, t-1)","INDUSTRYDEVIATION(i, t) - INDUSTRYDEVIATION(i, t-1)","CASH(i, t-1)"
"CASH(i, t)",1.0,0.026028,0.456622,0.12604,-0.020107,-0.1715,-0.005499,-0.129217,-0.541225,0.250888,0.001159,0.096824,-0.006763,0.10938,0.08389,0.053433,-0.281362,0.725784
"GROWTH(i, t)",0.026028,1.0,0.067989,0.040285,-0.044682,0.108516,0.025822,0.083876,-0.05465,0.118793,0.03653,0.501387,0.238288,-0.080406,0.125861,0.031284,-0.0459,-0.045916
"SIZE(i, t)",0.456622,0.067989,1.0,-0.172965,-0.001021,0.082186,-0.048697,-0.039549,-0.268181,-0.032156,-0.072809,-0.012023,0.044303,-0.032065,0.064048,0.013004,-0.097946,0.431814
"CFLOW(i, t)",0.12604,0.040285,-0.172965,1.0,-0.108007,-0.141034,0.020466,-0.0437,-0.039667,0.127036,-0.006674,0.217583,0.197268,-0.035307,-0.118604,0.043985,-0.078703,0.038246
"NWC(i, t)",-0.020107,-0.044682,-0.001021,-0.108007,1.0,0.080761,0.113033,0.007717,0.082724,0.095009,0.046071,-0.009767,-0.058869,0.223844,0.053649,-0.021066,-0.041746,-0.097769
"LEV(i, t)",-0.1715,0.108516,0.082186,-0.141034,0.080761,1.0,0.030843,0.001296,0.1439,0.018065,-0.034713,0.155608,0.006078,0.001281,0.098635,-0.033598,0.042058,-0.179781
"INDUSTRYRISK(i, t)",-0.005499,0.025822,-0.048697,0.020466,0.113033,0.030843,1.0,0.007024,0.016916,0.030165,-0.047829,-0.057351,0.018749,-0.03412,-0.039636,-0.044327,0.007796,-0.070483
"INCASSET(i, t)",-0.129217,0.083876,-0.039549,-0.0437,0.007717,0.001296,0.007024,1.0,-0.075092,0.033658,-0.018721,0.158011,-0.022319,0.009242,0.108009,0.064783,-0.01085,-0.088841
"INDUSTRYDEVIATION(i, t)",-0.541225,-0.05465,-0.268181,-0.039667,0.082724,0.1439,0.016916,-0.075092,1.0,-0.165088,0.050511,-0.024619,0.001466,-0.09542,-0.033081,-0.00396,0.499166,-0.441569
"CASH(i, t) - CASH(i, t-1)",0.250888,0.118793,-0.032156,0.127036,0.095009,0.018065,0.030165,0.033658,-0.165088,1.0,0.070754,0.281736,0.107037,0.220614,0.002228,0.033624,-0.474154,-0.483829


In [None]:
pval

Unnamed: 0,"CASH(i, t)","GROWTH(i, t)","SIZE(i, t)","CFLOW(i, t)","NWC(i, t)","LEV(i, t)","INDUSTRYRISK(i, t)","INCASSET(i, t)","INDUSTRYDEVIATION(i, t)","CASH(i, t) - CASH(i, t-1)","GROWTH(i, t) - GROWTH(i, t-1)","SIZE(i, t) - SIZE(i, t-1)","CFLOW(i, t) - CFLOW(i, t-1)","NWC(i, t) - NWC(i, t-1)","LEV(i, t) - LEV(i, t-1)","INCASSET(i, t) - INCASSET(i, t-1)","INDUSTRYDEVIATION(i, t) - INDUSTRYDEVIATION(i, t-1)","CASH(i, t-1)"
"CASH(i, t)",0.0,0.3313326,8.683252000000001e-73,2.333312e-06,0.4530215,1.136715e-10,0.83742,1.284015e-06,5.605220999999999e-107,4.316373e-16,0.97073,0.001972946,0.8292848,0.0004690864,0.007376092,0.088542,5.378854e-20,2.063549e-167
"GROWTH(i, t)",0.3313326,0.0,0.01108417,0.1326119,0.09528026,4.877728e-05,0.335184,0.001715936,0.04126334,0.0001441659,0.247267,5.701743000000001e-66,1.272735e-14,0.01023707,5.605945e-05,0.318915,0.1431478,0.1430045
"SIZE(i, t)",8.683252000000001e-73,0.01108417,0.0,7.849528e-11,0.9695962,0.002125694,0.069025,0.1398392,2.091332e-24,0.3051341,0.020979,0.7014663,0.1576047,0.3065001,0.0409414,0.678724,0.001746245,1.540991e-47
"CFLOW(i, t)",2.333312e-06,0.1326119,7.849528e-11,0.0,5.294141e-05,1.227871e-07,0.444999,0.1027873,0.1386546,4.768053e-05,0.832643,2.196726e-12,2.126462e-10,0.2601558,0.0001477622,0.161023,0.01196521,0.2225252
"NWC(i, t)",0.4530215,0.09528026,0.9695962,5.294141e-05,0.0,0.002539247,2.3e-05,0.7733636,0.001986468,0.002397257,0.144433,0.7554812,0.06031024,4.877934e-13,0.08695238,0.502184,0.1830126,0.001780413
"LEV(i, t)",1.136715e-10,4.877728e-05,0.002125694,1.227871e-07,0.002539247,0.0,0.249642,0.961435,6.741923e-08,0.5646123,0.271578,5.987867e-07,0.8463358,0.9674096,0.001619231,0.284424,0.1797542,7.51281e-09
"INDUSTRYRISK(i, t)",0.8374199,0.3351844,0.06902513,0.4449986,2.320986e-05,0.249642,0.0,0.7932402,0.527855,0.3360769,0.12971,0.06724921,0.5499701,0.276527,0.2061626,0.157787,0.8036986,0.02445077
"INCASSET(i, t)",1.284015e-06,0.001715936,0.1398392,0.1027873,0.7733636,0.961435,0.79324,0.0,0.005014484,0.2830816,0.553307,3.984517e-07,0.4766702,0.7682584,0.0005529197,0.038867,0.7294008,0.004538321
"INDUSTRYDEVIATION(i, t)",5.605220999999999e-107,0.04126334,2.091332e-24,0.1386546,0.001986468,6.741923e-08,0.527855,0.005014484,0.0,1.159007e-07,0.109526,0.4324291,0.962732,0.002294637,0.2914294,0.899636,2.5861e-65,7.131253e-50
"CASH(i, t) - CASH(i, t-1)",4.316373e-16,0.0001441659,0.3051341,4.768053e-05,0.002397257,0.5646123,0.336077,0.2830816,1.159007e-07,0.0,0.024894,4.781004e-20,0.0006206107,1.066285e-12,0.9433618,0.284045,3.029816e-58,6.550818e-61


# Section 4

## Question 16

### First Regression: Random Effects totalassets on GROWTH(i, t)

In [None]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['totalassets'])
endog = df['GROWTH(i, t)']

# Fits the OLS to the data
model = RandomEffects(endog, exog)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:           GROWTH(i, t)   R-squared:                        0.0044
Estimator:              RandomEffects   R-squared (Between):              0.0009
No. Observations:                1395   R-squared (Within):               0.0011
Date:                Sun, May 15 2022   R-squared (Overall):              0.0018
Time:                        22:35:25   Log-likelihood                   -1238.6
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      6.1972
Entities:                       13991   P-value                           0.0129
Avg Obs:                       0.0997   Distribution:                  F(1,1393)
Min Obs:                       0.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             2.1080
                            

### Second Regression: Random Effects totalassets on CASH(i, t)

In [None]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['totalassets'])
endog = df['CASH(i, t)']

# Fits the OLS to the data
model = RandomEffects(endog, exog)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:             CASH(i, t)   R-squared:                        0.1532
Estimator:              RandomEffects   R-squared (Between):              0.1687
No. Observations:                1395   R-squared (Within):               0.1089
Date:                Sun, May 15 2022   R-squared (Overall):              0.1565
Time:                        22:35:25   Log-likelihood                   -7814.6
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      251.93
Entities:                       13991   P-value                           0.0000
Avg Obs:                       0.0997   Distribution:                  F(1,1393)
Min Obs:                       0.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             130.65
                            

### Third Regression Random Effects totalDebt on GROWTH(i, t)

In [None]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['totalDebt'])
endog = df['GROWTH(i, t)']

# Fits the OLS to the data
model = RandomEffects(endog, exog)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:           GROWTH(i, t)   R-squared:                        0.0062
Estimator:              RandomEffects   R-squared (Between):              0.0023
No. Observations:                1395   R-squared (Within):               0.0032
Date:                Sun, May 15 2022   R-squared (Overall):              0.0037
Time:                        22:35:26   Log-likelihood                   -1236.4
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      8.6708
Entities:                       13991   P-value                           0.0033
Avg Obs:                       0.0997   Distribution:                  F(1,1393)
Min Obs:                       0.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             4.8283
                            

### Fourth Regression Random Effects totalDebt on CASH(i, t)

In [None]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['totalDebt'])
endog = df['CASH(i, t)']

# Fits the OLS to the data
model = RandomEffects(endog, exog)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:             CASH(i, t)   R-squared:                        0.0749
Estimator:              RandomEffects   R-squared (Between):              0.0813
No. Observations:                1395   R-squared (Within):               0.0454
Date:                Sun, May 15 2022   R-squared (Overall):              0.0740
Time:                        22:35:26   Log-likelihood                   -7879.6
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      112.77
Entities:                       13991   P-value                           0.0000
Avg Obs:                       0.0997   Distribution:                  F(1,1393)
Min Obs:                       0.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             56.376
                            

### Fifth Regression Random Effects SIZE(i, t) on CASH(i, t)

In [None]:
# Defines endogenous and exogenous variables
exog = sm.tools.tools.add_constant(df['SIZE(i, t)'])
endog = df['CASH(i, t)']

# Fits the OLS to the data
model = RandomEffects(endog, exog)
model_fit = model.fit(cov_type='clustered', cluster_entity=True)

# Shows regression results
print(model_fit.summary)

                        RandomEffects Estimation Summary                        
Dep. Variable:             CASH(i, t)   R-squared:                        0.1926
Estimator:              RandomEffects   R-squared (Between):              0.2208
No. Observations:                1395   R-squared (Within):               0.1224
Date:                Sun, May 15 2022   R-squared (Overall):              0.2072
Time:                        22:35:26   Log-likelihood                   -7795.4
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      332.24
Entities:                       13991   P-value                           0.0000
Avg Obs:                       0.0997   Distribution:                  F(1,1393)
Min Obs:                       0.0000                                           
Max Obs:                       4.0000   F-statistic (robust):             262.79
                            