# Financial Econometrics II: Final Project

*By Basri Satiroglu, Daniel Deutsch and José Lucas Barretto*

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Matplotlib styles: start from the ggplot sheet, then override three settings
plt.style.use('ggplot')

# default figure size
plt.rcParams['figure.figsize'] = (15, 7)

# colour cycle used for successive plot series
plt.rcParams['axes.prop_cycle'] = plt.cycler(
    color=['#4C72B0', '#C44E52', '#55A868', '#8172B2', '#CCB974', '#64B5CD']
)

# axes background colour
plt.rcParams['axes.facecolor'] = '#EAEAF2'

## Section 1

### Questions 1 and 2

In [3]:
### TRIES LOADING PRE PROCESSED CSV, IF NOT AVAILABLE PRE-PROCESSES RAW DATA
#
# Produces `df`, indexed by (siren, year). `verify_integrity=True` makes
# set_index raise if a (siren, year) pair appears more than once.

try:
    # read the cached, already pre-processed csv file
    df = pd.read_csv('dataset.csv')

except FileNotFoundError:
    # Only a missing cache triggers the rebuild. Any other failure (corrupt
    # csv, interrupted kernel, ...) is surfaced instead of being swallowed.

    # read raw dataset and rename columns
    df_raw = pd.read_stata('Diane-data.dta').rename(
        columns={
            'siren': 'siren_float',
            'sirenID': 'siren',
            'companyname': 'company',
            'regioncode': 'region_code',
            'départementcode': 'department_code',
            'cashNbank': 'cash',
            'totalDebt': 'totaldebt'
        }
    )

    # select desired columns from data
    columns = [
        'siren', 'year', 'company', 'totalassets', 'marketable_sec', 'cash',
        'totaldebt', 'inventory', 'creditors', 'tradedebts', 'taxes', 'otherdebts',
        'netincome', 'netsales'
    ]

    # keep rows with department_code == 33 and the desired columns, then drop
    # rows with any missing value (chained, so no in-place edit of a slice)
    df = df_raw.loc[df_raw['department_code'] == 33, columns].dropna()

    # set siren and year as indexes
    df = df.sort_values(['siren', 'year']).set_index(['siren', 'year'], verify_integrity=True)

    # cache the pre-processed panel for the next run
    df.to_csv('dataset.csv')

else:
    # cache was read successfully: set siren and year as indexes
    df = df.sort_values(['siren', 'year']).set_index(['siren', 'year'], verify_integrity=True)

### Question 3

In [4]:
# liquid assets = cash + marketable securities
liquid_assets = df['cash'] + df['marketable_sec']

# validity conditions: non-negative numerator, denominator of at least 1
mask_num = liquid_assets >= 0
mask_den = df['totalassets'] >= 1

# ratio of liquid assets to total assets; rows failing either condition become NaN
df['cash_mktsec_ratio'] = liquid_assets.where(mask_num) / df['totalassets'].where(mask_den)

# lag the ratio by one row within each firm and check that each firm's first row is NaN
# NOTE(review): shift() is positional, so this is the previous *year* only if a
# firm has no missing years in between -- confirm against the panel.
df['cash_mktsec_ratio_lag'] = df.groupby(level='siren')['cash_mktsec_ratio'].shift()
df['cash_mktsec_ratio_lag'].head(10)

siren      year
58806647   2014         NaN
           2015    0.023810
           2016    0.483010
           2017    0.252500
87280285   2014         NaN
           2015    0.020430
           2016    0.027053
           2017    0.028972
           2018    0.079347
300155207  2014         NaN
Name: cash_mktsec_ratio_lag, dtype: float64

### Question 4

#### Computing Control Vars

In [5]:
# rows where cash, marketable securities and total debt are all non-negative;
# the masked variables below are NaN on every other row
mask = (df['cash'] >= 0) & (df['marketable_sec'] >= 0) & (df['totaldebt'] >= 0)

# net working capital scaled by total assets
df['nwc'] = (df['inventory'] + df['creditors'] - df['tradedebts'] - df['taxes'] - df['otherdebts'])/df['totalassets']

# year-on-year growth of net sales within each firm
df['salesgrowth'] = df.groupby('siren')['netsales'].pct_change()

# The log is only defined for strictly positive total assets. Non-positive
# values are turned into NaN *before* np.log, which avoids the RuntimeWarning
# and the -inf/NaN values numpy produces for them.
df['log_totalassets'] = np.log(df['totalassets'].where(df['totalassets'] > 0))[mask]

# masked ratios to total assets
df['netincome_ratio'] = (df['netincome']/df['totalassets'])[mask]
df['totaldebt_ratio'] = (df['totaldebt']/df['totalassets'])[mask]

# unmasked ratios to total assets, one '<name>_ratio' column per balance-sheet item
for item in ['inventory', 'creditors', 'tradedebts', 'taxes', 'otherdebts']:
    df[f'{item}_ratio'] = df[item]/df['totalassets']

  result = getattr(ufunc, method)(*inputs, **kwargs)


#### Between and Within Transform

In [6]:
# Between transform: mean of every numeric column per firm (index level 0).
# numeric_only=True skips non-numeric columns explicitly; without it,
# pandas >= 2.0 raises a TypeError when it meets a text column.
between = df.groupby(level=0).mean(numeric_only=True)

# Within transform: deviation of each observation from its own firm mean.
# The firm means are reused rather than recomputed.
within = df.subtract(between, level=0)

## Section 2

### Question 5

In [7]:
# per-year descriptive statistics of the cash ratio (one row per year)
table = df.groupby('year')['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])

# Pooled statistics over all years, appended as an extra row. describe() on a
# Series returns a Series, so 'min'/'max' are index labels and must be dropped
# by label (Series.drop ignores the `columns=` argument).
table.loc['2014-2018'] = df['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(['min', 'max'])

# display names for the statistics; observation counts shown as integers
table = (
    table
    .rename(
        columns={
            'count': 'Obs',
            'mean': 'Mean',
            'std': 'Std. Dev.',
            '10%': 'Perc. 10',
            '50%': 'Median',
            '90%': 'Perc. 90',
        }
    )
    .rename_axis('Year')
    .astype({'Obs': 'int'})
)
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014,9899,0.220602,0.230617,0.14,0.002872,0.571429
2015,9099,0.221333,0.229691,0.142857,0.002779,0.580415
2016,7202,0.221851,0.231948,0.139818,0.002416,0.579068
2017,6821,0.217504,0.23191,0.135447,0.002778,0.575758
2018,2375,0.225615,0.237866,0.140843,0.00307,0.593278
2014-2018,35396,0.220783,0.231389,0.139323,0.002765,0.578193


### Question 6

#### Cash Holdings per Year for Within Transform

In [8]:
# per-year descriptive statistics of the within-transformed cash ratio
table = within.groupby('year')['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])

# Pooled statistics over all years, appended as an extra row. describe() on a
# Series returns a Series, so 'min'/'max' are index labels and must be dropped
# by label (Series.drop ignores the `columns=` argument).
table.loc['2014-2018'] = within['cash_mktsec_ratio'].describe(percentiles=[0.1, 0.9]).drop(['min', 'max'])

# display names for the statistics; observation counts shown as integers
table = (
    table
    .rename(
        columns={
            'count': 'Obs',
            'mean': 'Mean',
            'std': 'Std. Dev.',
            '10%': 'Perc. 10',
            '50%': 'Median',
            '90%': 'Perc. 90',
        }
    )
    .rename_axis('Year')
    .astype({'Obs': 'int'})
)
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014,9899,-0.005128843,0.076763,0.0,-0.079631,0.061767
2015,9099,8.54034e-05,0.074225,0.0,-0.06894,0.069822
2016,7202,0.002471698,0.076181,0.0,-0.068405,0.077333
2017,6821,0.003911428,0.082459,0.0,-0.070231,0.083969
2018,2375,0.002320972,0.084402,0.0,-0.079179,0.08948
2014-2018,35396,2.070141e-19,0.077747,0.0,-0.07275,0.073746


#### Cash Holdings per Year for First Difference

In [9]:
# first difference of the cash ratio within each firm, computed once and reused
cash_ratio_diff = df.groupby('siren')['cash_mktsec_ratio'].diff()

# per-year descriptive statistics of the first difference
table = cash_ratio_diff.groupby('year').describe(percentiles=[0.1, 0.9]).drop(columns=['min', 'max'])

# Pooled statistics over all years, appended as an extra row. describe() on a
# Series returns a Series, so 'min'/'max' are index labels and must be dropped
# by label (Series.drop ignores the `columns=` argument).
table.loc['2014-2018'] = cash_ratio_diff.describe(percentiles=[0.1, 0.9]).drop(['min', 'max'])

# display names for the statistics; observation counts shown as integers
table = (
    table
    .rename(
        columns={
            'count': 'Obs',
            'mean': 'Mean',
            'std': 'Std. Dev.',
            '10%': 'Perc. 10',
            '50%': 'Median',
            '90%': 'Perc. 90',
        }
    )
    .rename_axis('Year')
    .astype({'Obs': 'int'})
)
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014,0,,,,,
2015,7354,0.00618,0.12509,0.000334,-0.109797,0.126302
2016,6103,0.00514,0.134563,5.1e-05,-0.118178,0.130784
2017,5833,0.003734,0.131548,0.000512,-0.123716,0.126068
2018,2109,0.000467,0.126838,-1.4e-05,-0.120557,0.120577
2014-2018,21399,0.004654,0.129787,7.6e-05,-0.116993,0.126992


### Question 7

In [10]:
# variables included in the descriptive statistics, with their display labels
stat_cols = [
    'cash_mktsec_ratio', # CASH
    'salesgrowth', # GROWTH
    'log_totalassets', # SIZE
    'netincome_ratio', # CFLOW
    'nwc', #NWC
    'totaldebt_ratio', #LEV
    # not included: INDUSTRYRISK, INCASSET, INDUSTRYDEVIATION
]

# keep only rows where every variable is finite and non-missing
stats = df[stat_cols].replace([np.inf, -np.inf], np.nan).dropna()

# One row per variable, one column per statistic. The rows are variables, so
# the index is labelled 'Variable'.
table = (
    stats.describe(percentiles=[0.1, 0.9]).T
    .drop(columns=['min', 'max'])
    .rename(
        columns={
            'count': 'Obs',
            'mean': 'Mean',
            'std': 'Std. Dev.',
            '10%': 'Perc. 10',
            '50%': 'Median',
            '90%': 'Perc. 90',
        },
        index={
            'cash_mktsec_ratio': 'CASH',
            'salesgrowth': 'GROWTH',
            'log_totalassets': 'SIZE',
            'netincome_ratio': 'CFLOW',
            'nwc': 'NWC',
            'totaldebt_ratio': 'LEV'
        }
    )
    .rename_axis('Variable')
    .astype({'Obs': 'int'})
)
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CASH,20698,0.217128,0.225137,0.138946,0.003229,0.5625
GROWTH,20698,0.314475,6.449689,0.029906,-0.206813,0.422679
SIZE,20698,6.835551,1.726346,6.745236,4.70048,9.038555
CFLOW,20698,0.046709,0.291532,0.048603,-0.050054,0.186131
NWC,20698,-0.134682,3.264332,-0.075748,-0.6,0.400833
LEV,20698,0.625628,3.272601,0.556593,0.194327,0.930563


### Question 8

#### Descriptive Stats for Within Transform

In [11]:
# within-transformed variables; keep only rows where every variable is finite and non-missing
stats = within[stat_cols].replace([np.inf, -np.inf], np.nan).dropna()

# One row per variable, one column per statistic. The rows are variables, so
# the index is labelled 'Variable'.
table = (
    stats.describe(percentiles=[0.1, 0.9]).T
    .drop(columns=['min', 'max'])
    .rename(
        columns={
            'count': 'Obs',
            'mean': 'Mean',
            'std': 'Std. Dev.',
            '10%': 'Perc. 10',
            '50%': 'Median',
            '90%': 'Perc. 90',
        },
        index={
            'cash_mktsec_ratio': 'CASH',
            'salesgrowth': 'GROWTH',
            'log_totalassets': 'SIZE',
            'netincome_ratio': 'CFLOW',
            'nwc': 'NWC',
            'totaldebt_ratio': 'LEV'
        }
    )
    .rename_axis('Variable')
    .astype({'Obs': 'int'})
)
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CASH,20596,0.002708,0.078295,-0.000301,-0.073357,0.083122
GROWTH,20596,-2.9e-05,4.493253,0.0,-0.219098,0.196699
SIZE,20596,0.031963,0.210891,0.013148,-0.149394,0.23781
CFLOW,20596,0.000601,0.229296,0.000497,-0.0711,0.075821
NWC,20596,-0.004668,1.099066,0.002203,-0.115108,0.128325
LEV,20596,-0.000538,1.091462,-0.00651,-0.096748,0.076006


#### Descriptive Stats for Between Transform

In [12]:
# between-transformed variables (firm means); keep only firms where every variable is finite and non-missing
stats = between[stat_cols].replace([np.inf, -np.inf], np.nan).dropna()

# One row per variable, one column per statistic. The rows are variables, so
# the index is labelled 'Variable'.
table = (
    stats.describe(percentiles=[0.1, 0.9]).T
    .drop(columns=['min', 'max'])
    .rename(
        columns={
            'count': 'Obs',
            'mean': 'Mean',
            'std': 'Std. Dev.',
            '10%': 'Perc. 10',
            '50%': 'Median',
            '90%': 'Perc. 90',
        },
        index={
            'cash_mktsec_ratio': 'CASH',
            'salesgrowth': 'GROWTH',
            'log_totalassets': 'SIZE',
            'netincome_ratio': 'CFLOW',
            'nwc': 'NWC',
            'totaldebt_ratio': 'LEV'
        }
    )
    .rename_axis('Variable')
    .astype({'Obs': 'int'})
)
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CASH,9329,0.220927,0.212762,0.152554,0.012185,0.542961
GROWTH,9329,0.389475,5.387274,0.041379,-0.135731,0.450506
SIZE,9329,6.572534,1.675197,6.513826,4.47559,8.698779
CFLOW,9329,0.047322,0.17386,0.049157,-0.04414,0.169145
NWC,9329,-0.151885,3.238303,-0.085775,-0.599089,0.37795
LEV,9329,0.655245,3.253803,0.574809,0.219766,0.948207


#### Descriptive Stats for First Difference

In [13]:
# First difference taken *within each firm*. A plain df[stat_cols].diff()
# would subtract the previous row of the stacked panel, so the first row of
# every firm would be differenced against the last row of the preceding firm.
# Grouping by 'siren' makes each firm's first row NaN, and dropna() removes it.
stats = df.groupby('siren')[stat_cols].diff().replace([np.inf, -np.inf], np.nan).dropna()

# One row per variable, one column per statistic. The rows are variables, so
# the index is labelled 'Variable'.
table = (
    stats.describe(percentiles=[0.1, 0.9]).T
    .drop(columns=['min', 'max'])
    .rename(
        columns={
            'count': 'Obs',
            'mean': 'Mean',
            'std': 'Std. Dev.',
            '10%': 'Perc. 10',
            '50%': 'Median',
            '90%': 'Perc. 90',
        },
        index={
            'cash_mktsec_ratio': 'CASH',
            'salesgrowth': 'GROWTH',
            'log_totalassets': 'SIZE',
            'netincome_ratio': 'CFLOW',
            'nwc': 'NWC',
            'totaldebt_ratio': 'LEV'
        }
    )
    .rename_axis('Variable')
    .astype({'Obs': 'int'})
)
table[['Obs', 'Mean', 'Std. Dev.', 'Median', 'Perc. 10', 'Perc. 90']]

Unnamed: 0_level_0,Obs,Mean,Std. Dev.,Median,Perc. 10,Perc. 90
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CASH,11295,0.002916,0.11983,6.8e-05,-0.114145,0.119579
GROWTH,11295,-0.138389,9.250355,0.0,-0.438977,0.384671
SIZE,11295,0.059406,0.284028,0.039155,-0.167841,0.313466
CFLOW,11295,-0.000333,0.456109,4e-05,-0.104372,0.103539
NWC,11295,0.005252,0.423184,0.002748,-0.171017,0.189789
LEV,11295,-0.007686,0.349078,-0.009854,-0.123718,0.109092


### Question 9

In [14]:
# Sample variance of the 'cash' column in the original panel, in the
# within-transformed panel and in the between-transformed (firm-mean) panel.
# NOTE(review): this uses the raw 'cash' level, whereas the tables labelled
# CASH are built from 'cash_mktsec_ratio' -- confirm which one is intended.
variances = {
    'Cash': df['cash'].var(),
    'Cash Within': within['cash'].var(),
    'Cash Between': between['cash'].var(),
}

variances

{'Cash': 16230527.069724683,
 'Cash Within': 5414667.33430649,
 'Cash Between': 9615836.046729501}

### Question 10

#### Within Transform

In [15]:
# pairwise Pearson correlations between the within-transformed variables
within.loc[:, stat_cols].corr(method='pearson')

Unnamed: 0,cash_mktsec_ratio,salesgrowth,log_totalassets,netincome_ratio,nwc,totaldebt_ratio
cash_mktsec_ratio,1.0,-0.001449,0.01418,-0.006578,0.005161,0.007797
salesgrowth,-0.001449,1.0,0.025722,-0.000695,-0.001422,0.005155
log_totalassets,0.01418,0.025722,1.0,0.088236,0.129879,-0.137889
netincome_ratio,-0.006578,-0.000695,0.088236,1.0,0.015577,-0.113361
nwc,0.005161,-0.001422,0.129879,0.015577,1.0,-0.327857
totaldebt_ratio,0.007797,0.005155,-0.137889,-0.113361,-0.327857,1.0


#### Between Transform

In [16]:
# pairwise Pearson correlations between the between-transformed (firm-mean) variables
between.loc[:, stat_cols].corr(method='pearson')

Unnamed: 0,cash_mktsec_ratio,salesgrowth,log_totalassets,netincome_ratio,nwc,totaldebt_ratio
cash_mktsec_ratio,1.0,-0.017653,-0.246273,-0.012142,-0.020908,0.029853
salesgrowth,-0.017653,1.0,0.005679,-0.025447,0.004046,-0.00079
log_totalassets,-0.246273,0.005679,1.0,0.069432,0.058905,-0.084088
netincome_ratio,-0.012142,-0.025447,0.069432,1.0,0.134633,-0.406359
nwc,-0.020908,0.004046,0.058905,0.134633,1.0,-0.47798
totaldebt_ratio,0.029853,-0.00079,-0.084088,-0.406359,-0.47798,1.0
