In [4]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import sys

In [5]:
# Current working directory of the kernel, as a pathlib.Path.
path = Path.cwd()

In [6]:
# Two directory levels above the current working directory; the data and
# case-study folders defined in the next cell are resolved relative to it.
base_dir = path.parent.parent

In [7]:
# Project directories, built component by component so the separators are
# correct on every operating system (previously hard-coded Windows backslashes).
# The trailing "" component keeps a trailing separator on the directory strings,
# so plain string concatenation such as data_out + "work5.csv" remains valid.
data_in = os.path.join(str(base_dir), "da_data_repo", "bisnode-firms", "clean", "")
data_out = os.path.join(str(base_dir), "da_data_repo", "bisnode-firms", "")
output = os.path.join(data_out, "output")
func = os.path.join(str(base_dir), "da_case_studies", "ch00-tech-prep", "")

In [8]:
###########################################################
# Import data
###########################################################

csv_path = os.path.join(data_in, "cs_bisnode_panel.csv")

# Read the raw panel, discard the columns that are mostly missing,
# and leave out the observations from 2016.
data = (
    pd.read_csv(csv_path)
    .drop(columns=['COGS', 'finished_prod', 'net_dom_sales', 'net_exp_sales', 'wages'])
    .loc[lambda panel: panel['year'] != 2016]
)

In [9]:
###########################################################
# label engineering
###########################################################

# Add all missing year and comp_id combinations -
# originally missing combinations will have NAs in all other columns.
# Reindexing on the full year x comp_id product inserts those rows directly
# as NaN. No placeholder string is involved, so the columns are not pushed
# through object dtype and no genuine cell value can be mistaken for the
# placeholder. Rows come out ordered by year, then comp_id.
data = (
    data.set_index(['year', 'comp_id'])
    .pipe(
        lambda panel: panel.reindex(
            # index.levels holds the sorted distinct years and comp_ids
            pd.MultiIndex.from_product(panel.index.levels, names=panel.index.names)
        )
    )
    .reset_index()
)

In [10]:
# generate status_alive; if sales larger than zero and not-NA, then firm is alive
# Both conditions are parenthesised explicitly: `&` binds tighter than `>`, so
# the earlier `sales > 0 & (...)` was parsed as `sales > (0 & ...)` and the
# not-NA check never actually took part in the result.
data['status_alive'] = ((data['sales'] > 0) & data['sales'].notna()).astype(int)

In [11]:
# A firm defaults when it has sales (is alive) in the current year
# but is not alive two years later.
# The status two years later is status_alive shifted by -2 within each comp_id;
# this assumes rows are ordered by year inside every firm.
data['default'] = (
    data['status_alive'].eq(1)
    & data.groupby('comp_id')['status_alive'].shift(-2).eq(0)
).astype(int)

In [12]:
# Keep observations up to and including 2013.
# NOTE(review): presumably because `default` looks two years ahead and cannot be
# labelled for later years - confirm against the last year present in the data.
# .copy() detaches the result from the pre-filter frame, so the column
# assignments in later cells write to an independent DataFrame and do not
# trigger pandas' SettingWithCopyWarning.
data = data.query('year <= 2013').copy()

In [13]:
# Quick sanity check of the label and of sales.
# Only the last expression of a cell is rendered, so the summary of `default`
# is computed but not shown; the output below is the summary of `sales`.
data['default'].describe()
data['sales'].describe()

count    2.151190e+05
mean     4.860478e+05
std      3.885493e+06
min     -1.472559e+07
25%      5.344444e+03
50%      3.055555e+04
75%      1.083537e+05
max      1.110294e+08
Name: sales, dtype: float64

In [14]:
def _log_or_zero(values):
    """Return ln(values) where values > 0, 0 where values <= 0, NaN where missing.

    The logarithm is evaluated only on the strictly positive entries, so zero
    entries no longer trigger NumPy's "divide by zero encountered in log"
    RuntimeWarning.

    Parameters
    ----------
    values : array-like of numbers (NaN marks a missing value)

    Returns
    -------
    numpy.ndarray of float with the same length as `values`.
    """
    values = np.asarray(values, dtype=float)
    # Start from the fallback: NaN stays NaN, everything else is 0.
    result = np.where(np.isnan(values), np.nan, 0.0)
    positive = values > 0  # NaN compares as False
    result[positive] = np.log(values[positive])
    return result


# Negative sales are replaced by 1. This has to happen outside of assign(),
# because the columns created there are computed from the corrected sales.
data['sales'] = np.where(data['sales'] < 0, 1, data['sales'])
data = data.assign(
    ln_sales=_log_or_zero(data['sales']),  # NaN remain NaN
    sales_mil=data['sales'] / 1000000,
    sales_mil_log=_log_or_zero(data['sales'] / 1000000),
)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [31]:
# Change in log sales (million) relative to the previous row of the same firm.
data['d1_sales_mil_log'] = data['sales_mil_log'].sub(
    data.groupby('comp_id')['sales_mil_log'].shift(periods=1)
)

In [32]:
# Set the sales change to 0 for new firms and add a dummy that captures them.

# Age in years since founding; negative ages are floored at 0, missing stays missing.
data['age'] = (data['year'] - data['founded_year']).clip(lower=0)

# new = 1 for firms aged at most one year or with a balance sheet that does not
# cover a full year; missing when age is missing; 0 otherwise.
data['new'] = np.select(
    [(data['age'] <= 1) | (data['balsheet_notfullyear'] == 1), data['age'].isna()],
    [1, np.nan],
    default=0,
)

# New firms get a sales change of 0; an unknown `new` makes the change unknown.
data['d1_sales_mil_log'] = np.select(
    [data['new'] == 1, data['new'].isna()],
    [0, np.nan],
    default=data['d1_sales_mil_log'],
)

# Whatever is still missing is flagged as new and its change set to 0.
data['new'] = data['new'].mask(data['d1_sales_mil_log'].isna(), 1)
data['d1_sales_mil_log'] = data['d1_sales_mil_log'].fillna(0)

In [34]:
# Note: for comp_id 180325105664, d1_sales_mil_log is set to NaN where `new` is NaN.

In [36]:
###########################################################
# sample design
###########################################################

# Cross-section: firms observed in 2012 that are alive in that year.
data = data.loc[(data['year'] == 2012) & (data['status_alive'] == 1)]
# Keep firms with revenues between 1000 euros and 10m euros (both bounds inclusive).
data = data.loc[data['sales_mil'].between(0.001, 10)]

In [45]:
# Distribution of the default flag in the final sample; because the flag is
# 0/1, its mean is the share of firms flagged as defaulting.
data['default'].describe()

count    21723.000000
mean         0.205681
std          0.404207
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: default, dtype: float64

In [46]:
# Save the final sample. os.path.join (as already used for the input CSV)
# inserts a separator only when one is needed, so the target path no longer
# depends on data_out ending with a separator.
data.to_csv(os.path.join(data_out, "work5.csv"), index=False)