In [1]:
import pandas as pd
import numpy as np

from linearmodels import PooledOLS, PanelOLS
from linearmodels.panel import RandomEffects, compare
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
defaults = pd.read_csv('data/data_defaults.csv', index_col=0, names=['REGN_GKO', 'DATE', 'DEFAULT'],
                      skiprows=1)
defaults.DATE = pd.to_datetime(defaults.DATE, format='%Y%m%d')
regressors = pd.read_csv('data/final_data.csv', index_col=0)
regressors.DATE = pd.to_datetime(regressors.DATE, format='%Y-%m-%d')

In [3]:
final = regressors.merge(defaults, how='left', on=['REGN_GKO', 'DATE']).set_index(['REGN_GKO', 'DATE'])
final.DEFAULT = final.DEFAULT.fillna(0)

final.DEFAULT = final.DEFAULT + final.DEFAULT.shift(-1) + final.DEFAULT.shift(-2) + final.DEFAULT.shift(-3) + final.DEFAULT.shift(-4)
final.DEFAULT = final.DEFAULT.fillna(0)

final = final.fillna(method='ffill').dropna()

In [4]:
final.replace([np.inf, -np.inf], np.nan, inplace=True)

In [5]:
final.to_csv('final.csv')

In [6]:
X = sm.add_constant(final.drop(columns='DEFAULT').fillna(method='ffill'))
y = final.DEFAULT

In [7]:
lm_pooled = PooledOLS(y, X).fit(cov_type="clustered")
print(lm_pooled.summary)

                          PooledOLS Estimation Summary                          
Dep. Variable:                DEFAULT   R-squared:                        0.0195
Estimator:                  PooledOLS   R-squared (Between):              0.1227
No. Observations:                1814   R-squared (Within):              -0.0095
Date:                Mon, Jun 03 2024   R-squared (Overall):              0.0195
Time:                        08:48:47   Log-likelihood                    642.70
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      3.5936
Entities:                          91   P-value                           0.0001
Avg Obs:                       19.934   Distribution:                 F(10,1803)
Min Obs:                       14.000                                           
Max Obs:                       20.000   F-statistic (robust):             5.4843
                            

In [8]:
model = sm.Logit(y, X).fit(cov_type='HC0')

# Выводим результаты
print(model.summary())

         Current function value: 0.129842
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                DEFAULT   No. Observations:                 1814
Model:                          Logit   Df Residuals:                     1803
Method:                           MLE   Df Model:                           10
Date:                Mon, 03 Jun 2024   Pseudo R-squ.:                 0.04423
Time:                        08:48:48   Log-Likelihood:                -235.53
converged:                      False   LL-Null:                       -246.44
Covariance Type:                  HC0   LLR p-value:                   0.01615
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const            -24.5901      0.433    -56.765      0.000     -25.439     -23.741
MAX_NORM          -0.0017      0.001     -1.184      

  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


In [10]:
import collections.abc
collections.Iterable = collections.abc.Iterable
import pylogit as pl

In [14]:
# The 'alt_id_column' is the name of a column to be created in the long-format data
# It will identify the alternative associated with each row.
alt_id_column = "DATE"

# The "obs_id_column" is a custom id column that ignores the fact that this is a 
# panel/repeated-observations dataset. This column denotes each individual choice
# situation.
obs_id_column = "REGN_GKO"

# The "choice_column" records the name of the column that denotes whether or not each
# individual chose the alternative on a given row.
choice_column = "DEFAULT"

In [24]:
from collections import OrderedDict
basic_specification = OrderedDict()

basic_specification['MAX_NORM'] = 1 
basic_specification['CAP_TO_NA'] = 1  
basic_specification['RES_TO_NA'] = 1
basic_specification['NA'] = 1
basic_specification['BI_TO_NA'] = 1
basic_specification['NI_TO_NA'] = 1
basic_specification['L_NORM'] = 1
basic_specification['I_NORM'] = 1
basic_specification['IS_SZKO'] = 1
basic_specification['REGISTERED_MSC'] = 1

In [25]:
basic_names = OrderedDict()

basic_names["MAX_NORM"] = 'Н7'
basic_names["CAP_TO_NA"] = 'Капитал к резервам'
basic_names["RES_TO_NA"] = 'Резервы к чистым активам'
basic_names["NA"] = 'Чистые активы'
basic_names["BI_TO_NA"] = 'Балансовая прибыль к чистым активам'
basic_names["NI_TO_NA"] = 'Чистая прибыль к чистым активам'
basic_names["L_NORM"] = 'Н3'
basic_names["I_NORM"] = 'Н10.1'
basic_names["IS_SZKO"] = 'СЗКО'
basic_names["REGISTERED_MSC"] = 'Рег Москва'
basic_names["DEFAULT"] = 'Дефолт'

In [28]:
logit = pl.create_choice_model(data=final.reset_index().fillna(method='ffill'),
                               alt_id_col=alt_id_column,
                               obs_id_col=obs_id_column,
                               choice_col=choice_column,
                               specification=basic_specification,
                               model_type="MNL",
                               names=basic_names)

TypeError: specification_dict[MAX_NORM] must be 'all_same', 'all_diff', or a list.

In [None]:
logit.fit_mle(np.zeros(14))

# Look at the estimation results
logit.get_statsmodels_summary()