In [1]:
# https://github.com/fnielsen/everything
from everything import *

In [2]:
# Read the dataframe with features for companies.
# index_col=0 makes the first CSV column the row index; the file is decoded
# as UTF-8.
# NOTE(review): `expanduser` and `read_csv` come from the star import above,
# presumably os.path.expanduser and pandas.read_csv - confirm.
filename = expanduser('~/workspace/cvrminer/virksomheder-features.csv')
df = read_csv(filename, encoding='utf-8', index_col=0)

In [3]:
# Feature names: the column labels of the company dataframe
df.columns

Index([u'antal_penheder', u'branche_ansvarskode', u'nyeste_antal_ansatte',
       u'nyeste_virksomhedsform', u'reklamebeskyttet', u'sammensat_status',
       u'nyeste_statuskode', u'stiftelsesaar'],
      dtype='object')

In [4]:
# Functions for conversion to numerical dataframes
def to_dummies(df, column):
    """Return a numerical dataframe representing one column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the column to convert.
    column : str
        Name of the column in `df`.

    Returns
    -------
    pandas.DataFrame
        Integer and floating-point columns are returned unchanged as a
        one-column dataframe, boolean columns are cast to integers, and
        object columns are expanded to one indicator column per distinct
        value, named ``'<column>:<value>'``.

    Raises
    ------
    ValueError
        If the column has any other datatype.

    """
    datatype = df[column].dtype
    # NumPy kind codes: 'i' signed integer, 'u' unsigned integer, 'f' float.
    # Dispatching on the kind accepts every width (int32, float32, uint8, ...)
    # instead of only int64 and float64.
    kind = getattr(datatype, 'kind', None)
    if kind in ('i', 'u', 'f'):
        return df[[column]]
    elif datatype == bool:
        return df[[column]].astype(int)
    elif datatype == 'object':
        # str.get_dummies splits every value on '|' by default, so a value
        # that contains '|' is expanded to several indicator columns.
        indicators = df[column].str.get_dummies()
        indicators.columns = [column + ":" + value
                              for value in indicators.columns]
        return indicators
    else:
        raise ValueError('Unrecognized datatype for column {}'.format(column))

        
def dataframe_to_numerical(df):
    """Convert every column of `df` to numerical columns.

    Each column is passed through `to_dummies` and the resulting pieces are
    placed side by side in the original column order. The name of each
    column is printed as it is processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with the columns to convert.

    Returns
    -------
    pandas.DataFrame
        Dataframe with the converted columns of all input columns.

    """
    # Seed with an empty frame carrying the index so that a dataframe without
    # columns still yields an (empty) result instead of a concat error.
    pieces = [DataFrame(index=df.index)]
    for column in df.columns:
        print(column)
        pieces.append(to_dummies(df, column))
    # One concatenation instead of a join per column: joining inside the loop
    # copies the growing frame on every iteration, and a join on an index
    # with duplicate labels would multiply rows.
    return pd.concat(pieces, axis=1)

In [5]:
# Numerical dataframe: convert every column of `df` (the column names are
# printed while they are processed) and show the resulting (rows, columns).
dfn = dataframe_to_numerical(df)
dfn.shape

antal_penheder
branche_ansvarskode
nyeste_antal_ansatte
nyeste_virksomhedsform
reklamebeskyttet
sammensat_status
nyeste_statuskode
stiftelsesaar


(1529578, 86)

In [6]:
# Preprocessing: impute missing values, then scale the columns without
# centering them (with_mean=False). The array result is wrapped back into a
# dataframe carrying the labels of `dfn`.
imputer = Imputer()
scaler = StandardScaler(with_mean=False)
dfni = DataFrame(
    scaler.fit_transform(imputer.fit_transform(dfn)),
    index=dfn.index,
    columns=dfn.columns,
)

In [7]:
# Frequency of each value in the `nyeste_statuskode` column
df['nyeste_statuskode'].value_counts()

None    1470472
3         37049
1         19948
5          1351
2           217
4           210
6           175
9           115
8            38
7             3
Name: nyeste_statuskode, dtype: int64

In [8]:
# Frequency of each value in the `sammensat_status` column
df['sammensat_status'].value_counts()

Ophørt                             570763
Aktiv                              437184
NORMAL                             272911
OPLØSTEFTERKONKURS                  66128
TVANGSOPLØST                        58405
OPLØSTEFTERERKLÆRING                39246
OPLØSTEFTERFRIVILLIGLIKVIDATION     36951
OPLØSTEFTERFUSION                   20581
UNDERKONKURS                         8182
SLETTET                              7570
OPLØSTEFTERSPALTNING                 6467
UNDERTVANGSOPLØSNING                 2892
UNDERFRIVILLIGLIKVIDATION            1668
Fremtid                               460
UNDERREASSUMERING                      67
slettet                                36
UDENRETSVIRKNING                       30
UNDERREKONSTRUKTION                    11
AKTIV                                   8
OPLØST                                  6
Slettet                                 4
UNDERREASUMMERING                       3
SLETTES                                 2
OPLØSTEFTERTVANGSOPLØSNING        

In [9]:
# Cross-tabulate the two status columns: rows are `sammensat_status`,
# columns are `nyeste_statuskode`.
pd.crosstab(index=df['sammensat_status'], columns=df['nyeste_statuskode'])

nyeste_statuskode,1,2,3,4,5,6,7,8,9,None
sammensat_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AKTIV,0,0,0,0,0,0,0,0,0,8
Aktiv,179,31,447,12,39,5,0,4,13,436454
Fremtid,0,0,0,0,0,0,0,0,0,460
NORMAL,16,24,5,0,1,0,0,0,1,272864
OPLØST,0,0,0,0,0,0,0,0,0,6
OPLØSTEFTERERKLÆRING,4,3,0,0,0,0,0,0,2,39237
OPLØSTEFTERFRIVILLIGLIKVIDATION,5,6,0,1,6,0,0,0,3,36930
OPLØSTEFTERFUSION,13,5,0,0,1,0,0,0,2,20560
OPLØSTEFTERKONKURS,11646,9,34348,40,630,78,0,3,18,19356
OPLØSTEFTERSPALTNING,0,0,1,0,0,0,0,0,0,6466


In [10]:
# Keep only the companies whose status is 'Aktiv' or 'OPLØSTEFTERKONKURS'.
indices = df.sammensat_status.isin(['Aktiv', u'OPLØSTEFTERKONKURS']).values
# Select rows with the boolean mask through .loc: the .ix indexer is
# deprecated and no longer available in later pandas versions. The copy
# makes `dfs` independent of `df`.
dfs = df.loc[indices, :].copy()
dfs.shape

(503312, 8)

In [11]:
# Binary response variable: 1 where the status is 'OPLØSTEFTERKONKURS',
# 0 for every other row.
dfs['konkurs'] = dfs['sammensat_status'].eq(u'OPLØSTEFTERKONKURS').astype(int)

In [12]:
def transform_year(year, origin=2000):
    """Shift a year so that `origin` becomes zero.

    Parameters
    ----------
    year : number or array-like
        Year or years to shift; anything that supports subtraction.
    origin : number, optional
        Year that is mapped to zero. Defaults to 2000.

    Returns
    -------
    Same type as ``year - origin``
        The number of years relative to `origin`.

    """
    return year - origin


# Logistic regression (binomial GLM) of the bankruptcy indicator on the
# company features.
# Terms currently left out of the formula:
#   nyeste_virksomhedsform + nyeste_statuskode
konkurs_formula = (
    'konkurs ~ np.log(antal_penheder+1) + C(nyeste_antal_ansatte) + '
    'branche_ansvarskode + reklamebeskyttet + transform_year(stiftelsesaar)'
)
konkurs_model = smf.glm(konkurs_formula, data=dfs,
                        family=sm.families.Binomial())
results = konkurs_model.fit()

In [13]:
# Print the summary of the fitted GLM
print(results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                konkurs   No. Observations:               268243
Model:                            GLM   Df Residuals:                   268222
Model Family:                Binomial   Df Model:                           20
Link Function:                  logit   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -75790.
Date:                Thu, 18 Aug 2016   Deviance:                   1.5158e+05
Time:                        22:30:48   Pearson chi2:                 2.97e+05
No. Iterations:                    26                                         
                                        coef    std err          z      P>|z|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------------------
Intercept                            -0.1821      0.187     -0.976      0.329        

In [14]:
# Rank the active companies by their estimated bankruptcy probability.
actives = dfs.sammensat_status.isin(['Aktiv']).values
# .loc with the boolean mask replaces the deprecated .ix indexer.
active_rows = dfs.loc[actives, :]

# Columns read by the model formula; keep this list in sync with the formula.
model_columns = ['antal_penheder', 'nyeste_antal_ansatte',
                 'branche_ansvarskode', 'reklamebeskyttet', 'stiftelsesaar']

# The fit used fewer observations than `dfs` has rows, so the model columns
# contain missing values. Rows with missing values are dropped by the formula
# machinery, which would leave the predictions shorter than (and positionally
# misaligned with) the active rows. Score only the complete rows and leave
# NaN for the others, so that position i of `y_est` always refers to row i of
# the active subset. A length mismatch here raises instead of silently
# misaligning.
complete = active_rows[model_columns].notnull().all(axis=1).values
y_est = np.full(len(active_rows), np.nan)
if complete.any():
    y_est[complete] = np.asarray(
        results.predict(exog=active_rows.loc[complete, :]))

# Positions of the active rows from highest to lowest estimate; NaN entries
# (rows that could not be scored) are sorted to the end.
indices = argsort(-y_est)

In [15]:
# dfs.ix[actives, :].iloc[indices, :].head(100)