# Outlier detection

In [1]:
# https://github.com/fnielsen/everything
from everything import *

In [2]:
# Read dataframe with features for companies
filename = expanduser('~/workspace/cvrminer/virksomheder-features.csv')
df = read_csv(filename, encoding='utf-8', index_col=0)

In [3]:
# Feature names
df.columns

Index([u'antal_penheder', u'branche_ansvarskode', u'nyeste_antal_ansatte',
       u'nyeste_virksomhedsform', u'reklamebeskyttet', u'sammensat_status',
       u'sidste_virksomhedsstatus', u'stiftelsesaar'],
      dtype='object')

In [4]:
# Functions for conversion to numerical dataframes
def to_dummies(df, column):
    datatype = df[column].dtypes
    if datatype in [int64, float64]:
        return df[[column]]
    elif datatype == bool:
        return df[[column]].astype(int)
    elif datatype == 'object':
        df_column = df[column].str.get_dummies()
        df_column.columns = [column + ":" + col for col in df_column.columns]
        return df_column
    else:
        raise ValueError('Unrecognized datatype for column {}'.format(column))
        
def dataframe_to_numerical(df):
    df_numerical = DataFrame(index=df.index)
    for column in df.columns:
        print(column)
        df_numerical = df_numerical.join(to_dummies(df, column))
    return df_numerical

In [5]:
# Numerical dataframe
dfn = dataframe_to_numerical(df)
dfn.shape

antal_penheder
branche_ansvarskode
nyeste_antal_ansatte
nyeste_virksomhedsform
reklamebeskyttet
sammensat_status
sidste_virksomhedsstatus
stiftelsesaar


(1521456, 98)

In [6]:
dfn.describe()



Unnamed: 0,antal_penheder,branche_ansvarskode:0,branche_ansvarskode:15,branche_ansvarskode:65,branche_ansvarskode:75,branche_ansvarskode:76,branche_ansvarskode:96,branche_ansvarskode:97,branche_ansvarskode:99,branche_ansvarskode:None,...,sidste_virksomhedsstatus:UNDER FRIVILLIG LIKVIDATION,sidste_virksomhedsstatus:UNDER KONKURS,sidste_virksomhedsstatus:UNDER REASSUMERING,sidste_virksomhedsstatus:UNDER REASSUMMERING,sidste_virksomhedsstatus:UNDER REASUMMATION,sidste_virksomhedsstatus:UNDER REASUMMERING,sidste_virksomhedsstatus:UNDER REKONSTRUKTION,sidste_virksomhedsstatus:UNDER TVANGSOPLØSNING,sidste_virksomhedsstatus:slettet,stiftelsesaar
count,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,...,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1521456.0,1517460.0
mean,1.002027,0.001102891,0.001934989,0.004729023,0.0006730395,1.31453e-06,0.0001557718,6.046839e-05,0.0001097633,0.9912327,...,0.001096976,0.005636706,8.938806e-05,6.572651e-07,6.572651e-07,2.629061e-06,7.229917e-06,0.001912642,2.366155e-05,1999.948
std,2.358024,0.03319149,0.04394594,0.06860512,0.02593428,0.001146529,0.01247989,0.00777591,0.01047623,0.09322232,...,0.03310246,0.07486613,0.009454106,0.0008107189,0.0008107189,0.001621436,0.002688842,0.04369193,0.004864257,14.32302
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1197.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
max,1323.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2016.0


In [7]:
# Preprocessing
imputer = Imputer()
scaler = StandardScaler(with_mean=False)
dfni = scaler.fit_transform(imputer.fit_transform(dfn))

In [8]:
# Outlier detection/novelty detection with K-means clustering
clusterer = MiniBatchKMeans(n_clusters=8, verbose=True)
clusterer.fit(dfni)

  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, init_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.batch_size)
  0, n_samples - 1, self.

Init 1/3 with method: k-means++
Inertia for init 1/3: 17680.132072
Init 2/3 with method: k-means++
Inertia for init 2/3: 17822.679048
Init 3/3 with method: k-means++
Inertia for init 3/3: 17328.389567
Minibatch iteration 1/1521500: mean batch inertia: 57.735447, ewa inertia: 57.735447 
Minibatch iteration 2/1521500: mean batch inertia: 57.171227, ewa inertia: 57.735372 
Minibatch iteration 3/1521500: mean batch inertia: 41.496874, ewa inertia: 57.733238 
Minibatch iteration 4/1521500: mean batch inertia: 146.608975, ewa inertia: 57.744921 
Minibatch iteration 5/1521500: mean batch inertia: 40.094582, ewa inertia: 57.742601 
Minibatch iteration 6/1521500: mean batch inertia: 27.508770, ewa inertia: 57.738626 
Minibatch iteration 7/1521500: mean batch inertia: 74.640563, ewa inertia: 57.740848 
Minibatch iteration 8/1521500: mean batch inertia: 38.592726, ewa inertia: 57.738331 
Minibatch iteration 9/1521500: mean batch inertia: 42.478083, ewa inertia: 57.736325 
[MiniBatchKMeans] Reassi

MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=8,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=True)

In [9]:
distances = sum((dfni - clusterer.cluster_centers_[clusterer.labels_, :]) ** 2, axis=1)
indices_clusterer = argsort(-distances)

In [10]:
dfn.iloc[indices_clusterer[:20], :]

Unnamed: 0_level_0,antal_penheder,branche_ansvarskode:0,branche_ansvarskode:15,branche_ansvarskode:65,branche_ansvarskode:75,branche_ansvarskode:76,branche_ansvarskode:96,branche_ansvarskode:97,branche_ansvarskode:99,branche_ansvarskode:None,...,sidste_virksomhedsstatus:UNDER FRIVILLIG LIKVIDATION,sidste_virksomhedsstatus:UNDER KONKURS,sidste_virksomhedsstatus:UNDER REASSUMERING,sidste_virksomhedsstatus:UNDER REASSUMMERING,sidste_virksomhedsstatus:UNDER REASUMMATION,sidste_virksomhedsstatus:UNDER REASUMMERING,sidste_virksomhedsstatus:UNDER REKONSTRUKTION,sidste_virksomhedsstatus:UNDER TVANGSOPLØSNING,sidste_virksomhedsstatus:slettet,stiftelsesaar
cvr_nummer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15706538,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1992.0
20899301,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1998.0
21976415,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1999.0
35852492,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2014.0
31086477,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2007.0
36467223,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2014.0
56799559,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1985.0
27255116,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,2003.0
29219508,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,2006.0
36073888,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2014.0


In [11]:
filename = expanduser('~/workspace/cvrminer/virksomheder-report.html')
with codecs.open(filename, 'w', encoding='utf-8') as f:
    f.write("""
<html>
  <head>
    <title>Virksomheder report</title>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
  </head>
  <body>
    <h1>Virksomheder report</h1>
    
""")
    
    f.write("""<h2>Numerical features dataframe summary statistics</h2>""")
    f.write(dfn.describe().T.to_html())
    f.write("""<h2>Novelty from k-means</h2>""")
    f.write(dfn.iloc[indices_clusterer[:50], :].to_html(
            escape=False,
            formatters={'__index__': 
                        lambda idx: '<a href="http://datacvr.virk.dk/data/visenhed?enhedstype=virksomhed&id={}">{}</a>'.format(
                     idx, idx)}))
    f.write("""
  </body>
</html>""")

In [None]:
# Investigate cluster model as a function of number of clusters
inertias = []
for n_clusters in range(1, 50):
    clusterer = MiniBatchKMeans(n_clusters=n_clusters, max_iter=200, max_no_improvement=30, n_init=10)
    clusterer.fit(dfni)
    inertias.append(clusterer.inertia_)

In [None]:
plot(inertias)
show()

In [None]:
# Outlier detection with One-class-SVM
one_class_svm = OneClassSVM()
one_class_svm.fit(dfni)

In [None]:
decisions = one_class_svm.decision_function(dfni)

In [None]:
indices = argsort(decisions, axis=0)[:, 0]

In [None]:
dfn.iloc[indices[:30], :]

In [None]:
help(dfn.iloc[indices_clusterer[:20], :].to_html)