# Problem analytics

## This notebook is very similar to the 'Train Model' notebook

While the 'Train Model' notebook should stay in a clean (always working) state, this notebook can be used to troubleshoot single runs.

Multiple code snippets are copy-pasted from the other files.

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from learn import ModelTrainer
from collection import Collection

In [2]:
# Reuse a collection that is still alive from an earlier run of this cell;
# build a fresh one only while the name is unbound.
if 'collection' not in globals():
    collection = Collection()

# Part 1: Setup Helpers (manual)

## Persist current collection

In [3]:
# Persist the current collection to dbs/tune.json.
collection.to_file('dbs/tune.json')

## Restore from file (careful)

In the next box you can restore a list of runs from a json file

In [4]:
# Restore the list of runs from dbs/tune.json.
# NOTE(review): force=True presumably replaces what `collection` currently
# holds -- confirm before running.
collection.import_file('dbs/tune.json', force=True)

## Reset model cache

This needs to be done whenever a new query has to be run, e.g. when the anbieter (bidder) has changed.

TODO: maybe implement cache invalidation in ModelTrainer.

In [5]:
# Manual helper: `trainer` is only created in "Part 2: Config", so this cell
# raises NameError when the notebook is executed top to bottom.
trainer.resetSQLData()

NameError: name 'trainer' is not defined

## Set config

In [6]:
# Manual helper: both `trainer` and `config` are defined further down in
# "Part 2: Config"; run this cell only after those cells have been executed.
trainer.config = config

NameError: name 'config' is not defined

## Set attributes in trainer

# Part 2: Config

In [3]:
def cleanData(x):
    """Return ``x`` unchanged.

    Identity placeholder that keeps the name ``cleanData`` bound while the
    attribute-specific cleaning routine below is commented out.
    """
    return x
# Prepare Attributes
#def cleanData(df, filters):
#    if 'gatt_wto' in filters:
#        df[['gatt_wto']] = df[['gatt_wto']].applymap(ModelTrainer.unifyYesNo)
#    if 'anzahl_angebote' in filters:
#        df[['anzahl_angebote']] = df[['anzahl_angebote']].applymap(ModelTrainer.tonumeric)
#    if 'teilangebote' in filters:
#        df[['teilangebote']] = df[['teilangebote']].applymap(ModelTrainer.unifyYesNo)
#    if 'lose' in filters:
#        df[['lose']] = df[['lose']].applymap(ModelTrainer.unifyYesNo)
#    if 'varianten' in filters:
#        df[['varianten']] = df[['varianten']].applymap(ModelTrainer.unifyYesNo)
#    if 'auftragsart_art' in filters:
#        auftrags_art_df = pd.get_dummies(df['auftragsart_art'], prefix='aftrgsrt',dummy_na=True)
#        df = pd.concat([df,auftrags_art_df],axis=1).drop(['auftragsart_art'],axis=1)
#    if 'sprache' in filters:
#        sprache_df = pd.get_dummies(df['sprache'], prefix='lang',dummy_na=True)
#        df = pd.concat([df,sprache_df],axis=1).drop(['sprache'],axis=1)
#    if 'auftragsart' in filters:
#        auftragsart_df = pd.get_dummies(df['auftragsart'], prefix='auftr',dummy_na=True)
#        df = pd.concat([df,auftragsart_df],axis=1).drop(['auftragsart'],axis=1)
#    if 'beschaffungsstelle_plz' in filters:
#        plz_df = pd.get_dummies(df['beschaffungsstelle_plz'], prefix='beschaffung_plz',dummy_na=True)
#        df = pd.concat([df,plz_df],axis=1).drop(['beschaffungsstelle_plz'],axis=1)
#    return df

In [4]:
# SELECT column list for the bidder (anbieter) part of the query.
# anbieter_cpv covers every CPV the bidder ever won a procurement for, i.e.
# all the CPVs they are interested in.
select_anbieter = ", ".join([
    # "anbieter.anbieter_id",
    "anbieter.institution as anbieter_institution",
    "cpv_dokument.cpv_nummer as anbieter_cpv",
    "ausschreibung.meldungsnummer",
])

# SELECT column list for the tender (ausschreibung) part of the query.
select_ausschreibung = ", ".join([
    # "anbieter.anbieter_id",
    "auftraggeber.institution as beschaffungsstelle_institution",
    "auftraggeber.beschaffungsstelle_plz",
    "ausschreibung.gatt_wto",
    "ausschreibung.sprache",
    "ausschreibung.auftragsart",
    "ausschreibung.auftragsart_art",
    "ausschreibung.lose",
    "ausschreibung.teilangebote",
    "ausschreibung.varianten",
    # "ausschreibung.titel",
    "ausschreibung.bietergemeinschaft",
    "cpv_dokument.cpv_nummer as ausschreibung_cpv",
    "ausschreibung.meldungsnummer as meldungsnummer2",
])

In [5]:
# Settings handed to ModelTrainer: sampling options plus one parameter block
# per algorithm.
config = {
    # Ratio between the positive and the negative responses
    'positive_to_negative_ratio': 0.5,
    # Fraction of the training set that is used for testing (at least 0.25 is recommended)
    'test_size': 0.25,
    # NOTE(review): presumably the number of repetitions per algorithm -- confirm in ModelTrainer
    'runs': 100,
    #'enabled_algorithms': ['random_forest'],
    # 'skip_cross_val': True,
    # Each enabled name has a parameter block under the same key below
    'enabled_algorithms': ['random_forest', 'decision_tree', 'gradient_boost'],
    'random_forest': {
        # Random forest tuning parameters
        'n_estimators': 400,
        'max_features': 'sqrt',
        'max_depth': None,
        'min_samples_split': 2
    },
    'decision_tree': {
        'max_depth': 6,
        'max_features': 'sqrt'
    },
    'gradient_boost': {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 6,
        'max_features': 'sqrt'
    }
}

In [7]:
# Keep an already constructed trainer alive across re-runs of this cell;
# instantiate one only while the name is still unbound.
if 'trainer' not in globals():
    trainer = ModelTrainer(select_anbieter, select_ausschreibung, '', config, [])

In [8]:
#attributes = ['auftragsart_art']
attributes = [
    'ausschreibung_cpv',
    'auftragsart_art',
    'beschaffungsstelle_plz',
    'auftragsart',
    'gatt_wto',
    'lose',
    'teilangebote',
    'varianten',
    'sprache',
]
# Hand the selected attributes over to the trainer.
trainer.attributes = attributes

In [9]:
# Choose a bidder to train a model for. The number after each name is the
# count of positively marked entries. Exactly one assignment should be active.

# === THESIS ===

#anbieter = 'Alpiq AG' #430
#anbieter = 'Swisscom' #302
#anbieter = 'Kummler + Matter AG' #160
anbieter = 'Siemens AG' #532

#anbieter = 'G. Baumgartner AG' #65
#anbieter = 'ELCA Informatik AG' #125
#anbieter = 'Thermo Fisher Scientific (Schweiz) AG' #160
#anbieter = 'Arnold AG' #82

#anbieter = 'Riget AG' #21
#anbieter = 'isolutions AG' #16
#anbieter = 'CSI Consulting AG' #21
#anbieter = 'Aebi & Co. AG Maschinenfabrik' #15

#anbieter = 'DB Schenker AG' #6
#anbieter = 'IT-Logix AG' #12
#anbieter = 'AVS Syteme AG' #14
#anbieter = 'Sajet SA' #7

# === TESTING ===

#anbieter = 'Marti AG' #456
#anbieter = 'Axpo AG' #40
#anbieter = 'Hewlett-Packard' #90
#anbieter = 'BG Ingénieurs Conseils SA' #116
#anbieter = 'Pricewaterhousecoopers' #42
#anbieter = 'Helbling Beratung + Bauplanung AG' #20
#anbieter = 'Ofrex SA' #52
#anbieter = 'PENTAG Informatik AG' #10
#anbieter = 'Wicki Forst AG' #12
#anbieter = 'T-Systems Schweiz' #18
#anbieter = 'Bafilco AG' #20
#anbieter = '4Video-Production GmbH' #3
#anbieter = 'Widmer Ingenieure AG' #6
#anbieter = 'hmb partners AG' #2
#anbieter = 'Planmeca' #4
#anbieter = 'K & M Installationen AG' #4

# Register the chosen bidder on the trainer.
trainer.anbieter = anbieter

# Problem analysis

In [10]:
# Run the SQL query on the trainer. The return value is echoed as the cell
# output; append a trailing `;` to suppress that dump.
# NOTE(review): presumably this also fills trainer.positives / trainer.negatives
# that later cells read -- confirm in ModelTrainer.
trainer.queryData()

INFO:learn:sql done


(     anbieter_institution  anbieter_cpv  meldungsnummer  \
 0              Siemens AG      72000000          380721   
 1              Siemens AG      45216110          384773   
 2              Siemens AG      33124200          390515   
 3              Siemens AG      33113000          393571   
 4              Siemens AG      33115100          404931   
 5              Siemens AG      33115100          404931   
 6              Siemens AG      33111000          406415   
 7              Siemens AG      33115100          408753   
 8              Siemens AG      45000000          412181   
 9              Siemens AG      45000000          412459   
 10             Siemens AG      31200000          414053   
 11             Siemens AG      31200000          414053   
 12             Siemens AG      31200000          414053   
 13             Siemens AG      31200000          414053   
 14             Siemens AG      35262000          417223   
 15             Siemens AG      35262000

In [30]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True stacks both frames under a fresh 0..n-1
# index in the same way.
df = pd.concat([trainer.positives, trainer.negatives], ignore_index=True)
df['beschaffungsstelle_plz']

0          8090
1          8090
2          1205
3          1023
4          8057
5          8057
6          8091
7          8180
8          4601
9          4601
10         8022
11         8022
12         8022
13         8022
14         3011
15         3011
16         3011
17         3011
18         8022
19         8022
20         8022
21         8022
22         1011
23         1205
24         1205
25         1205
26         1205
27         3001
28         3030
29         3030
           ... 
3744314    8031
3744315    8031
3744316    8031
3744317    8031
3744318    8031
3744319    8031
3744320    8031
3744321    8031
3744322    8031
3744323    8031
3744324    8031
3744325    8031
3744326    8031
3744327    8031
3744328    8031
3744329    8031
3744330    8031
3744331    8031
3744332    8031
3744333    8031
3744334    4001
3744335    4001
3744336    4001
3744337    4001
3744338    4002
3744339    4002
3744340    4002
3744341    4002
3744342    8092
3744343    8451
Name: beschaffungsstelle

In [150]:
from db import engine

# Load every distinct beschaffungsstelle_plz value of the auftraggeber table.
df = pd.read_sql(
    sql='select DISTINCT(beschaffungsstelle_plz) from auftraggeber;',
    con=engine,
)
df

Unnamed: 0,beschaffungsstelle_plz
0,","
1,-
2,.
3,0
4,00-834
5,0000
6,00000
7,0000000000
8,00100
9,00197


In [151]:
import re


def transformToSingleInt(plz):
    """Extract a four-digit postal code (1000-9999) from a raw PLZ value.

    Args:
        plz: Raw value, e.g. ``"8090"``, ``"CH-8952"``, ``8090`` or a missing
            value (``None`` / ``NaN``).

    Returns:
        The postal code as ``int`` when the value is, or contains, a
        standalone four-digit number between 1000 and 9999; otherwise ``None``.
    """
    try:
        result = int(plz)
    except (TypeError, ValueError, OverflowError):
        # Not directly convertible (free text, None, NaN, infinity).
        # Only strings can be searched; everything else counts as missing.
        if not isinstance(plz, str):
            return None
        # Look for a run of exactly four digits. The look-arounds keep longer
        # numbers such as "10557" from being truncated to "1055".
        match = re.search(r"(?<!\d)\d{4}(?!\d)", plz)
        if match is None:
            return None
        result = int(match.group())

    return result if 1000 <= result <= 9999 else None

# Normalise every raw PLZ value in place; values that cannot be mapped to a
# code between 1000 and 9999 come back as None and show up as null.
df['beschaffungsstelle_plz'] = df['beschaffungsstelle_plz'].apply(transformToSingleInt)
df
#df[df['beschaffungsstelle_plz'].isnan()]

Unnamed: 0,beschaffungsstelle_plz
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


In [152]:
# Show the rows whose PLZ is null.
df[df['beschaffungsstelle_plz'].isnull()]

Unnamed: 0,beschaffungsstelle_plz
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


In [153]:
# Derive two coarser columns from the PLZ: x/1000 ("district") and x/100
# ("area"), both floored. NaN values are passed through unchanged.
split = {
    'district': lambda x: math.floor(x/1000) if not math.isnan(x) else x,
    'area': lambda x: math.floor(x/100) if not math.isnan(x) else x,
}
prefix = 'b_plz_'

for key, applyFun in split.items():
    df[ prefix + key ] = df['beschaffungsstelle_plz'].apply(applyFun)
    
# The full PLZ continues under the name b_plz_ganz.
# NOTE: because of this in-place rename the cell fails when run a second time
# -- the source column 'beschaffungsstelle_plz' no longer exists afterwards.
df.rename(columns={'beschaffungsstelle_plz': prefix+ 'ganz'}, inplace=True)

# One-hot encode each of the three columns (dummy_na=True adds a *_nan
# indicator column) and drop the encoded source column.
for key in ['ganz'] + list(split.keys()):
    key = prefix + key
    df = pd.concat([df, pd.get_dummies(df[key], prefix=key, dummy_na=True)], axis=1).drop(key, axis=1)
    
df

Unnamed: 0,b_plz_ganz_1000.0,b_plz_ganz_1001.0,b_plz_ganz_1002.0,b_plz_ganz_1003.0,b_plz_ganz_1004.0,b_plz_ganz_1005.0,b_plz_ganz_1006.0,b_plz_ganz_1007.0,b_plz_ganz_1008.0,b_plz_ganz_1009.0,...,b_plz_area_88.0,b_plz_area_89.0,b_plz_area_90.0,b_plz_area_91.0,b_plz_area_92.0,b_plz_area_93.0,b_plz_area_94.0,b_plz_area_95.0,b_plz_area_96.0,b_plz_area_nan
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
# prepare_data() returns the prepared positive and negative frames; the
# positives are displayed here.
prepared_positives, prepared_negatives = trainer.prepare_data()
prepared_positives



Unnamed: 0,gatt_wto,lose,teilangebote,varianten,Y,cpv_division_3,cpv_division_9,cpv_division_14,cpv_division_15,cpv_division_16,...,beschaffung_plz_CH-8952,beschaffung_plz_CH_3003,beschaffung_plz_D-10557,beschaffung_plz_D-76137,beschaffung_plz_Gulshan 2,beschaffung_plz_MD 2009,beschaffung_plz_MD-2012,beschaffung_plz_W1H2 BQ,beschaffung_plz_—,beschaffung_plz_nan
0,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,2,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,1,2,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Display the prepared negative frame.
prepared_negatives

Unnamed: 0,gatt_wto,lose,teilangebote,varianten,Y,cpv_division_3,cpv_division_9,cpv_division_14,cpv_division_15,cpv_division_16,...,beschaffung_plz_CH-8952,beschaffung_plz_CH_3003,beschaffung_plz_D-10557,beschaffung_plz_D-76137,beschaffung_plz_Gulshan 2,beschaffung_plz_MD 2009,beschaffung_plz_MD-2012,beschaffung_plz_W1H2 BQ,beschaffung_plz_—,beschaffung_plz_nan
3687,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3691,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3695,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3795,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3796,1,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3940,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3944,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3945,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3949,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4049,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# NOTE(review): `merged` is not defined in any visible cell and this cell has
# no execution count -- it cannot run as-is.
filter_attributes = ['meldungsnummer'] + trainer.attributes
df = merged[filter_attributes].copy()

In [14]:
# Train one decision tree with the hyper-parameters stored in trainer.config
# and keep the returned test split for inspection.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score

algorithm = 'decision_tree'
max_depth = trainer.config[algorithm]['max_depth']
max_features = trainer.config[algorithm]['max_features']
# NOTE(review): no random_state is passed to the classifier, so repeated runs
# may differ -- confirm whether trainModel takes care of seeding.
classifier = DecisionTreeClassifier(
    max_depth=max_depth,
    max_features=max_features
)
runIndex = 1

# NOTE(review): `sample` is not defined in any visible cell -- presumably a
# frame combining the prepared positives and negatives; confirm.
x_test, y_test = trainer.trainModel(sample, classifier, algorithm, runIndex)
# The commented-out lines below appear to be the manual equivalent of
# trainModel, kept for step-by-step debugging.
# train, test = train_test_split(sample, random_state=runIndex)
# x_test = test.drop(['Y'], axis=1)
# x_train = train.drop(['Y'], axis=1)

# y_test = test[['Y']].copy()
# y_train = train[['Y']]

# classifier = classifier.fit(x_train, y_train['Y'])

# prediction = classifier.predict(x_test)


# y_test['run'] = runIndex
# x_test['run'] = runIndex

# y_test['prediction'] = prediction

# y_test['correct'] = y_test['prediction'] == y_test['Y']

[0.7482993197278912, 0.6870748299319728, 0.6258503401360545, 0.6190476190476191, 0.6870748299319728]
Avg. CV Score | decision_tree Run 1: 0.67


In [19]:
# Display the test labels returned by trainModel.
y_test

Unnamed: 0,Y,run,prediction,correct
298,1,1,0,False
758,0,1,0,True
655,0,1,0,True
152,1,1,1,True
264,1,1,0,False
717,0,1,0,True
575,0,1,0,True
80,1,1,0,False
709,0,1,0,True
533,0,1,0,True


# Archive

In [19]:
# np.float was only an alias for the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
prepared_positives.values.astype(float)

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 3., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# np.float was only an alias for the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
prepared_negatives.values.astype(float)

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
# Reduce on the underlying ndarray. Wrapping np.all(<DataFrame>) in the
# built-in all() only works while np.all hands back a per-column result; it
# raises TypeError as soon as the reduction collapses to a single scalar,
# which depends on the installed numpy/pandas versions.
bool(np.isfinite(prepared_positives.values).all())

True

In [16]:
# Reduce on the underlying ndarray. Wrapping np.all(<DataFrame>) in the
# built-in all() only works while np.all hands back a per-column result; it
# raises TypeError as soon as the reduction collapses to a single scalar,
# which depends on the installed numpy/pandas versions.
bool(np.isfinite(prepared_negatives.values).all())

True

In [18]:
# True means at least one null value is present in the prepared positives.
prepared_positives.isnull().values.any()

False

In [17]:
# True means at least one null value is present in the prepared negatives.
prepared_negatives.isnull().values.any()

False