# Single Run, Train Model

Use this notebook to perform single runs with the ModelTrainer.
A run is defined by a combination of 'anbieter', 'attributes' and any other option available in the config object.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from learn import ModelTrainer
from collection import Collection

In [2]:
# Create the shared Collection only on the first execution of this cell; on a
# re-run the already existing object (and whatever it holds) is kept.
if 'collection' not in globals():
    collection = Collection()

# Part 1: Setup Helpers (manual)

This part can be skipped in most runs and is only useful to handle special situations


## Persist current collection

In [None]:
# Persist the current collection to a JSON file (relative path, as given).
collection.to_file('dbs/sql-query-fix.json')

## Restore from file (careful)

The next cell restores a list of runs from a JSON file. Be careful: the import is called with `force=True`.

In [None]:
# Restore runs from the JSON file into `collection`.
# NOTE(review): force=True presumably replaces what the collection currently
# holds — confirm in Collection.import_file before running.
collection.import_file('dbs/sql-query-fix.json', force=True)

## Reset model cache

Reset the cache whenever a new query has to be executed, e.g. after the anbieter has changed.

**TODO:** Maybe implement cache invalidation in ModelTrainer.

In [None]:
# `trainer` is only created in Part 2, so on a fresh kernel that cell has to
# be executed before this one.
trainer.resetSQLData()

## Set config

In [23]:
# Push the current `config` dict into the existing trainer. Both `trainer` and
# `config` are defined in Part 2, so this cell only works after those cells ran.
trainer.config = config

## Set attributes in trainer

# Part 2: Config

Here you can configure the run by setting a few variables.

The variables are hopefully self-explanatory.

In [3]:
def cleanData(x):
    """Identity placeholder: hand back the input unchanged (no cleaning applied)."""
    return x
# Prepare Attributes
#def cleanData(df, filters):
#    if 'gatt_wto' in filters:
#        df[['gatt_wto']] = df[['gatt_wto']].applymap(ModelTrainer.unifyYesNo)
#    if 'anzahl_angebote' in filters:
#        df[['anzahl_angebote']] = df[['anzahl_angebote']].applymap(ModelTrainer.tonumeric)
#    if 'teilangebote' in filters:
#        df[['teilangebote']] = df[['teilangebote']].applymap(ModelTrainer.unifyYesNo)
#    if 'lose' in filters:
#        df[['lose']] = df[['lose']].applymap(ModelTrainer.unifyYesNo)
#    if 'varianten' in filters:
#        df[['varianten']] = df[['varianten']].applymap(ModelTrainer.unifyYesNo)
#    if 'auftragsart_art' in filters:
#        auftrags_art_df = pd.get_dummies(df['auftragsart_art'], prefix='aftrgsrt',dummy_na=True)
#        df = pd.concat([df,auftrags_art_df],axis=1).drop(['auftragsart_art'],axis=1)
#    if 'sprache' in filters:
#        sprache_df = pd.get_dummies(df['sprache'], prefix='lang',dummy_na=True)
#        df = pd.concat([df,sprache_df],axis=1).drop(['sprache'],axis=1)
#    if 'auftragsart' in filters:
#        auftragsart_df = pd.get_dummies(df['auftragsart'], prefix='auftr',dummy_na=True)
#        df = pd.concat([df,auftragsart_df],axis=1).drop(['auftragsart'],axis=1)
#    if 'beschaffungsstelle_plz' in filters:
#        plz_df = pd.get_dummies(df['beschaffungsstelle_plz'], prefix='beschaffung_plz',dummy_na=True)
#        df = pd.concat([df,plz_df],axis=1).drop(['beschaffungsstelle_plz'],axis=1)
#    return df

In [4]:
# Select list handed to ModelTrainer: one entry per column, joined with ", ".
# Note: anbieter_CPV denotes all CPVs an Anbieter has ever won a procurement
# for, i.e. all the CPVs they are interested in.
select = ", ".join((
    "ausschreibung.meldungsnummer",
    "anbieter.institution as anbieter_institution",
    "auftraggeber.beschaffungsstelle_plz",
    "ausschreibung.gatt_wto",
    "ausschreibung.sprache",
    "ausschreibung.auftragsart",
    "ausschreibung.auftragsart_art",
    "ausschreibung.lose",
    "ausschreibung.teilangebote",
    "ausschreibung.varianten",
    # "ausschreibung.titel",
    "ausschreibung.bietergemeinschaft",
    "cpv_dokument.cpv_nummer as ausschreibung_cpv",
))

In [21]:
config = dict(
    # Ratio between positive and negative responses.
    positive_to_negative_ratio=0.5,
    # Share of the training set held out for testing (at least 25% recommended).
    test_size=0.25,
    draw_tree=True,
    runs=100,
    # Alternatives that were toggled here before:
    #   skip_cross_val=True
    #   enabled_algorithms=['random_forest']
    #   enabled_algorithms=['random_forest', 'decision_tree', 'gradient_boost']
    enabled_algorithms=['random_forest', 'decision_tree'],
    # Tuning parameters, one nested dict per algorithm name.
    random_forest=dict(
        n_estimators=100,
        max_features='sqrt',
        max_depth=None,
        min_samples_split=4,
    ),
    decision_tree=dict(
        max_depth=30,
        min_samples_split=4,
        max_features='sqrt',
    ),
    gradient_boost=dict(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        max_features='sqrt',
    ),
)

In [6]:
# Build the trainer once per kernel session. An existing `trainer` is reused,
# so re-creating `select` or `config` afterwards does not rebuild it.
if 'trainer' not in globals():
    trainer = ModelTrainer(select, '', config, [])

In [7]:
# Attributes assigned to the trainer for this run.
# Smaller sets used before:
#   ['auftragsart_art']
#   ['ausschreibung_cpv', 'auftragsart_art']
attributes = [
    'ausschreibung_cpv',
    'auftragsart_art',
    'beschaffungsstelle_plz',
    'auftragsart',
    'gatt_wto',
    'lose',
    'teilangebote',
    'varianten',
    'sprache',
]
trainer.attributes = attributes

In [8]:
# Choose a bidder (anbieter) to train a model for; the number after each name is its count of positives

# === THESIS ===

anbieter = 'Alpiq AG' #430
#anbieter = 'Swisscom' #302
#anbieter = 'Kummler + Matter AG' #160
#anbieter = 'Siemens AG' #532

#anbieter = 'G. Baumgartner AG' #65
#anbieter = 'ELCA Informatik AG' #125
#anbieter = 'Thermo Fisher Scientific (Schweiz) AG' #160
#anbieter = 'Arnold AG' #82

#anbieter = 'Riget AG' #21
#anbieter = 'isolutions AG' #16
#anbieter = 'CSI Consulting AG' #21
#anbieter = 'Aebi & Co. AG Maschinenfabrik' #15

#anbieter = 'DB Schenker AG' #6
#anbieter = 'IT-Logix AG' #12
#anbieter = 'AVS Syteme AG' #14
#anbieter = 'Sajet SA' #7

# === TESTING ===

#anbieter = 'Marti AG' #456
#anbieter = 'Axpo AG' #40
#anbieter = 'Hewlett-Packard' #90
#anbieter = 'BG Ingénieurs Conseils SA' #116
#anbieter = 'Pricewaterhousecoopers' #42
#anbieter = 'Helbling Beratung + Bauplanung AG' #20
#anbieter = 'Ofrex SA' #52
#anbieter = 'PENTAG Informatik AG' #10
#anbieter = 'Wicki Forst AG' #12
#anbieter = 'T-Systems Schweiz' #18
#anbieter = 'Bafilco AG' #20
#anbieter = '4Video-Production GmbH' #3
#anbieter = 'Widmer Ingenieure AG' #6
#anbieter = 'hmb partners AG' #2
#anbieter = 'Planmeca' #4
#anbieter = 'K & M Installationen AG' #4

# Hand the chosen bidder to the trainer. Per "Reset model cache" in Part 1,
# the cached SQL data has to be reset when the anbieter changes.
trainer.anbieter = anbieter

# Part 3: Run

Use the configured ModelTrainer to train a model and check the metrics in the output

In [24]:
# Run the trainer with its current settings and keep the returned results in
# `output` for the inspection cells that follow.
output = trainer.run()



[0.9024390243902439, 0.8048780487804879, 0.975609756097561, 0.975609756097561, 0.8292682926829268]
Avg. CV Score | decision_tree Run 0: 0.90
[0.9512195121951219, 0.926829268292683, 0.9024390243902439, 0.9024390243902439, 0.8048780487804879]
Avg. CV Score | decision_tree Run 1: 0.90
[0.975609756097561, 0.8536585365853658, 0.975609756097561, 0.926829268292683, 1.0]
Avg. CV Score | decision_tree Run 2: 0.95
[0.9024390243902439, 0.9512195121951219, 0.9024390243902439, 0.9024390243902439, 0.9024390243902439]
Avg. CV Score | decision_tree Run 3: 0.91
[0.9024390243902439, 0.9024390243902439, 0.9024390243902439, 0.9512195121951219, 0.8536585365853658]
Avg. CV Score | decision_tree Run 4: 0.90
[0.8780487804878049, 0.926829268292683, 1.0, 0.926829268292683, 0.9024390243902439]
Avg. CV Score | decision_tree Run 5: 0.93
[0.9024390243902439, 0.9512195121951219, 0.9512195121951219, 0.9024390243902439, 0.8780487804878049]
Avg. CV Score | decision_tree Run 6: 0.92
[0.8536585365853658, 0.85365853658536

[0.926829268292683, 0.9024390243902439, 0.926829268292683, 0.975609756097561, 0.9512195121951219]
Avg. CV Score | decision_tree Run 59: 0.94
[1.0, 0.9512195121951219, 0.8780487804878049, 0.9024390243902439, 0.9024390243902439]
Avg. CV Score | decision_tree Run 60: 0.93
[0.8536585365853658, 0.975609756097561, 1.0, 0.975609756097561, 0.975609756097561]
Avg. CV Score | decision_tree Run 61: 0.96
[0.8536585365853658, 0.975609756097561, 0.9024390243902439, 0.975609756097561, 0.926829268292683]
Avg. CV Score | decision_tree Run 62: 0.93
[0.9512195121951219, 0.8780487804878049, 0.9512195121951219, 0.9512195121951219, 0.8780487804878049]
Avg. CV Score | decision_tree Run 63: 0.92
[0.8536585365853658, 0.9512195121951219, 1.0, 0.975609756097561, 0.9512195121951219]
Avg. CV Score | decision_tree Run 64: 0.95
[0.9024390243902439, 0.8292682926829268, 0.8048780487804879, 0.9024390243902439, 0.975609756097561]
Avg. CV Score | decision_tree Run 65: 0.88
[0.975609756097561, 0.926829268292683, 0.9756097

In [25]:
# NOTE(review): the saved output of this cell is `KeyError: 'random_forest'`,
# i.e. `output` had no such key in that run — confirm the structure returned
# by trainer.run() before relying on this lookup.
pd.DataFrame.from_dict(output['random_forest']['data'])

KeyError: 'random_forest'

In [None]:
# Fetch the trainer's prepared data as three separate parts for manual inspection.
positives, negatives, duplicates = trainer.prepare_data()

In [None]:
# Display the negatives returned by trainer.prepare_data().
negatives

In [None]:
# Number of positives returned by trainer.prepare_data().
len(positives)

In [None]:
# Show the 'duplicates' entry of the run output as a DataFrame.
pd.DataFrame.from_dict(output['duplicates'])

In [None]:
# Add this run's output to the collection.
collection.append(output)

In [None]:
# Collected runs for 'random_forest' as a DataFrame (going by the method name — confirm).
collection.get_all_as_df('random_forest')

In [None]:
# Inspect the collection's `list` attribute directly.
collection.list