In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from train import ModelTrainer
from collection import Collection

In [None]:
# Create the run collection only if this kernel session does not have one
# yet, so re-running the cell keeps the runs gathered so far.
if 'collection' not in globals():
    collection = Collection()

# Part 1: Setup Helpers (manual)

## Persist current collection

In [None]:
# Manual helper: write the current collection to 'dbs/tune.json'.
# The path is relative to the notebook's working directory.
collection.to_file('dbs/tune.json')

## Restore from file (careful)

The next cell restores a list of runs from a JSON file.

In [None]:
# Manual helper: load runs from 'dbs/tune.json' into the collection.
# NOTE(review): force=True presumably overrides a safety check or replaces
# the runs already held in memory - confirm in Collection.import_file first.
collection.import_file('dbs/tune.json', force=True)

## Reset model cache

This needs to be done whenever a new query has to be run, e.g. after the anbieter (bidder) has changed.

**TODO:** Maybe implement cache invalidation in `ModelTrainer`.

In [None]:
# Manual helper: needs `trainer`, which is only created in Part 2 below.
# Running this cell on a fresh kernel raises NameError.
trainer.resetSQLData()

## Set config

In [None]:
# Manual helper: push the `config` dict from Part 2 into the existing trainer.
# Both `trainer` and `config` are defined further down, so this cell raises
# NameError on a fresh kernel; run it only after the Part 2 cells.
trainer.config = config

## Set attributes in trainer

# Part 2: Config

In [None]:
# Prepare Attributes
def cleanData(df, filters):
    """Normalise and one-hot encode the attribute columns named in ``filters``.

    Only columns whose name appears in ``filters`` are touched; each such
    column must exist in ``df``.

    Args:
        df: pandas DataFrame holding the raw attribute columns.
        filters: container of attribute names (anything supporting ``in``).

    Returns:
        The cleaned DataFrame. Value-normalising steps assign into the passed
        ``df``; one-hot steps build a new frame in which the source column is
        replaced by its dummy columns. Always use the return value.
    """
    # Yes/no style flags are unified element-wise. Series.map replaces the
    # deprecated DataFrame.applymap. ModelTrainer is only looked up when one
    # of these columns was actually requested.
    for column in ('gatt_wto', 'teilangebote', 'lose', 'varianten'):
        if column in filters:
            df[column] = df[column].map(ModelTrainer.unifyYesNo)
    if 'anzahl_angebote' in filters:
        df['anzahl_angebote'] = df['anzahl_angebote'].map(ModelTrainer.tonumeric)

    # Categorical columns and the prefix used for their dummy columns.
    # The order here fixes the column order of the returned frame.
    # beschaffungsstelle_plz is one-hot encoded rather than cast to a number.
    one_hot_prefixes = (
        ('auftragsart_art', 'aftrgsrt'),
        ('sprache', 'lang'),
        ('auftragsart', 'auftr'),
        ('beschaffungsstelle_plz', 'beschaffung_plz'),
    )
    for column, prefix in one_hot_prefixes:
        if column in filters:
            # dummy_na=True adds a '<prefix>_nan' indicator for missing values.
            dummies = pd.get_dummies(df[column], prefix=prefix, dummy_na=True)
            df = pd.concat([df, dummies], axis=1).drop([column], axis=1)
    return df

In [None]:
# Column list passed to ModelTrainer as its first argument.
# anbieter_cpv covers all the CPVs the Anbieter ever won a procurement for,
# i.e. all the CPVs they are interested in.
select_anbieter = ", ".join([
    "anbieter.anbieter_id",
    "anbieter.institution as anbieter_institution",
    "cpv_dokument.cpv_nummer as anbieter_cpv",
    "ausschreibung.meldungsnummer",
])

# Column list passed to ModelTrainer as its second argument.
select_ausschreibung = ", ".join([
    "anbieter.anbieter_id",
    "auftraggeber.institution as beschaffungsstelle_institution",
    "auftraggeber.beschaffungsstelle_plz",
    "ausschreibung.gatt_wto",
    "ausschreibung.sprache",
    "ausschreibung.auftragsart",
    "ausschreibung.auftragsart_art",
    "ausschreibung.lose",
    "ausschreibung.teilangebote",
    "ausschreibung.varianten",
    # "ausschreibung.titel" is currently left out of the selection
    "ausschreibung.bietergemeinschaft",
    "cpv_dokument.cpv_nummer as ausschreibung_cpv",
    "ausschreibung.meldungsnummer as meldungsnummer2",
])

In [None]:
# Settings passed to ModelTrainer: sampling, number of runs and the
# per-algorithm parameters.
config = dict(
    # ratio that the positive and negative responses have to each other
    positive_to_negative_ratio=0.5,
    # percentage of training set that is used for testing
    # (recommendation of at least 25%)
    test_size=0.25,
    runs=100,
    # alternative used during tuning: ['random_forest'] only
    enabled_algorithms=['random_forest', 'decision_tree', 'gradient_boost'],
    # Random Forest tuning parameters
    random_forest=dict(
        n_estimators=400,
        max_features='sqrt',
        max_depth=None,
        min_samples_split=2,
    ),
    decision_tree=dict(
        max_depth=6,
        max_features='sqrt',
    ),
    gradient_boost=dict(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        max_features='sqrt',
    ),
)

In [None]:
# Build the trainer only once per kernel session; re-running this cell keeps
# the existing instance. The bidder ('') and attribute list ([]) start out
# empty and are assigned in the cells below.
if 'trainer' not in globals():
    trainer = ModelTrainer(select_anbieter, select_ausschreibung, '', config, cleanData, [])

In [None]:
# Attributes handed to the trainer.
# Minimal alternative kept for reference: ['auftragsart_art']
attributes = [
    'auftragsart_art',
    'beschaffungsstelle_plz',
    'gatt_wto',
    'lose',
    'teilangebote',
    'varianten',
    'sprache',
]
trainer.attributes = attributes

In [None]:
# Choose the bidder to train a model for by leaving exactly one assignment
# uncommented (the number of positives is noted after each name).

# === THESIS ===

#anbieter = 'Alpiq AG' #430
#anbieter = 'Swisscom' #302
anbieter = 'Kummler + Matter AG' #160
#anbieter = 'Siemens AG' #532

#anbieter = 'G. Baumgartner AG' #65
#anbieter = 'ELCA Informatik AG' #125
#anbieter = 'Thermo Fisher Scientific (Schweiz) AG' #160
#anbieter = 'Arnold AG' #82

#anbieter = 'Riget AG' #21
#anbieter = 'isolutions AG' #16
#anbieter = 'CSI Consulting AG' #21
#anbieter = 'Aebi & Co. AG Maschinenfabrik' #15

#anbieter = 'DB Schenker AG' #6
#anbieter = 'IT-Logix AG' #12
# NOTE(review): 'Syteme' may be a typo for 'Systeme' - verify against the DB.
#anbieter = 'AVS Syteme AG' #14
#anbieter = 'Sajet SA' #7

# === TESTING ===

#anbieter = 'Marti AG' #456
#anbieter = 'Axpo AG' #40
#anbieter = 'Hewlett-Packard' #90
# NOTE(review): "SA" originally sat outside the quotes, which would be a
# syntax error once uncommented - confirm the exact name stored in the DB.
#anbieter = 'BG Ingénieurs Conseils SA' #116
#anbieter = 'Pricewaterhousecoopers' #42
#anbieter = 'Helbling Beratung + Bauplanung AG' #20
#anbieter = 'Ofrex SA' #52
#anbieter = 'PENTAG Informatik AG' #10
#anbieter = 'Wicki Forst AG' #12
#anbieter = 'T-Systems Schweiz' #18
#anbieter = 'Bafilco AG' #20
#anbieter = '4Video-Production GmbH' #3
#anbieter = 'Widmer Ingenieure AG' #6
#anbieter = 'hmb partners AG' #2
#anbieter = 'Planmeca' #4
#anbieter = 'K & M Installationen AG' #4

# Hand the selected bidder to the trainer (requires the `trainer` cell above).
trainer.anbieter = anbieter

# Part 3: Run

In [None]:
# Start the run on the configured trainer and keep its result; the following
# cells display it and append it to `collection`.
output = trainer.run()

In [None]:
# Show the result of the most recent trainer.run() call.
output

In [None]:
# Record the latest run in the collection.
# NOTE(review): re-running this cell presumably stores the same output a
# second time - confirm how Collection.append treats duplicates.
collection.append(output)

In [None]:
# Bidder name of every run currently stored in the collection.
[run['anbieter'] for run in collection.list]

In [None]:
# Display every stored run in full (the raw list kept by the collection).
collection.list

In [None]:
# Random-forest results of all stored runs as a table, with the
# 'sample_size_mean' column left out.
(
    collection
    .get_all_as_df('random_forest')
    .drop(columns=['sample_size_mean'])
)

In [None]:
# Attributes of the first stored run for 'ELCA Informatik AG'.
# The default of None keeps the cell from raising StopIteration (it simply
# shows no output) when no run for that bidder is stored. The enumerate()
# index was unused and is dropped.
next(
    (run['attributes'] for run in collection.list
     if run['anbieter'] == 'ELCA Informatik AG'),
    None,
)

In [None]:
# Random-forest result of the first stored run for 'ELCA Informatik AG'.
# The lookup of 'random_forest' happens inside the generator so that the
# default of None can stand in when no run for that bidder is stored,
# instead of the cell raising StopIteration.
next(
    (run['random_forest'] for run in collection.list
     if run['anbieter'] == 'ELCA Informatik AG'),
    None,
)