# Test Suite
---

# Setup

## Prepare Functionality

In [1]:
%matplotlib inline
import math
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from db import connection, engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, f1_score
from sklearn.utils import shuffle

from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
import numpy as np
import helpers as fdn

In [2]:
print('Your pandas version is {}. Please use version 0.23.1'.format(pd.__version__))
print('Your numpy version is {}. Please use version 1.13.1'.format(np.__version__))
# import sklearn
# print('The scikit-learn version is {}. Please use version 0.20.1'.format(sklearn.__version__))

Your pandas version is 0.23.1. Please use version 0.23.1
Your numpy version is 1.13.1. Please use version 1.13.1


In [3]:
# Prepare Attributes
def cleanData(df, filters):
    if 'beschaffungsstelle_plz' in filters:
        df[['beschaffungsstelle_plz']] = df[['beschaffungsstelle_plz']].applymap(fdn.tonumeric)
    if 'gatt_wto' in filters:
        df[['gatt_wto']] = df[['gatt_wto']].applymap(fdn.unifyYesNo)
    if 'preis' in filters:
        df[['preis']] = df[['preis']].applymap(fdn.createPriceCategory)
    if 'anzahl_angebote' in filters:
        df[['anzahl_angebote']] = df[['anzahl_angebote']].applymap(fdn.tonumeric)
    if 'teilangebote' in filters:
        df[['teilangebote']] = df[['teilangebote']].applymap(fdn.unifyYesNo)
    if 'lose' in filters:
        df[['lose']] = df[['lose']].applymap(fdn.unifyYesNo)
    if 'varianten' in filters:
        df[['varianten']] = df[['varianten']].applymap(fdn.unifyYesNo)
    if 'projekt_titel' in filters:
        vectorizer = CountVectorizer(binary=True)
        X = vectorizer.fit_transform(df['projekt_titel'].values)
        text_columns = vectorizer.get_feature_names()
        title_df = pd.DataFrame(X.todense(), columns=text_columns)
        df = pd.concat([df, title_df], axis=1)
        df = df.drop('projekt_titel', axis=1)
    return df

In [4]:
def prepareForRun(df_pos, df_neg_all, filterAttributes):
    # What attributes the model will be trained by
    filters = ['Y', 'ausschreibung_cpv'] + filterAttributes
    df_ready_all = []
    for df_neg in df_neg_all:
        # Merge positive and negative df into one, only use selected attributes
        df_tmp = df_pos.append(df_neg, ignore_index=True)[filters].copy()
        # Clean the data of all selected attributes
        df_tmp = cleanData(df_tmp, filterAttributes)
        df_ready_all.append(df_tmp)
    return df_ready_all

In [5]:
def runDecisionTree(dataFrame, trees, depth):
    xTests = [];
    yTests = [];
    for idx, df in enumerate(dataFrame): # enum to get index
        df
        run = shuffle(df)
        # Put responses in one arry and all diesired properties in another
        y = run.iloc[:, 0]
        x = run.iloc[:, 1:] # Every column but the first
        # create sets
        xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25)
        # train the model on training sets
    #    clf = tree.DecisionTreeClassifier()
        clf = RandomForestClassifier(n_estimators=trees, max_depth=depth, random_state=0)
        clf = clf.fit(xtrain, ytrain)
        print(clf.score(xtrain, ytrain))    # TODO: Explain
        # predict on the test sets
        prediction = clf.predict(xtest)
        # pandas.series to data frame
        df_ytest = ytest.to_frame()
        # add run number to df
        df_ytest['run'] = idx
        xtest['run'] = idx
        # add prediction to df
        df_ytest['prediction']= prediction
        # add result of run to df
        df_ytest['correct'] = df_ytest['prediction']==df_ytest['Y']
        # add run to run arrays
        xTests.append(xtest)
        yTests.append(df_ytest)
    return xTests, yTests

In [6]:
def getAccuracies(dfys):  
    res = pd.DataFrame(columns=['accuracy', 'f1_score', 'fn rate'])
    for dfy in dfys:
        acc = round(accuracy_score(dfy.Y, dfy.prediction), 4)
        f1 = round(f1_score(dfy.Y, dfy.prediction), 4)
        cm = confusion_matrix(dfy.Y, dfy.prediction)
        fnr = round(cm[1][0] / (cm[0][0] + cm[1][0]), 4)   # TODO: Double check if correct
        res.loc[len(res)] = [ acc*100, f1*100, fnr*100 ] # add row to end of df, *100 for better % readability
    return res

In [7]:
def getConfusionMatices(dfys):  
    res = pd.DataFrame(columns=['tn', 'tp', 'fp', 'fn'])
    for dfy in dfys:
        # ConfusionMatrix legende:
        # [tn, fp]
        # [fn, tp]
        cm = confusion_matrix(dfy.Y, dfy.prediction)
        res.loc[len(res)] = [ cm[0][0], cm[1][1], cm[0][1], cm[1][0] ]
    res.loc['sum'] = res.sum() # Summarize each column
    return res

## Choose Institution & Get Data
Only needs to be done once per bidder

In [8]:
# Choose a bidder to train a model for (number of positive marked after the name)

#anbieter = 'Arnold AG' #1006
#anbieter = 'Alpiq AG' #827
#anbieter = 'Siemens AG' #641
anbieter = 'Marti AG' #621
#anbieter = 'Swisscom' #602
#anbieter = 'Axpo AG' #577
#anbieter = 'Hewlett-Packard' #155
#anbieter = 'BG Ingénieurs Conseils' SA #151
#anbieter = 'Pricewaterhousecoopers' # 92
#anbieter = 'Helbling Beratung + Bauplanung AG' #67
#anbieter = 'Ofrex SA' #40
#anbieter = 'PENTAG Informatik AG' #40
#anbieter = 'Wicki Forst AG' #30
#anbieter = 'T-Systems Schweiz' #30
#anbieter = 'Bafilco AG' #20
#anbieter = '4Video-Production GmbH' #20
#anbieter = 'Widmer Ingenieure AG' #10
#anbieter = 'hmb partners AG' #10
#anbieter = 'Planmeca' #5
#anbieter = 'K & M Installationen AG' #5

In [None]:
select_anbieter = (
    "anbieter.anbieter_id, "
    "anbieter.institution as anbieter_institution, "
    "cpv_dokument.cpv_nummer as anbieter_cpv, "
    "ausschreibung.meldungsnummer"
)
# anbieter_CPV are all the CPVs the Anbieter ever won a procurement for. So all the CPVs they are interested in. 
select_ausschreibung = (
    "anbieter.anbieter_id, "
    "auftraggeber.institution as beschaffungsstelle_institution, "
    "auftraggeber.beschaffungsstelle_plz, "
    "ausschreibung.gatt_wto, "
    "ausschreibung.sprache, "
    "ausschreibung.auftragsart_art, "
    "ausschreibung.lose, "
    "ausschreibung.teilangebote, "
    "ausschreibung.varianten, "
  #  "ausschreibung.titel, " TODO: Projektbeschrieb
    "ausschreibung.bietergemeinschaft, "
    "projekt.projekt_titel, "
    "cpv_dokument.cpv_nummer as ausschreibung_cpv, "
    "ausschreibung.meldungsnummer"
)
# Get all positive and negative responses
responses_positive, full_negative = fdn.createAnbieterDf(select_anbieter, select_ausschreibung, anbieter)
responses_positive.head(3)

# Tuning

***
***ToDo:***
* TEST WITH BIDDERS OF DIFFERENT SIZE!
* Add Auftrags_art as catagory!!
* Create function to read all which tenderings are FPs / FNs
* Create a n times n input for all attributes
* Test with Random Forest
* Enable better auto Evaluation Feedback. Maybe some Graphs or something
* Prepare (and test with) more Attributes
* Test Model only with Tenderings from the same ream / CPV category
* ("ctrl + F" all TODOs in this file)
* (Take a look at warning when tree is run)

***Fragen:***
* Müssen wir die Freihänder beachten?
* Spielen Attribute des Zuschlags überhaupt eine Rolle?
* Welche Attribute wollen wir noch anschauen / einbringen?
* Wenn wir nur den CPV verwenden, ist die Anwendung besser als ein normaler Filter?
* Könnte unser Algorithmus einen Bias haben, da wir mehrer CPV miteinander kombinieren, wenn wir die Tables laden?

***

In [10]:
# ratio that the positive and negative responses have to each other
positive_to_negative_ratio = 2/3

In [11]:
# train n different models on n different (reproducable) sample sizes
runs = 15

In [12]:
# Attributes ready for use: 'beschaffungsstelle_plz', 'gatt_wto', 'lose', 'teilangebote', 'varianten'

# Next focus: 'beschaffungsstelle_institution', 'titel', 'sprache', 'auftragsart_art' <-- AUFTRAGSART!

# ???: 'Preis', 'anzahl_angebote'

attributes = ['beschaffungsstelle_plz', 'gatt_wto', 'lose', 'teilangebote', 'varianten', 'projekt_titel']

In [13]:
# Tune Random Forest Parameter
trees = 100
depth = 14

# Model Creation

In [14]:
# create the chosen amount reproducable samples for negative DataFrames with the ratio definded above
responses_negative_all = fdn.createNegativeResponses(
    full_negative,
    len(responses_positive),
    runs,
    positive_to_negative_ratio)

# Assign positive and negative lables to both DFs
responses_positive['Y'] = 1
for df in responses_negative_all:
    df['Y'] = 0

In [26]:
import warnings
warnings.filterwarnings('ignore') # hide some "slice of copy" warnings
df_for_decision_tree = prepareForRun(responses_positive, responses_negative_all, attributes)
xTests, yTests = runDecisionTree(df_for_decision_tree, trees, depth)

0.9726315789473684
0.9636842105263158
0.9768421052631578
0.9736842105263158
0.968421052631579
0.9647368421052631
0.9689473684210527
0.9673684210526315
0.9642105263157895
0.9573684210526315
0.978421052631579
0.9731578947368421
0.9742105263157895
0.9636842105263158
0.9747368421052631


# Evaluation

In [35]:
stats = {
    'attribute_count': len(df_for_decision_tree[0].columns),
}

pd.DataFrame.from_dict(stats, orient="index")

Unnamed: 0,0
attribute_count,2663


In [36]:
getConfusionMatices(yTests)

Unnamed: 0,tn,tp,fp,fn
0,396,206,17,15
1,404,201,12,17
2,381,222,13,18
3,371,234,15,14
4,363,245,13,13
5,375,236,10,13
6,384,220,11,19
7,407,202,14,11
8,393,217,13,11
9,386,215,6,27


In [18]:
print(attributes)
getAccuracies(yTests)

['beschaffungsstelle_plz', 'gatt_wto', 'lose', 'teilangebote', 'varianten', 'projekt_titel']


Unnamed: 0,accuracy,f1_score,fn rate
0,93.22,91.06,6.77
1,95.43,93.68,3.47
2,96.53,95.26,3.22
3,95.27,93.48,2.26
4,95.74,94.67,1.87
5,95.11,92.67,4.01
6,94.16,92.7,5.48
7,95.27,93.62,3.27
8,94.16,92.04,2.79
9,95.27,94.02,3.92
