# Test Suite Final
---

# Setup

## Prepare Functionality

In [1]:
%matplotlib inline
import math
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from db import connection, engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc, f1_score
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import helpers as fdn

In [2]:
# Print the installed pandas / numpy versions next to the versions this notebook asks for
print('My pandas version is {}. Please use version 0.23.1'.format(pd.__version__))
print('My numpy version is {}. Please use version 1.13.1'.format(np.__version__))
# The scikit-learn check is disabled:
# import sklearn
# print('The scikit-learn version is {}. Please use version 0.20.1'.format(sklearn.__version__))

My pandas version is 0.23.1. Please use version 0.23.1
My numpy version is 1.16.2. Please use version 1.13.1


In [3]:
# Prepare Attributes
def cleanData(df, filters):
    """Clean the attribute columns of ``df`` that are named in ``filters``.

    Each listed column is rewritten in place, cell by cell, with a converter
    from the ``fdn`` helpers module. 'auftragsart_art' is handled differently:
    it is replaced by dummy columns prefixed with 'aftrgsrt' (including one
    for missing values).

    Parameters:
        df: DataFrame holding the columns to clean; modified in place.
        filters: collection of column names that should be cleaned.

    Returns:
        ``df`` itself, or a new DataFrame when 'auftragsart_art' is in
        ``filters`` (dummy columns appended, original column dropped).
    """
    # Column name -> name of the converter in `fdn`. The converter is only
    # looked up when its column was actually requested.
    cell_converters = (
        ('beschaffungsstelle_plz', 'tonumeric'),
        ('gatt_wto', 'unifyYesNo'),
        ('preis', 'createPriceCategory'),
        ('anzahl_angebote', 'tonumeric'),
        ('teilangebote', 'unifyYesNo'),
        ('lose', 'unifyYesNo'),
        ('varianten', 'unifyYesNo'),
    )
    for column, converter_name in cell_converters:
        if column not in filters:
            continue
        df[[column]] = df[[column]].applymap(getattr(fdn, converter_name))

    if 'auftragsart_art' not in filters:
        return df

    # Encode the categorical column as dummies; dummy_na adds a column for missing values
    dummies = pd.get_dummies(df['auftragsart_art'], prefix='aftrgsrt', dummy_na=True)
    widened = pd.concat([df, dummies], axis=1)
    return widened.drop(['auftragsart_art'], axis=1)

In [4]:
def createNegativeResponses(full_neg, pos_df_size, amount_neg_df, pos_neg_ratio):
    """Draw several reproducible random samples from the negative responses.

    Parameters:
        full_neg: DataFrame with all negative responses to sample from.
        pos_df_size: number of positive rows the sample size is based on.
        amount_neg_df: how many samples to draw.
        pos_neg_ratio: ratio used to scale the sample size.

    Returns:
        A list of ``amount_neg_df`` DataFrames, each holding
        ``ceil(pos_df_size * (pos_neg_ratio + 1))`` rows of ``full_neg``.
    """
    rows_per_sample = math.ceil(pos_df_size * (pos_neg_ratio + 1))
    # The position of the sample doubles as its random seed, which keeps every draw reproducible
    return [
        full_neg.sample(rows_per_sample, random_state=seed)
        for seed in range(amount_neg_df)
    ]

In [5]:
def prepareForRun(df_pos, df_neg_all, filterAttributes):
    """Build one cleaned training frame per negative sample.

    Parameters:
        df_pos: DataFrame with the positive responses.
        df_neg_all: iterable of DataFrames, one negative sample per run.
        filterAttributes: list of extra attribute columns to keep and clean.

    Returns:
        A list with one DataFrame per entry of ``df_neg_all``. Each frame
        holds the positives followed by that negative sample, reduced to
        'Y', 'meldungsnummer', 'ausschreibung_cpv' plus ``filterAttributes``
        (in that order) and passed through ``cleanData``.
    """
    # Columns the model is trained on; the first three are always kept
    filters = ['Y', 'meldungsnummer', 'ausschreibung_cpv'] + filterAttributes
    df_ready_all = []
    for df_neg in df_neg_all:
        # pd.concat instead of DataFrame.append, which no longer exists in pandas >= 2.0
        merged = pd.concat([df_pos, df_neg], ignore_index=True)
        # Copy the column selection so cleaning never writes into a view of `merged`
        df_tmp = merged[filters].copy()
        df_ready_all.append(cleanData(df_tmp, filterAttributes))
    return df_ready_all

In [6]:
def prepareUnfilteredRun(df_pos, df_neg_all, filterAttributes):
    """Build one unfiltered frame (positives + one negative sample) per run.

    Parameters:
        df_pos: DataFrame with the positive responses.
        df_neg_all: iterable of DataFrames, one negative sample per run.
        filterAttributes: unused; kept so the signature matches prepareForRun.

    Returns:
        A list with one new DataFrame per entry of ``df_neg_all``: the rows
        of ``df_pos`` followed by the rows of that sample, all columns kept,
        with a fresh 0..n-1 index.
    """
    # pd.concat instead of DataFrame.append, which no longer exists in pandas >= 2.0;
    # concat already returns a new frame, so the inputs are left untouched
    return [
        pd.concat([df_pos, df_neg], ignore_index=True)
        for df_neg in df_neg_all
    ]

In [7]:
def createModel(algorithm, randomState):
    """Create an unfitted classifier for the requested algorithm.

    Parameters:
        algorithm: 'rf' for a random forest, 'gbt' for gradient boosted
            trees; any other value yields a plain decision tree.
        randomState: seed handed to the random forest / gradient boosting
            model. The decision tree fallback does not receive it.

    Returns:
        The new, unfitted estimator.

    Note:
        The notebook-level globals ``trees`` and ``depth`` are read when an
        'rf' or 'gbt' model is built, so they must be defined before the call.
    """
    if algorithm == 'rf':
        return RandomForestClassifier(
            n_estimators=trees,
            max_depth=depth,
            random_state=randomState,
        )
    if algorithm == 'gbt':
        return GradientBoostingClassifier(
            n_estimators=trees,
            learning_rate=1.0,
            max_depth=depth,
            random_state=randomState,
        )
    # Fallback for every other algorithm name
    return DecisionTreeClassifier()

In [8]:
def runTreeClassifier(dataFrame, classifier, test_size, algorithm='rf'):
    """Train and evaluate one model per prepared run.

    Parameters:
        dataFrame: iterable of DataFrames, one per run. Column 0 must be the
            label 'Y', column 1 the 'meldungsnummer'; every column from
            position 2 on is used as a feature (the layout prepareForRun
            produces).
        classifier: factory called as
            ``classifier(algorithm=..., randomState=...)`` that returns an
            estimator with ``fit`` and ``predict`` (e.g. createModel).
        test_size: forwarded to ``train_test_split``; share of the unique
            meldungsnummer values held out for testing.
        algorithm: algorithm name forwarded to ``classifier``. Defaults to
            'rf', the value that used to be hard-coded.

    Returns:
        A tuple ``(xTests, yTests)`` of lists with one entry per run.
        ``xTests[i]`` holds the test features plus a 'run' column.
        ``yTests[i]`` holds the columns 'Y', 'run', 'prediction' and 'correct'.
    """
    xTests = []
    yTests = []
    for idx, df in enumerate(dataFrame):
        # Positives were appended in front of the negatives, so shuffle first;
        # the run index doubles as the seed to keep every run reproducible
        run = shuffle(df, random_state=idx)
        unique_mn = run.meldungsnummer.unique()
        # Split on meldungsnummer instead of on rows, so all rows of one
        # meldungsnummer land on the same side and cannot leak into the test set.
        # train_test_split returns the TRAIN part first, then the TEST part.
        xUniqueTrain, xUniqueTest = train_test_split(unique_mn, test_size=test_size, random_state=idx)
        xAndYTest = run[run['meldungsnummer'].isin(xUniqueTest)].copy()
        xAndYTrain = run[run['meldungsnummer'].isin(xUniqueTrain)].copy()
        # Features: everything after the label (column 0) and meldungsnummer (column 1).
        # xtest is copied because a 'run' column is added to it below.
        xtest = xAndYTest.iloc[:, 2:].copy()
        xtrain = xAndYTrain.iloc[:, 2:]
        # Labels
        ytest = xAndYTest.iloc[:, 0]
        ytrain = xAndYTrain.iloc[:, 0]
        # Train on the training split, predict on the held-out split
        clf = classifier(algorithm=algorithm, randomState=idx).fit(xtrain, ytrain)
        prediction = clf.predict(xtest)
        # Collect label, run number, prediction and correctness per test row
        df_ytest = ytest.to_frame()
        df_ytest['run'] = idx
        df_ytest['prediction'] = prediction
        df_ytest['correct'] = df_ytest['prediction'] == df_ytest['Y']
        # 'run' is added only after predicting so it is never used as a feature
        xtest['run'] = idx
        xTests.append(xtest)
        yTests.append(df_ytest)
        print('Finished run {}'.format(idx))
    return xTests, yTests

In [9]:
def getAccuracies(dfys):
    """Compute accuracy, MCC and false-negative rate for every run.

    Parameters:
        dfys: iterable of per-run result frames with the columns 'Y'
            (true label, 0 or 1) and 'prediction'.

    Returns:
        DataFrame with one row per run and the columns 'accuracy' (percent),
        'MCC' and 'fn rate' (percent). 'fn rate' is NaN for a run without
        any positive sample.
    """
    columns = ['accuracy', 'MCC', 'fn rate']
    rows = []
    for dfy in dfys:
        acc = round(accuracy_score(dfy.Y, dfy.prediction), 4)
        mcc = matthews_corrcoef(dfy.Y, dfy.prediction)
        # Fixing the labels keeps the matrix 2x2 ([[tn, fp], [fn, tp]]) even
        # when a run contains only one class; otherwise cm[1] would not exist
        cm = confusion_matrix(dfy.Y, dfy.prediction, labels=[0, 1])
        fn, tp = cm[1][0], cm[1][1]
        positives = fn + tp
        fnr = round(fn / positives, 4) if positives else float('nan')
        # *100 for better % readability
        rows.append([acc * 100, mcc, fnr * 100])
    # Build the frame once instead of growing it row by row
    return pd.DataFrame(rows, columns=columns)

In [10]:
def getProjectTitle(meldungsnummer):
    """Look up the project title(s) that belong to a meldungsnummer.

    Parameters:
        meldungsnummer: value inserted into the query's final condition.

    Returns:
        The DataFrame produced by ``pd.read_sql`` for the query, run on the
        notebook-level ``connection``.
    """
    # NOTE(review): the value is formatted straight into the SQL text. If it can
    # ever come from outside the database, switch to driver-side parameters
    # (the paramstyle of `connection` needs to be confirmed first).
    sql_template = """
    select projekt.projekt_titel from projekt, ausschreibung, cpv_dokument
    where projekt.projekt_id = ausschreibung.projekt_id
      and ausschreibung.meldungsnummer = cpv_dokument.meldungsnummer
      and ausschreibung.meldungsnummer = "{}";
    """
    return pd.read_sql(sql_template.format(meldungsnummer), connection)

In [11]:
def getConfusionMatices(dfys):
    """Collect the confusion-matrix counts of every run.

    Parameters:
        dfys: iterable of per-run result frames with the columns 'Y'
            (true label, 0 or 1) and 'prediction'.

    Returns:
        DataFrame with the columns 'tn', 'tp', 'fp', 'fn': one row per run
        plus a final row labelled 'sum' holding the column totals.
    """
    rows = []
    for dfy in dfys:
        # Confusion matrix layout:
        # [tn, fp]
        # [fn, tp]
        # Fixing the labels keeps the matrix 2x2 even for single-class runs
        cm = confusion_matrix(dfy.Y, dfy.prediction, labels=[0, 1])
        rows.append([cm[0][0], cm[1][1], cm[0][1], cm[1][0]])
    # Build the frame once instead of growing it row by row
    res = pd.DataFrame(rows, columns=['tn', 'tp', 'fp', 'fn'])
    res.loc['sum'] = res.sum()  # totals per column
    return res

In [12]:
# Returns a data frame with all FalsePositives or FalseNegatives of one run
def getFalseFullList(positive, originalRuns, runs, run):
    """Return the original rows that were misclassified in one run.

    Parameters:
        positive: truthy selects false positives (Y == 0, prediction == 1);
            falsy selects false negatives (Y == 1, prediction == 0).
        originalRuns: sequence of per-run frames whose index labels match
            those of the frames in ``runs``.
        runs: sequence of per-run result frames with the columns 'Y' and
            'prediction'.
        run: position of the run to inspect.

    Returns:
        The rows of ``originalRuns[run]`` whose index labels belong to the
        selected misclassified rows of ``runs[run]``.
    """
    results = runs[run]
    if positive:
        wrong = (results['Y'] == 0) & (results['prediction'] == 1)
    else:
        wrong = (results['Y'] == 1) & (results['prediction'] == 0)
    idxs = results[wrong].index.tolist()
    # Label-based lookup; .loc replaces .ix, which no longer exists in pandas >= 1.0
    return originalRuns[run].loc[idxs]

# Prints the project titles of either FalsePositives or FalseNegatives
def getFalseProjectTitle(positive, originalRuns, runs, run):
    """Print the project title lookup for every misclassified row of a run.

    Parameters:
        positive: truthy for false positives, falsy for false negatives.
        originalRuns: sequence of per-run frames with a 'meldungsnummer' column.
        runs: sequence of per-run result frames.
        run: position of the run to inspect.

    Returns:
        None; each lookup result is printed, followed by a separator line.
    """
    misclassified = getFalseFullList(positive, originalRuns, runs, run)
    for number in misclassified.meldungsnummer:
        title = getProjectTitle(number)
        print(title)
        print('========')

## Choose Institution & Get Data
Only needs to be done once per bidder

In [13]:
# Choose a bidder to train a model for (the number of positives is noted after each name).
# The chosen name is passed to fdn.createAnbieterDf in the next cell.

#anbieter = 'Arnold AG' #1006
#anbieter = 'Alpiq AG' #827
#anbieter = 'Siemens AG' #641
#anbieter = 'Marti AG' #621
#anbieter = 'Swisscom' #602
#anbieter = 'Axpo AG' #577
#anbieter = 'Hewlett-Packard' #155
#anbieter = 'BG Ingénieurs Conseils SA' #151
#anbieter = 'Pricewaterhousecoopers' # 92
anbieter = 'Helbling Beratung + Bauplanung AG' #67
#anbieter = 'Ofrex SA' #40
#anbieter = 'PENTAG Informatik AG' #40
#anbieter = 'Wicki Forst AG' #30
#anbieter = 'T-Systems Schweiz' #30
#anbieter = 'Bafilco AG' #20
#anbieter = '4Video-Production GmbH' #20
#anbieter = 'Widmer Ingenieure AG' #10
#anbieter = 'hmb partners AG' #10
#anbieter = 'Planmeca' #5
#anbieter = 'K & M Installationen AG' #5

In [14]:
# Column list for the bidder (anbieter) side; handed to fdn.createAnbieterDf below
# (presumably used as a SQL select list - verify in helpers)
select_anbieter = (
    "anbieter.anbieter_id, "
    "anbieter.institution as anbieter_institution, "
    "cpv_dokument.cpv_nummer as anbieter_cpv, "
    "ausschreibung.meldungsnummer"
)
# anbieter_cpv holds all the CPVs the bidder ever won a procurement for, i.e. all the CPVs they are interested in.
# Column list for the tender (ausschreibung) side; the title column is currently disabled
select_ausschreibung = (
    "anbieter.anbieter_id, "
    "auftraggeber.institution as beschaffungsstelle_institution, "
    "auftraggeber.beschaffungsstelle_plz, "
    "ausschreibung.gatt_wto, "
    "ausschreibung.sprache, "
    "ausschreibung.auftragsart_art, "
    "ausschreibung.lose, "
    "ausschreibung.teilangebote, "
    "ausschreibung.varianten, "
   # "ausschreibung.titel, "
    "ausschreibung.bietergemeinschaft, "
    "cpv_dokument.cpv_nummer as ausschreibung_cpv, "
    "ausschreibung.meldungsnummer as meldungsnummer2"
)
# Get all positive and negative responses for the chosen bidder
responses_positive, full_negative = fdn.createAnbieterDf(select_anbieter, select_ausschreibung, anbieter)
print('Number of Rows: {}'.format(len(responses_positive)))
# Preview the first three positive rows
responses_positive.head(3)

Number of Rows: 273


Unnamed: 0,anbieter_id,anbieter_institution,anbieter_cpv,meldungsnummer,anbieter_id.1,beschaffungsstelle_institution,beschaffungsstelle_plz,gatt_wto,sprache,auftragsart_art,lose,teilangebote,varianten,bietergemeinschaft,ausschreibung_cpv,meldungsnummer2
0,2423,Helbling Beratung + Bauplanung AG,48610000,435695,2423,armasuisse - Bundesamt für Rüstung,CH-3003,YES,DE,CPC7,nein,NO,NO,zugelassen,48610000,435695
1,2423,Helbling Beratung + Bauplanung AG,48610000,435695,2423,armasuisse - Bundesamt für Rüstung,CH-3003,YES,DE,CPC7,nein,NO,NO,zugelassen,48610000,435695
2,2423,Helbling Beratung + Bauplanung AG,48610000,435695,2423,armasuisse - Bundesamt für Rüstung,CH-3003,YES,DE,CPC7,nein,NO,NO,zugelassen,48610000,435695


# Tuning

***
***ToDo:***
* TEST WITH BIDDERS OF DIFFERENT SIZE!
* Create an n × n input for all attributes
* Enable better automatic evaluation feedback, e.g. with some graphs
* Test the model only with tenders from the same realm / CPV category
* ("ctrl + F" all TODOs in this file)
* (Take a look at the warning shown when the tree is run)

***Fragen:***
* Müssen wir die Freihänder beachten?
* Gibt es mehrere Ausschreibungen pro Meldungsnummer?
* Wenn wir nur den CPV verwenden, ist die Anwendung besser als ein normaler Filter?

***

In [15]:
# Ratio of positive to negative responses; passed as pos_neg_ratio to createNegativeResponses
positive_to_negative_ratio = 2/3
# Passed as test_size to train_test_split inside runTreeClassifier (at least 0.25 is recommended)
test_size = 0.25

In [16]:
# Train n different models on n different (reproducible) negative samples;
# this is the number of samples requested from createNegativeResponses
runs = 100

In [17]:
# Attributes ready for use: 'beschaffungsstelle_plz', 'gatt_wto', 'lose', 'teilangebote', 'varianten'

# Next focus: 'beschaffungsstelle_institution', 'titel', 'sprache', 'auftragsart_art' <-- AUFTRAGSART!

# Still unclear: 'Preis', 'anzahl_angebote'

#attributes = ['beschaffungsstelle_plz', 'auftragsart_art']

#attributes = [ 'gatt_wto', 'lose', 'teilangebote', 'varianten', 'beschaffungsstelle_plz']
# Extra attribute columns to train on; with an empty list only the columns
# prepareForRun always keeps ('Y', 'meldungsnummer', 'ausschreibung_cpv') remain
attributes = []

In [18]:
# Tune Random Forest parameters.
# Both names are read as globals by createModel: trees -> n_estimators, depth -> max_depth
trees = 100
depth = 14

# Model Creation

In [19]:
# Create the chosen number of reproducible negative samples with the ratio defined above
responses_negative_all = createNegativeResponses(
    full_negative,
    len(responses_positive),
    runs,
    positive_to_negative_ratio)

# Assign the labels: Y = 1 for the positives, Y = 0 for every negative sample
# (the 'Y' column is written into the existing frames in place)
responses_positive['Y'] = 1
for df in responses_negative_all:
    df['Y'] = 0

KeyboardInterrupt: 

In [None]:
# Summary statistics of the positive responses
responses_positive.describe()

In [None]:
# Per-run frames reduced to the selected attributes and cleaned; used for training
dataFrame = prepareForRun(responses_positive, responses_negative_all, attributes)
# Per-run frames with every column kept; used later to look up misclassified rows
dataFrameRaw = prepareUnfilteredRun(responses_positive, responses_negative_all, attributes)

In [None]:
# Temporary sanity check (marked for deletion): summary of the first prepared run
test_df = next(enumerate(dataFrame))[1]
test_df.describe()

In [None]:
import warnings
# NOTE(review): this silences ALL warnings from here on, not only the pandas ones
warnings.filterwarnings('ignore') # hide some "slice of copy" warnings
# Train and evaluate one model per prepared run
xTests, yTests = runTreeClassifier(dataFrame, createModel, test_size)

In [None]:
# Debug output (marked for deletion): dumps the result frames of all runs
yTests

# Evaluation

In [None]:
# Show the attribute selection, then put confusion-matrix counts and metrics side by side per run
print(attributes)
evaluation_matirx = pd.concat([getConfusionMatices(yTests), getAccuracies(yTests)], axis=1, sort=False)
# Mean of the accuracy column (the extra 'sum' row has no accuracy value)
print(evaluation_matirx["accuracy"].mean())
evaluation_matirx

In [None]:
# How often each accuracy value occurred across the runs
evaluation_matirx['accuracy'].value_counts()

In [None]:
# Show the FalsePositives of one run (the last argument is the run number)
getFalseFullList(True, dataFrameRaw, yTests, 1)

In [None]:
# Show the FalseNegatives of one run (the last argument is the run number)
getFalseFullList(False, dataFrameRaw, yTests, 1)

In [None]:
# Print the project titles of the FalsePositives of one run (the last argument is the run number)
getFalseProjectTitle(True, dataFrameRaw, yTests, 1)

In [None]:
# Print the project titles of the FalseNegatives of one run (the last argument is the run number)
getFalseProjectTitle(False, dataFrameRaw, yTests, 1)

In [None]:
# Helper from the project's helpers module; presumably reports how varied this bidder's CPV codes are - verify in helpers
fdn.getCpvDiversity('Helbling Beratung + Bauplanung AG')

In [None]:
# Helper from the project's helpers module; presumably counts CPV codes for this bidder - verify in helpers
fdn.getCpvCount('Arnold AG')

In [None]:
#anbieter = 'Arnold AG' #1006
#anbieter = 'Alpiq AG' #827
#anbieter = 'Siemens AG' #641
#anbieter = 'Marti AG' #621
#anbieter = 'Swisscom' #602
#anbieter = 'Axpo AG' #577
#anbieter = 'Hewlett-Packard' #155
#anbieter = 'BG Ingénieurs Conseils SA' #151
#anbieter = 'Pricewaterhousecoopers' # 92
#anbieter = 'Helbling Beratung + Bauplanung AG' #67
#anbieter = 'Ofrex SA' #40
#anbieter = 'PENTAG Informatik AG' #40
#anbieter = 'Wicki Forst AG' #30
#anbieter = 'T-Systems Schweiz' #30
#anbieter = 'Bafilco AG' #20
#anbieter = '4Video-Production GmbH' #20
#anbieter = 'Widmer Ingenieure AG' #10
#anbieter = 'hmb partners AG' #10
#anbieter = 'Planmeca' #5
#anbieter = 'K & M Installationen AG' #5