# TODOS
If there are TODO items or general comments, let's write them at the top of the cell in a separate comment starting with TODO.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from db import connection, engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
import pandas as pd
import numpy as np
import helpers as fdn

In [None]:
# What to select for Anbieter (bidder view).
# The adjacent string literals are concatenated by Python into one comma-separated
# column list that is handed to fdn.getFromSimap below
# (presumably used as the SELECT part of a SQL query — confirm in helpers).
selects_an = (
    'anbieter.anbieter_id, '
    'anbieter.anbieter_plz, '
    'anbieter.institution as anbieter_institution, '
    'zuschlag.preis, '
    'zuschlag.gatt_wto, '
    'zuschlag.anzahl_angebote, ' 
    # Candidate columns that are currently not selected:
#    'auftraggeber.institution as beschaffungsstelle_institution, ' # maybe
#    'auftraggeber.beschaffungsstelle_plz, ' # maybe
#    'projekt.projekt_titel, '
    'cpv_dokument.cpv_nummer,'
    'cpv.cpv_deutsch'
)

# What to select for Ausschreibung (tender view); same construction as above
selects_aus = (
    'anbieter.anbieter_id, '
    'ausschreibung.gatt_wto, '
    'auftraggeber.institution as beschaffungsstelle_institution, '
    'auftraggeber.beschaffungsstelle_plz, '
    'cpv_dokument.cpv_nummer, '
    'cpv.cpv_deutsch')

# Load both column sets through the project helper, plus the CPV register
# that viewFalseResponses uses to look up CPV descriptions.
data_an = fdn.getFromSimap(selects_an)
data_aus = fdn.getFromSimap(selects_aus)
cpvRegister = fdn.getCpvRegister()

In [None]:
# Preview the first rows of the Anbieter query result
data_an.head()

In [None]:
# Preview the first rows of the Ausschreibung query result
data_aus.head()

In [None]:
# Frequency table of the anzahl_angebote (number of offers) column
data_an['anzahl_angebote'].value_counts()

In [None]:
# Single-column boolean frame: True where the bidder institution is exactly "Adecco AG"
adecco = data_an["anbieter_institution"].eq("Adecco AG").to_frame()

In [None]:
# Count the True / False flags, i.e. rows that are "Adecco AG" versus all other rows
adecco["anbieter_institution"].value_counts()

In [None]:
# Count how often an institution occurs as Anbieter / how many they have won,
# and show only the institutions that occur more than 100 times.
# The filter is applied to the counts Series itself: pandas >= 2.0 names the
# value_counts() result "count" instead of reusing the source column name, so
# looking the column up as "anbieter_institution" is not portable across versions.
counts = data_an["anbieter_institution"].value_counts()
tmp = counts.to_frame()
tmp[counts > 100]

In [None]:
# Ask the project helper for the CPV count of 'Swisscom'
# (presumably the number of CPV codes linked to that Anbieter — confirm in helpers.getCpvCount)
fdn.getCpvCount('Swisscom')

In [None]:
# Ask the project helper for the CPV diversity of 'Adecco AG'
# (presumably the spread of distinct CPV codes — confirm in helpers.getCpvDiversity)
fdn.getCpvDiversity('Adecco AG')


In [None]:
# Column list for the Anbieter side of the response query.
# NOTE(review): the alias below is spelled "anbieter_insitution" (sic); it is a runtime
# string, so rename it only together with any code that reads that column.
select_an = (
    "anbieter.anbieter_id, "
    "anbieter.anbieter_plz, "
    "anbieter.institution as anbieter_insitution, "
    "cpv_dokument.cpv_nummer as anbieter_cpv, "
    "ausschreibung.meldungsnummer"
)
# anbieter_cpv are all the CPVs the Anbieter ever won a procurement for. So all the CPVs they are interested in.
# Column list for the Ausschreibung side of the response query.
select_aus = (
    "anbieter.anbieter_id, "
    "auftraggeber.institution as beschaffungsstelle_institution, "
    "auftraggeber.beschaffungsstelle_plz, "
    "ausschreibung.gatt_wto, "
    "cpv_dokument.cpv_nummer as ausschreibung_cpv, "
    "ausschreibung.meldungsnummer"
)
# ausschreibung_cpv is the CPV number attached to the Ausschreibung itself.
# Positive responses for "Adecco AG": the last argument is True here
# (presumably "rows belonging to this bidder" — confirm in helpers.getResponses).
data_pos = fdn.getResponses(select_an, select_aus, "Adecco AG", True)
# Work on a copy so data_pos itself is not modified by the cleaning cells
df_pos = data_pos.copy()

In [None]:
# Display DF properties: print the summary statistics that describe() produces for the positive set
print(df_pos.describe())

In [None]:
# TODO: Check effect without changing the PLZ

# Change Postleitzahl (postal code) into a similar format. Unidentifiable PLZs will result in 0
# (behaviour of fdn.tonumeric as stated by the original note — confirm in helpers).
# Series.map applies the helper element-wise; it replaces DataFrame.applymap,
# which is deprecated since pandas 2.1.
df_pos['anbieter_plz'] = df_pos['anbieter_plz'].map(fdn.tonumeric)
df_pos['beschaffungsstelle_plz'] = df_pos['beschaffungsstelle_plz'].map(fdn.tonumeric)

In [None]:
# Summary statistics of the positive set, shown as the cell output
df_pos.describe()

In [None]:
# TODO add a nicer plot version to paper

# Pie chart of the CPV diversity: one slice per anbieter_cpv, labelled with its absolute row count
grouped_df = df_pos.groupby(['anbieter_cpv']).size()
total = grouped_df.sum()


def _slice_count(p):
    # autopct hands over the slice percentage; turn it back into the number of rows
    return '{:.0f}'.format(p * total / 100)


grouped_df.plot.pie(figsize=(6, 6), autopct=_slice_count)

In [None]:
# TODO create a faster data_neg if selecting randomly from beginning instead of first selecting all, then subselecting randomly. Depends on how we use the final app --> Create one DF per day?.

# Create a new DF containing all the negative responses. Here, the ones not containing "Adecco AG"
# (same helper call as for the positive set, but with the last argument set to False)
data_neg = fdn.getResponses(select_an, select_aus, "Adecco AG", False)
# Work on a copy so data_neg itself is not modified by later cells
df_neg_full = data_neg.copy()

In [None]:
# TODO Take different sample sizes & T/F-Ratios into account while testing models. Refactor size and ratio choice into function?

# Create a random DF subset used to train the model on.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so the size is capped at the rows available.
n_neg = min(8000, len(df_neg_full))
df_neg = df_neg_full.sample(n_neg)

In [None]:
# Show the CPV diversity: how often each ausschreibung_cpv occurs in the positive set
df_pos['ausschreibung_cpv'].value_counts()

In [None]:
# Positive sample size = number of rows in df_pos
df_pos.shape[0]

In [None]:
# Assign pos/neg labels to both DFs: Y = 1 for the positive set, Y = 0 for the negative sample
for labelled_df, label in ((df_pos, 1), (df_neg, 0)):
    labelled_df['Y'] = label

In [None]:
# Merge the DFs into one, with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True is the supported equivalent.
df_appended = pd.concat([df_pos, df_neg], ignore_index=True)

In [None]:
# Clean PLZ property on the combined frame (the negative rows have not been converted yet).
# Series.map applies the helper element-wise; it replaces DataFrame.applymap,
# which is deprecated since pandas 2.1.
df_appended['anbieter_plz'] = df_appended['anbieter_plz'].map(fdn.tonumeric)
df_appended['beschaffungsstelle_plz'] = df_appended['beschaffungsstelle_plz'].map(fdn.tonumeric)

In [None]:
# Shuffle the df: sampling 100% of the rows without replacement returns them in random order
df_tree = df_appended.sample(frac=1.0)
# Row count after shuffling (unchanged by the shuffle)
df_tree.shape[0]

In [None]:
# Put the responses in one frame and all desired properties in another.
# The label is selected by name rather than by position 11, so it stays correct if the
# queried column set changes; it is kept as a single-column DataFrame because later
# cells add further columns to the test part of it.
y = df_tree[['Y']]
# The features are still selected by position.
# NOTE(review): presumably anbieter_plz, anbieter_cpv, beschaffungsstelle_plz and
# ausschreibung_cpv — confirm against the column order returned by fdn.getResponses.
x = df_tree.iloc[:, [1, 3, 7, 9]]

In [None]:
# Sanity check: show the first rows of the feature frame and of the label frame
print(x.head())
print(y.head())

In [None]:
# Hold out 25% of the rows for testing; no random_state is passed, so the split differs between runs
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25)

In [None]:
# Fit a decision tree with default hyper-parameters on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
clf = tree.DecisionTreeClassifier().fit(xtrain, ytrain)

In [None]:
# Predict the class label for every row of the test features
res = clf.predict(xtest)

In [None]:
# Attach the predictions ("res") and a correctness flag ("richtig" = prediction equals label)
# to the test labels. assign() builds a new frame instead of writing into the slice
# handed back by train_test_split, which avoids pandas' SettingWithCopyWarning.
ytest = ytest.assign(res=res, richtig=lambda d: d['res'] == d['Y'])

In [None]:
# Number of correct (True) versus wrong (False) predictions
ytest['richtig'].value_counts()

In [None]:
# Show only the misclassified test rows
ytest[~ytest["richtig"]]

In [None]:
# Split the test rows into the four confusion-matrix cells
is_pos = ytest['Y'] == 1
is_neg = ytest['Y'] == 0
pred_pos = ytest['res'] == 1
pred_neg = ytest['res'] == 0

tp = ytest[is_pos & pred_pos]  # true positives
tn = ytest[is_neg & pred_neg]  # true negatives
fp = ytest[is_neg & pred_pos]  # false positives
fn = ytest[is_pos & pred_neg]  # false negatives

# Metrics: Confusion Matrix
# sklearn layout: rows = actual class (F, T), columns = predicted class (F, T)
confusion_matrix(ytest['Y'], res)

In [None]:
# Classification accuracy computed by hand: correct predictions / all predictions
n_correct = len(tp) + len(tn)
n_all = n_correct + len(fp) + len(fn)
acc = n_correct / n_all
print('Accuracy: {:0.5f}'.format(acc))

In [None]:
# Same accuracy via sklearn's accuracy_score, as a cross-check of the manual calculation
sk_acc = accuracy_score(ytest['Y'], res)
print('Accuracy: {:0.5f}'.format(sk_acc))

In [None]:
# ROC operating points for the test labels versus the predictions in res.
# NOTE(review): res is passed as the score argument; if it holds hard class labels rather
# than probabilities the curve has very few thresholds — consider predict_proba.
fpr, tpr, thresholds = roc_curve(ytest.Y, res)
print(fpr, tpr, thresholds)

In [None]:
# Display the current value of fpr as the cell output
fpr

In [None]:
# Compute Area Under the Curve (AUC).
# Expects fpr and tpr to be the arrays returned by roc_curve.
auc(fpr, tpr)

In [None]:
# True positive rate (sensitivity, recall, correctly identified), printed as a percentage.
# Stored under its own name so the tpr array returned by roc_curve is not overwritten
# (re-running the AUC cell would otherwise receive a scalar). Yields NaN instead of
# raising ZeroDivisionError when the test split contains no positive rows.
n_actual_pos = len(tp) + len(fn)
true_positive_rate = len(tp) / n_actual_pos if n_actual_pos else float('nan')
print('True Positive Rate: {:0.4f}'.format(true_positive_rate*100))

In [None]:
# False positive rate ("false alarm rate"), printed as a percentage.
# Stored under its own name so the fpr array returned by roc_curve is not overwritten
# (re-running the AUC cell would otherwise receive a scalar). Yields NaN instead of
# raising ZeroDivisionError when the test split contains no negative rows.
n_actual_neg = len(fp) + len(tn)
false_positive_rate = len(fp) / n_actual_neg if n_actual_neg else float('nan')
print('False Positive Rate: {:0.4f}'.format(false_positive_rate*100))

In [None]:
# TODO How do we want to proceed with the different anbieter_cpv and ausschreibung_cpv?
# Number of positive rows whose two CPV columns disagree
cpv_mismatch = df_pos['anbieter_cpv'] != df_pos['ausschreibung_cpv']
int(cpv_mismatch.sum())

In [None]:
# cpvRegister.loc[cpvRegister['cpv_nummer'] == 75120000, ['cpv_nummer', 'cpv_deutsch']]

def viewFalseResponses(resp, key):
    """Look up the CPV number and German description for each row of *resp*.

    For every index label in ``resp`` the CPV number stored in column *key* of
    the module-level ``df_tree`` is read and searched in the module-level
    ``cpvRegister``.

    Args:
        resp: DataFrame whose index labels identify rows of ``df_tree``
            (e.g. the false-positive or false-negative subset).
        key: Name of the ``df_tree`` column that holds the CPV number.

    Returns:
        DataFrame with the columns ``cpv_nummer`` and ``cpv_deutsch``, one row
        per row of ``resp`` in the same order. A CPV number that is missing
        from the register is reported with a null description; if the
        register lists a number more than once, the first entry is used.
    """
    records = []
    for idx in resp.index:
        cpv = df_tree.at[idx, key]
        # One register scan per row, reused for both output columns
        hits = cpvRegister.loc[cpvRegister['cpv_nummer'] == cpv, ['cpv_nummer', 'cpv_deutsch']]
        if hits.empty:
            # Unknown CPV: keep the row visible instead of aborting the whole listing
            records.append({'cpv_nummer': cpv, 'cpv_deutsch': None})
        else:
            records.append({
                'cpv_nummer': hits['cpv_nummer'].iloc[0],
                'cpv_deutsch': hits['cpv_deutsch'].iloc[0],
            })
    # Explicit columns keep the schema stable even when resp is empty
    return pd.DataFrame(records, columns=['cpv_nummer', 'cpv_deutsch'])

In [None]:
# List CPV number and German description (via the anbieter_cpv column) for the false positives
viewFalseResponses(fp, 'anbieter_cpv')