In [1]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
from sklearn import metrics

from data import constants

# ------------------------------ CONFIG ------------------------------
# Fixed seed used for shuffling and model fitting so reruns are comparable
# (pytorch may still differ between releases/platforms).
SEED_VALUE: int = 2
# When True, rows whose `cro` value is "OP" have that label blanked out.
FILTER_OP: bool = False
# When True, rows with neg_type == 'weak' are removed from both data sets.
FILTER_WEAK: bool = False
# --------------------------------------------------------------------


In [2]:
# Read training and test data.
# Both label files live in one directory. It defaults to the original location
# and can be redirected with the LABELS_DIR environment variable so the notebook
# is not tied to a single machine.
LABELS_DIR = os.environ.get(
    "LABELS_DIR",
    "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports",
)
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_train = pd.read_pickle(os.path.join(LABELS_DIR, "Firm_AnnualReport_Labels_Training.pkl"))
df_test = pd.read_pickle(os.path.join(LABELS_DIR, "Firm_AnnualReport_Labels_Test.pkl"))

# Uncomment to iterate on a small random subset
# df_train = df_train.sample(500)
# df_test = df_test.sample(500)

# Set id: one "<report_id>_<page>_<paragraph_no>" string per row.
# Built column-wise (zip over the id columns) instead of a row-wise
# DataFrame.apply, which constructs a Series per row and is much slower.
id_columns = ['report_id', 'page', 'paragraph_no']
for _frame in (df_train, df_test):
  _frame["id"] = [
      "_".join(map(str, key))
      for key in zip(*(_frame[c] for c in id_columns))
  ]

# Optionally blank out "OP" labels.
# The assignment goes through a single .loc on the DataFrame: the chained form
# `df.cro.loc[mask] = ...` writes to an intermediate Series, which triggers
# SettingWithCopy-style warnings and does not update the frame at all when
# pandas copy-on-write is active.
if FILTER_OP:
  df_train.loc[df_train.cro == "OP", "cro"] = np.nan
  df_test.loc[df_test.cro == "OP", "cro"] = np.nan

# Optionally drop every row whose neg_type is 'weak'.
if FILTER_WEAK:
  df_train = df_train.query("neg_type != 'weak'")
  df_test = df_test.query("neg_type != 'weak'")

# Collapse the rows to one entry per id.
#   * text  : taken from the first row of each id group
#   * label : 1 if the group has at least one non-null `cro` value, else 0
train_by_id = df_train.groupby(["id"])
train_docs = train_by_id.first().text
train_doc_labels = (train_by_id.cro.count() > 0) * 1
assert len(train_doc_labels) == len(train_docs)

test_by_id = df_test.groupby(["id"])
test_docs = test_by_id.first().text
test_doc_labels = (test_by_id.cro.count() > 0) * 1
assert len(test_doc_labels) == len(test_docs)

# Class weights: class 0 stays at 1, class 1 is weighted by
# (number of training documents) / (number of positive training documents).
n_train_docs = len(train_doc_labels)
n_train_positive = train_doc_labels.sum()
weights = {0: 1.0, 1: n_train_docs / n_train_positive}
print(f"Using weights of {weights}")

Using weights of {0: 1.0, 1: 25.32422586520947}


In [None]:
# TODO: Preprocessing - implement lemmatization (a LemmaTokenizer for the bag-of-words vectorizer is not defined yet)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.utils import shuffle

# Shuffle documents and labels together, seeded for repeatability
docs_train, labels_train = shuffle(
    train_docs,
    train_doc_labels,
    random_state=SEED_VALUE,
)

# Weight for class 1 = (number of training documents) / (number of positives)
weights = {
    0: 1.0,
    1: len(train_doc_labels) / train_doc_labels.sum(),
}

# Bag-of-words counts -> tf-idf weighting -> class-weighted SVM.
# Every step has to be a plain (name, estimator) pair. The previous version
# passed `tokenizer=LemmaTokenizer()` as a keyword inside the step tuple, which
# is a SyntaxError, and no LemmaTokenizer is defined in this notebook
# (lemmatization is still a TODO). Until one exists the vectorizer keeps its
# default tokenizer; afterwards pass it as
# CountVectorizer(strip_accents='ascii', tokenizer=LemmaTokenizer()).
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(strip_accents='ascii')),
    ('tfidf', TfidfTransformer()),
    # probability=True so the fitted model can provide predict_proba
    ('classifier', SVC(probability=True, random_state=SEED_VALUE, class_weight=weights)),
])

# Search space for the grid search.
# Each key is "<pipeline step name>__<parameter name>" and maps to the list of
# candidate values tried for that parameter.
param_svm = [
    dict(
        bow__ngram_range=[(1, 2)],
        bow__max_features=[None, 100, 200],
        bow__stop_words=['english', None],
        tfidf__use_idf=[True],
        classifier__C=[1, 10, 100],
        classifier__kernel=['linear', 'rbf'],
    ),
]

# Exhaustive search over param_svm, ranked by ROC AUC. With refit=True the best
# configuration is fitted again so grid_clf can be used directly for prediction.
# No cv argument is passed, so scikit-learn's default splitting is used.
grid_clf = GridSearchCV(
    estimator=pipeline_svm,
    param_grid=param_svm,
    scoring='roc_auc',
    refit=True,
    n_jobs=-1,
)

# Run the search on the shuffled training documents
grid_clf.fit(docs_train, labels_train)

In [None]:
# Keep the full cross-validation table for later inspection
cv_results = pd.DataFrame(grid_clf.cv_results_)

# Report the winning configuration and its mean cross-validated score
print(
    f"Best score: {grid_clf.best_score_}",
    f"Best params: \n{grid_clf.best_params_}",
    sep="\n",
)

In [None]:
# Apply the refit best pipeline to the test documents:
# class probabilities and hard class predictions.
preds_prob = grid_clf.predict_proba(test_docs)
preds = grid_clf.predict(test_docs)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, matthews_corrcoef

# From here on `preds` holds the second probability column (class 1)
labels = test_doc_labels
preds = preds_prob[:, 1]

# Threshold-independent score
test_roc_auc = roc_auc_score(labels, preds)
print("Test ROC AuC: ", test_roc_auc)

# Turn probabilities into boolean decisions at a fixed cut-off
threshold = 0.8
preds_bool = preds > threshold

# Per-class precision / recall / F1 at that cut-off
label_list = ["irrelevant", "relevant"]
report = classification_report(labels, preds_bool, target_names=label_list)
print(report)

# Single-number summaries at the same cut-off
acc = accuracy_score(labels, preds_bool)
matthews_corr = matthews_corrcoef(labels, preds_bool)
print(f"Accuracy: {acc}")
print(f"Matthews: {matthews_corr}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

def plot_cm(labels, predictions, p=0.8):
  """Draw a confusion-matrix heatmap for `predictions` thresholded at `p`.

  labels: true class labels.
  predictions: scores; entries strictly greater than `p` count as positive.
  p: decision threshold, also shown (as a percentage) in the plot title.
  """
  matrix = confusion_matrix(labels, predictions > p)
  _, ax = plt.subplots(figsize=(5, 5))
  sns.heatmap(matrix, annot=True, fmt="d", ax=ax)
  ax.set_title('Confusion matrix @{:.0%}'.format(p))
  ax.set_ylabel('Actual label')
  ax.set_xlabel('Predicted label')

plot_cm(labels, preds)

In [None]:
# fpr, tpr, thresholds = roc_curve(testy, probs)
# pyplot.plot([0, 1], [0, 1], linestyle='--')
# pyplot.plot(fpr, tpr, marker='.')
# pyplot.show()
# auc_score = roc_auc_score(testy, probs)
# print('AUC: %.3f' % auc_score)


from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

def plot_roc(name, labels, predictions, **kwargs):
  """Plot a ROC curve, scaled to percent, on the current axes.

  name: legend label for the curve.
  labels: true class labels.
  predictions: scores passed to `roc_curve`.
  **kwargs: forwarded to `plt.plot` (e.g. color).

  Returns whatever `plt.plot` returns for the drawn curve.
  """
  fpr, tpr, _thresholds = roc_curve(labels, predictions)

  lines = plt.plot(fpr * 100, tpr * 100, label=name, linewidth=2, **kwargs)
  plt.xlabel('False positives [%]')
  plt.ylabel('True positives [%]')
  plt.gca().set_aspect('equal')  # square plot so the diagonal sits at 45 degrees
  plt.legend(loc='lower right')
  return lines

# A training curve could be overlaid the same way, e.g.:
# plot_roc("Train Baseline", train_labels, train_predictions_baseline, color=colors[0])
fig = plot_roc("Test", labels, preds)

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# `plot_precision_recall_curve` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so importing it unconditionally breaks this cell on current releases.
# Its replacement, `PrecisionRecallDisplay.from_estimator`, takes the same
# (estimator, X, y) arguments. Prefer the new API and fall back to the old
# helper on releases that predate it.
try:
  from sklearn.metrics import PrecisionRecallDisplay
  plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator
except (ImportError, AttributeError):
  from sklearn.metrics import plot_precision_recall_curve

# Average precision computed from the SVM decision values
y_score = grid_clf.decision_function(test_docs)
average_precision = average_precision_score(labels, y_score)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

# Precision-recall curve of the fitted search on the test documents
disp = plot_precision_recall_curve(grid_clf, test_docs, test_doc_labels)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))

In [None]:
from sklearn.metrics import auc
# Area under the precision/recall curve (recall on the x axis, precision on y)
test_precision, test_recall, test_thresholds = precision_recall_curve(labels, preds)
test_auc_score = auc(x=test_recall, y=test_precision)
print("Test Precision/Recall AuC: ", test_auc_score)