In [1]:
# Filter all debugging messages from tensorflow 
#from silence_tensorflow import silence_tensorflow
#silence_tensorflow()

# Logistic Regression spits out warnings on large datasets. For now, there warning will be surpressed.
# This warning will get taken care of in a later version
#import warnings
#warnings.filterwarnings("ignore")

# Basis libraries
import numpy as np 
import pandas as pd 
import pickle
import os
from datetime import datetime

# Sklearn libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, mean_squared_error

# Embedders and Transformers
from sentence_transformers import SentenceTransformer

# XGBoost
import xgboost as xgb

# Tensorflow 
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

2022-05-12 14:35:39.022098: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-12 14:35:39.022136: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [77]:
def input_getter(path_statement):
    while True:
        user_input = input()
        print(path_statement, PATH)
        path_approval = input('(y/ n) ')
        if path_approval.lower() == 'y':
            break
        elif path_approval.lower() == 'n':
            print('>> Enter a new path: ')
        else:
            print(">> Sorry, that didn't work. Please enter again: ")
    return user_input
    

In [2]:
# get datetime dd/mm/YY H:M
now = datetime.now()
dt_string = now.strftime('%d-%m-%Y %H-%M')

# Read in the data with pandas, then convert text corpus to list
print('>> Please select the path, where your data is stored!')
print('>> On Windows the path might look like this: C:\\Users\\yourname\\data\\training_data.csv')
print('>> On MacOS/ Linux the path might look like this: home/user/data/training_data.csv')
print(' ')

# Get the path where the data is stored
PATH = input_getter('>> Is this the correct path? ->')

# Get the name of the features
print(' ')
print('>> Please provide the column name in which the texts are store in!')
COL_TEXTS = input_getter('>> Is this the correct column name? ->')

# Load the data with the provided info
df = pd.read_csv(PATH)
corpus = df[COL_TEXTS].to_list()
print('>> Data successfully loaded!')

# Lowering all words
for i in range(len(corpus)):
    corpus[i] = corpus[i].lower()

# Get the names of the labels
COL_LABEL = input_getter('>> Is this the correct column name? ->')




>> Please select the path, where your data is stored!
>> On Windows the path might look like this: C:\Users\yourname\data\training_data.csv
>> On MacOS/ Linux the path might look like this: home/user/data/training_data.csv
 
>> Is this the correct path? -> clickbait_data.csv
 
>> Please provide the column name in which the texts are store in!
>> Is this the correct column name? -> headline
>> Data successfully loaded!
 
>> Please provide the column name in which the labels are store in!
>> Is this the correct column name? -> clickbait


In [3]:
while True:
    print(' ')
    print('>> Please input a number to choose your method of embedding.')
    print('>> 1 - Transformer based embeddings (accurate, but slower)')
    print('>> 2 - TF-IDF Vectorizer (faster, but less accurate)')
    print(' ')

    choice = input()

    if choice == '1':
        # Instantiate a sentence transformer and create embeddings 
        print('>> Creating embeddings using transformer model, this might take a couple of minutes ...')
        sent_transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        embeddings = sent_transformer.encode(corpus)
        break

    else:
        # Intantiate a tf-idf vectorizer
        print('>> Creating TF-IDF embeddings ...')
        vect = TfidfVectorizer()
        embeddings = vect.fit_transform(corpus)
        break

 
>> Please input a number to choose your method of embedding.
>> 1 - Transformer based embeddings (accurate, but slower)
>> 2 - TF-IDF Vectorizer (faster, but less accurate)
 
>> Creating TF-IDF embeddings ...


In [4]:
# Set embedding as features
features = embeddings

labels = df[COL_LABEL]
labels = np.array(labels)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

In [66]:
# Selecting a model
while True:
    print(' ')
    print('>> Please input a number to choose your algorithm:')
    print('>> 1 - Logistic Regression')
    print('>> 2 - Random Forest Classifier')
    print('>> 3 - XGBoost Classifier')
    print('>> 4 - Noise Robust Deep Neural Network (work in progress)')

    model_choice = input()

    if model_choice == '1':
        print('>> Training a logistic regression ...')
        lr = LogisticRegression(dual=False)

        # Hyper parameter space is relatively small
        hyperparameters = {'C': np.arange(0, 4), 
                        'penalty': ['l2', 'None'],
                        'max_iter': [100, 150, 250, 500]}

        # Initiate and fit random search cv
        lr_clf = RandomizedSearchCV(estimator=lr, param_distributions=hyperparameters, cv=5,  n_iter=5)
        lr_clf.fit(X_train, y_train)
        y_pred = lr_clf.predict(X_test)

        # Save the model to current directory
        with open(f'Logistic Regression {dt_string}.pkl', 'wb') as fid:
            pickle.dump(lr_clf, fid) 
            print(f'>> Saved model to {os.path.abspath(os.getcwd())}') 
        break

    elif model_choice == '2':
        print('Training a random forest ...')
        rf = RandomForestClassifier()

        # Hyper parameter space is relatively small
        hyperparameters = {'max_depth': [90, 100, 110], 
                        'min_samples_leaf': [3, 4, 5]}

        # Initiate and fit random search cv
        rf_clf = RandomizedSearchCV(estimator=rf, param_distributions=hyperparameters, cv=3, n_iter=3)
        rf_clf.fit(X_train, y_train)
        y_pred = rf_clf.predict(X_test)

        # Save the model to current directory
        with open(f'Random Forest {dt_string}.pkl', 'wb') as fid:
            pickle.dump(rf_clf, fid) 
            print(f'>> Saved model to {os.path.abspath(os.getcwd())}') 
        break

    elif model_choice == '3':
        print('Training a XGBoost classifier ...')
        xg_cl = xgb.XGBClassifier(eval_metric='auc')

        # Hyperparameter space is relatively narrow
        hyperparameters = {'eta': np.arange(0.1, 0.5, 0.1), 
                        'max_depth': [4, 6, 8, 10], 
                        'n_estimators': np.arange(10, 400, 20), }

        # Initiate and fit random search cv
        xgb_clf = RandomizedSearchCV(estimator=xg_cl, param_distributions=hyperparameters, cv=3, n_iter=3)
        xgb_clf.fit(X_train, y_train)
        y_pred = xgb_clf.predict(X_test)

        # Save the model to current directory
        with open(f'XGBoost {dt_string}.pkl', 'wb') as fid:
            pickle.dump(xgb_clf, fid) 
            print(f'>> Saved model to {os.path.abspath(os.getcwd())}') 
        break

    elif model_choice == '4':
        print('Trainign neural network ...')
        X_train.astype('float16')
        X_test.astype('float16')
        y_train.astype('float16')

        # Vectorize labels
        y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
        y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)

        neural_net = Sequential()
        
        neural_net.add(Dense(64, activation='relu', input_shape=(X_train.shape[1], )))
        neural_net.add(Dropout(0.2))
        neural_net.add(Dense(64, activation='relu'))
        neural_net.add(Dropout(0.2))
        neural_net.add(Dense(32, activation='relu'))
        neural_net.add(Dense(1, activation='sigmoid'))

        neural_net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        neural_net.fit(X_train, y_train, epochs=5, verbose=True, callbacks=[callback])

        y_pred = (neural_net.predict(X_test) > 0.5).astype("int32")
        break

    else:
        pass

 
>> Please input a number to choose your algorithm:
>> 1 - Logistic Regression
>> 2 - Random Forest Classifier
>> 3 - XGBoost Classifier
>> 4 - Noise Robust Deep Neural Network (work in progress)
Trainign neural network ...
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


In [69]:
print(' ')
print('>> Generating evaluation metrics ...')
print(' ')
print('- - - - - - - - - - - - - - - -')
print(f'>> Model accuracy is: {round(accuracy_score(y_test, y_pred), 2) * 100} %')
print('- - - - - - - - - - - - - - - -')
print(f'>> Mean squared error is: {round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)}')
print('- - - - - - - - - - - - - - - -')
print(f'>> AUC is: {round(roc_auc_score(y_test, y_pred), 2)}')
print('- - - - - - - - - - - - - - - -')
print(f'>> The confusion matrix is: {confusion_matrix(y_test, y_pred)}')

 
>> Generating evaluation metrics ...
 
- - - - - - - - - - - - - - - -
>> Model accuracy is: 98.0 %
- - - - - - - - - - - - - - - -
>> Mean squared error is: 0.16
- - - - - - - - - - - - - - - -
>> AUC is: 0.98
- - - - - - - - - - - - - - - -
>> The confusion matrix is: [[3109   79]
 [  81 3131]]
