# Companies Dataset with Google & FastText Vectors
This notebook contains the code applied to the adverse-media articles about companies.

In [None]:
# run once cell - then restart kernel
!pip install --upgrade tensorflow 
!pip install gensim 
!pip install -q -U keras-tuner
!pip install scikeras[tensorflow]

## Import Relevant Libraries

In [None]:
# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy, binary_crossentropy
from tensorflow.keras.activations import relu, softmax
import keras_tuner as kt
from scikeras.wrappers import KerasClassifier # To use keras with sklearn 
tf.__version__

In [None]:
# Data handling and plotting libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np  

# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Models - sklearn
import pickle # to save the models 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import (recall_score, precision_score, precision_recall_curve,
                             fbeta_score, make_scorer)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_class_weight 

# gensim 
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.fasttext import FastText

# Load nltk library for tokenization
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

# Garbage collector
import gc
import time

In [None]:
# Load the company adverse-media splits (train, test, validation) from CSV.
org_train, org_test, org_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/Org_train.csv",
        "datasets/Org_test.csv",
        "datasets/Org_valid.csv",
    )
)

## Data Cleaning
Remove the leftover `Unnamed: ...` columns from each dataset.

In [None]:
# Drop the unnamed columns that appear to be irrelevant in the datasets
# (presumably index columns written by earlier CSV exports - verify).
# Reassigning the result with errors='ignore', instead of dropping in place,
# keeps this cell re-runnable: a second run no longer raises KeyError because
# the columns are already gone.
org_train = org_train.drop(columns=['Unnamed: 0'], errors='ignore')
org_test = org_test.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], errors='ignore')
org_valid = org_valid.drop(columns=['Unnamed: 0'], errors='ignore')

## Sentence Tokenization
Tokenize the text in the train, test, and validation sets: convert it to lowercase and remove stopwords and non-alphabetic characters.

In [None]:
%%time 
from func_ import preprocess
train_corpus = [preprocess(potato) for potato in org_train['raw_sentence']]
test_corpus = [preprocess(text) for text in org_test['raw_sentence']]
valid_corpus = [preprocess(text) for text in org_valid['raw_sentence']]

## Store tokenized sentences in new column called 'token_sentence'.

In [None]:
# Pair every split with its tokenized corpus and store the tokens as the
# first column ('token_sentence') of that split's DataFrame.
datasets = [org_train, org_test, org_valid]
corpus = [train_corpus, test_corpus, valid_corpus]

# The loop variable is deliberately not named `corpus`: reusing that name
# would leave `corpus` bound to the validation tokens once the loop ends.
for df, tokenized in zip(datasets, corpus):
    if 'token_sentence' in df.columns:
        # A previous run already added the column; remove it so insert() succeeds.
        del df['token_sentence']
    # insert() places the new column directly at position 0, so no placeholder
    # column has to be created, popped and re-inserted first.
    df.insert(0, 'token_sentence', tokenized)

In [None]:
# for deep learning
# Wrap each split's 'class' column in its own one-column DataFrame.
y_train, y_valid, y_test = (
    pd.DataFrame(split['class']) for split in (org_train, org_valid, org_test)
)

In [None]:
# for deep learning
# Encode the labels of every y_* frame row by row:
# 'negative' becomes 0, any other value becomes 1.
# NOTE(review): this loop is slow on large frames, but vectorizing it is not
# obviously equivalent - depending on the pandas version, the frames created
# with pd.DataFrame(org_*['class']) may share memory with org_*, in which case
# these writes would also change org_*['class']. Confirm before rewriting.
for labels in [y_train, y_valid, y_test]:
    # The frames have a single column, so itertuples() yields (index, value) pairs.
    for index, row in labels.itertuples():
        if row == 'negative':
            labels.loc[index] = 0
        else:
            labels.loc[index] = 1

In [None]:
# Create variables for types of text in the df for ease of use - for deep learning
# Token lists and raw sentences come from the org_* frames, grouped here by
# kind of data rather than by split.
X_train, X_test, X_valid = (
    split['token_sentence'] for split in (org_train, org_test, org_valid)
)
X_train_raw, X_test_raw, X_valid_raw = (
    split['raw_sentence'] for split in (org_train, org_test, org_valid)
)
# The labels are taken from the encoded y_* frames. Rebinding y_* from a
# DataFrame to its 'class' column means this cell is meant to run only once.
y_train, y_test, y_valid = y_train['class'], y_test['class'], y_valid['class']

In [None]:
# Create variables for types of text in the df for ease of use
# NOTE(review): this cell repeats the X_* assignments of the previous cell and
# rebinds y_* to org_*['class'], discarding the encoded y_* Series created
# there. If org_*['class'] still holds string labels at this point, the
# `.astype('float32')` conversion in the Deep Learning section cannot succeed.
# Presumably only one of the two cells is meant to be run - confirm.
X_train = org_train['token_sentence']
X_train_raw = org_train['raw_sentence']
y_train = org_train['class']

X_test = org_test['token_sentence']
X_test_raw = org_test['raw_sentence']
y_test = org_test['class']

X_valid = org_valid['token_sentence']
X_valid_raw = org_valid['raw_sentence']
y_valid = org_valid['class']

## Google News pre-trained Vectors

In [None]:
# Load the vectors stored in word2vec binary format from the GoogleNews file
# into `Model` (a gensim KeyedVectors object).
Model = gensim.models.KeyedVectors.load_word2vec_format(
    'datasets/GoogleNews-vectors-negative300.bin.gz', binary=True,)

## FastText pre-trained vectors

In [None]:
# NOTE(review): this cell sits under the "FastText pre-trained vectors"
# heading but loads the same GoogleNews file as the cell above, so `Model` is
# simply reloaded with identical vectors. Looks like a copy-paste leftover;
# TODO point this at the FastText vector file (path not known from here).
Model = gensim.models.KeyedVectors.load_word2vec_format(
    'datasets/GoogleNews-vectors-negative300.bin.gz', binary=True,)

## Further Data Cleaning
Remove rows that contain fewer than 2 words, i.e. rows with only 0 or 1 words.

In [None]:
%%time
from func_ import filter_docs
# NOTE: despite the names, `trainsets` holds the feature Series (X_*) and
# `testsets` the matching label Series (y_*) of all three splits.
trainsets = [X_train, X_test, X_valid]
testsets = [y_train, y_test, y_valid]
# NOTE(review): filter_docs lives in func_ and is not visible here. Its return
# value is discarded, so this call only has an effect if the helper modifies
# the objects it receives in place - confirm.
# NOTE(review): the predicate below is true for the documents that should be
# removed (fewer than 2 tokens), whereas the predicate of the next filter_docs
# call (has_vector_representation) is presumably true for documents to keep.
# Verify which polarity filter_docs expects.
filter_docs(trainsets, testsets, lambda text: (len(text)<2))

Remove sentences that have no representation in the Google News vocabulary, i.e. sentences in which none of the words is found in the Google News pre-trained vectors.

In [None]:
# Second filtering pass over the same `trainsets` / `testsets` lists built in
# the previous cell; the predicate asks has_vector_representation() whether a
# document is covered by the vectors loaded into `Model`.
# NOTE(review): as above, the return value of filter_docs is discarded, so
# this relies on the helper filtering in place - confirm in func_.
from func_ import filter_docs, has_vector_representation
filter_docs(trainsets, testsets, lambda text: has_vector_representation(Model, text))

## Plain average vectors for each sentence

In [None]:
%%time
# Average vectors for each sentence
from func_ import average_vecs
trainVecs = [average_vecs(sentence, Model, 300) for sentence in X_train]
testVecs = [average_vecs(sentence, Model, 300) for sentence in X_test]
validVecs = [average_vecs(sentence, Model, 300) for sentence in X_valid]

## Data Imbalance
Find how imbalanced our datasets are between the two classes.

In [None]:
# Number of instances for each class in each dataset
datasets = {'Train Set': y_train, 'Test Set': y_test, 'Valid Set': y_valid}
summary = '{} Examples:\n    Negative: {}\n    Positive: {} ({:.2f}% of total)\n'

for dataset in datasets:
    # np.unique returns its values sorted, so counts[0] belongs to the label
    # that sorts first and counts[1] to the one that sorts second.
    counts = np.unique(datasets[dataset], return_counts=True)[1]
    neg, pos = counts[0], counts[1]
    print(summary.format(dataset, neg, pos, 100 * pos / (neg + pos)))

## Stemming
Apply stemming on each sentence. Then calculate their average vectors

In [None]:
%%time
from func_ import stemming
stem_train = [stemming(sentence) for sentence in X_train]
stem_test = [stemming(sentence) for sentence in X_test]
stem_valid = [stemming(sentence) for sentence in X_valid]

In [None]:
%%time
from func_ import average_vecs
trainVecs = [average_vecs(sentence, Model, 300) for sentence in stem_train]
testVecs = [average_vecs(sentence, Model, 300) for sentence in stem_test]
validVecs = [average_vecs(sentence, Model, 300) for sentence in stem_valid]

## Lemmatization
Apply lemmatization to each sentence, then calculate the average vector of each lemmatized sentence.

In [None]:
%%time
from func_ import lemmatization 
lem_train = [lemmatization(sentence) for sentence in X_train]
lem_test = [lemmatization(sentence) for sentence in X_test]
lem_valid = [lemmatization(sentence) for sentence in X_valid]

In [None]:
%%time
from func_ import average_vecs
trainVecs = [average_vecs(sentence, Model, 300) for sentence in lem_train]
testVecs = [average_vecs(sentence, Model, 300) for sentence in lem_test]
validVecs = [average_vecs(sentence, Model, 300) for sentence in lem_valid]

## Count Vectorizer
Apply CountVectorizer() and TfidfVectorizer() with DecisionTreeClassifier & SVM

In [None]:
# Bag-of-words counts: the vocabulary is learned from the training sentences
# only and then re-used unchanged for the test and validation sentences.
vec = CountVectorizer(stop_words='english')
train_countvec = vec.fit_transform(X_train_raw)
test_countvec, valid_countvec = (
    vec.transform(raw_text) for raw_text in (X_test_raw, X_valid_raw)
)

### Decision tree with Random Grid Search

In [None]:
%%time
from func_ import classifiers_gs
model_list = [DecisionTreeClassifier()]
model_names = ['DTC_countvec_rgs']

grids = [{
    'random_state': [42],
    'max_depth': range(5,1000,5),
    'max_features': ['sqrt','log2'],
    'min_samples_leaf': range(1,36),
    'min_samples_split': range(1,26)
    }]

results, best_models, timer = classifiers_gs(model_list, model_names, grids, train_countvec, test_countvec, valid_countvec, y_train, y_test, y_valid)


### SVM with Grid Search
Applying 9 specific combinations of parameters as the model takes too long to converge

In [None]:
%%time
from func_ import classifiers_gs_svm
# Default parameters of SVM are C: 1.0, gamma: scale  
model = SVC(probability=True)
model_name = "SVM_countvec_rgs_"

grids = [{'gamma': [4]},
         {'gamma': [6]},
         {'gamma': [8]},
         {'C': [20]},
         {'C': [50]},
         {'C': [100]},
         {'gamma': [4], 'C': [20]},
         {'gamma': [6], 'C': [50]},
         {'gamma': [8], 'C': [100]}]


results = classifiers_gs_svm(model, model_name, grids, train_countvec, test_countvec, valid_countvec, y_train, y_test, y_valid)
results

## Tfidf Vectorizer

In [None]:
%%time
from func_ import countvec
vec = TfidfVectorizer(stop_words='english')
model_list = [DecisionTreeClassifier, SVC]
countvec(vec, model_list, X_train_raw,X_test_raw,X_valid_raw,y_train,y_test,y_valid)

## Hashing Vectorizer
Apply HashingVectorizer() with DecisionTreeClassifier and SVM. Then compare the recall_score on the test and validation sets.

In [None]:
%%time
from func_ import hashvec
model_list = [DecisionTreeClassifier, SVC]
hashvec(model_list, X_train_raw,X_test_raw,X_valid_raw,y_train,y_test,y_valid,300)

## Plain Models

### Decision Tree Classifier

In [None]:
%%time
from func_ import classifiers
model_list = [DecisionTreeClassifier(random_state=42)]
model_names = ['DecisionTreeClassifier']

results, precision_test, recall_test, precision_valid, recall_valid = classifiers(model_list, model_names, trainVecs, testVecs, validVecs, y_train, y_test, y_valid)

### SVM

In [None]:
%%time
from func_ import classifiers
model_list = [SVC(random_state=42, probability=True)]
model_names = ['SVM']

results, precision_test, recall_test, precision_valid, recall_valid = classifiers(model_list, model_names, trainVecs, testVecs, validVecs, y_train, y_test, y_valid)

### Decision Tree Classifier with random grid search

In [None]:
%%time
from func_ import classifiers_gs
model_list = [DecisionTreeClassifier]
model_names = ['DecisionTreeClassifier']

grids = [{
    'random_state': [42],
    'max_depth': range(5,1000,5),
    'max_features': ['sqrt','log2', None],
    'min_samples_leaf': range(1,36),
    'min_samples_split': range(1,26)
    }]

results, best_models, timer = classifiers_gs(model_list, model_names, grids, trainVecs, testVecs, validVecs, y_train, y_test, y_valid)

### SVM with Grid Search
Applying 7 specific combinations of parameters as the model takes too long to converge

In [None]:
%%time
from func_ import svm_grid_search
model_name = "SVM_gs_lem"

grids = [
    {'gamma': 0.1, 'C': 10.0},
    {'gamma': 0.2, 'C': 20.0},
    {'gamma': 0.3, 'C': 30.0},
    {'gamma': 0.4, 'C': 40.0},
    {'gamma': 0.5, 'C': 50.0},
    {'gamma': 0.6, 'C': 70.0},
    {'gamma': 0.7, 'C': 100.0}]

results = svm_grid_search(trainVecs, validVecs, y_train, y_valid, model_name, grids)
results

In [None]:
# Write the current `results` frame to CSV without the index column.
# NOTE(review): the file name says "plain", while the preceding grid search is
# named "SVM_gs_lem" and, when the notebook is run top to bottom, uses the
# lemmatized vectors - confirm the intended file name.
results.to_csv('results_gs_plain.csv', index=False)

## Deep Learning

In [None]:
# convert data to arrays
# Stack the per-sentence vectors of each split into one array per split.
Xtrain, Xvalid, Xtest = (
    np.stack(vectors) for vectors in (trainVecs, validVecs, testVecs)
)
# Labels become float32 arrays; this requires numeric (already encoded) labels.
y_train, y_valid, y_test = (
    np.array(labels).astype('float32') for labels in (y_train, y_valid, y_test)
)

In [None]:
# Count how often each label value occurs in the training labels.
unique, counts = np.unique(y_train, return_counts=True)
neg = counts[0]  # occurrences of the label value that sorts first
pos = counts[1]  # occurrences of the label value that sorts second
# correct initial bias according to: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
output_bias = np.log([pos / neg])
output_bias

In [None]:
# "balanced" weights for the label values found in y_train, stored as a dict
# keyed 0 and 1 (first and second label value in sorted order).
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
)
class_weight = {0: balanced_weights[0], 1: balanced_weights[1]}

## Fit data

In [None]:
from func_ import build_gs_nn

# Search space handed to build_gs_nn():
# 3 * 4 * 2 * 3 * 4 * 4 = 1152 combinations (1152 models).
params_grid = {
    'layers': [2, 3, 4],
    'neurons': [200, 300, 400, 500],
    'learning_rate': [1e-2, 1e-3],
    'dropout_rates': [0.5, 0.6, 0.7],
    'epochs': [2, 5, 8, 10],
    'batch_sizes': [350, 450, 600, 1000],
}

results_lem = build_gs_nn(
    Xtrain, y_train,
    Xvalid, y_valid,
    Xtest, y_test,
    params_grid,
)

In [None]:
# Write the frame returned by build_gs_nn() to CSV without the index column.
results_lem.to_csv('results_lem.csv', index=False)

In [None]:
# Show the five rows with the highest values in the '99%' column.
results_lem.sort_values(by='99%', ascending=False).head()

In [None]:
# Weighted score per row: 5% of the '90%' column, 5% of the '95%' column and
# 90% of the '99%' column.
# The score is computed column-wise so that every row receives a value.
# Iterating over a DataFrame yields its column labels rather than its rows, so
# a Python loop with enumerate(results_lem) reaches only the first few rows,
# and assigning through results_lem['Average'].iloc[...] is chained indexing.
# NOTE(review): the weights already sum to 1, so the division by 3 only
# rescales the score (the ranking is unaffected). It is kept so the values
# stay comparable with earlier runs - confirm whether it is intended.
results_lem['Average'] = (
    results_lem['90%'] * 0.05
    + results_lem['95%'] * 0.05
    + results_lem['99%'] * 0.9
) / 3

In [None]:
# Show the five rows with the highest values in the 'Average' column.
results_lem.sort_values(by="Average", ascending=False).head()

In [None]:
# For each target recall, report the precision stored at the first position of
# recall_test whose value rounds (2 decimals) to that target.
# recall_test / precision_test are the values returned by the most recently
# run classifiers() cell.
recall_scores = [0.90, 0.95, 0.99]
for recall in recall_scores:
    for row, score in enumerate(recall_test):
        if round(score, 2) == recall:
            # `recall` is a fraction (0.9), so it is formatted as a percentage
            # (90%) instead of being printed verbatim in front of a '%' sign.
            print(f"Precision at {recall:.0%} recall is {precision_test[row]}")
            break
    else:
        # Make a missing match visible instead of silently printing nothing.
        print(f"No entry with recall {recall:.0%} found")

In [None]:
# Load previously saved results and show the ten rows with the highest '99%'.
# NOTE(review): no cell in this notebook writes "results.csv"; it presumably
# comes from another run - confirm its origin.
results = pd.read_csv("results.csv")
results.sort_values(by='99%', ascending=False).head(10)

In [None]:
# (number of rows, number of columns) of the `results` frame.
results.shape

In [None]:
# One-row table holding, per target recall, the precision found at the first
# position of recall_test whose value rounds (2 decimals) to that target.
# Targets without a match keep their initial missing value.
recall_scores = [0.90, 0.95, 0.99]
performance = pd.DataFrame(index=range(1), columns=["90%", "95%", "99%"])
counter = 0  # the table has a single row, so the row label never changes
for recall in recall_scores:
    column = str(int(recall*100)) + "%"
    first_match = next(
        (row for row, score in enumerate(recall_test) if round(score, 2) == recall),
        None,
    )
    if first_match is not None:
        performance.loc[counter, column] = precision_test[first_match]
performance

In [None]:
# Get parameters of a model
model_filename = 'SVM_avg_model.sav'
# The context manager closes the file handle as soon as the model is loaded,
# which a bare open() inside pickle.load() never does explicitly.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load model files that were produced by a trusted source.
with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)
model.get_params()