# Classification - Functions

Authors: Gustavo FLEURY && Induraj RAMAMURTHY

Project: https://github.com/gustavofleury/Audit_Reports_NLP

In [0]:
#Libraries
import numpy as np 
import pandas as pd
import re
import nltk

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection  import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.utils import resample

from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler 

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE 

from sklearn.preprocessing import label_binarize

import gensim

# try:
#   # Use the %tensorflow_version magic if in colab.
#   %tensorflow_version 2.x
# except Exception:
#   pass

import tensorflow as tf
from tensorflow import keras

import matplotlib.style as style

import pickle

# !git clone https://github.com/facebookresearch/fastText.git
# %cd fastText
# !pip install .
# import fasttext
# import fasttext.util

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Normalize

In [0]:
#NORMALIZE_TOKEN
# Module-level NLP resources shared by normalize_token_corpus() below.
# Portuguese stop-word list from NLTK (the 'stopwords' corpus is downloaded in the import cell).
stop_words = nltk.corpus.stopwords.words('portuguese')
# Tokenizer whose tokens are the runs of word characters matched by \w+.
tokenizer = RegexpTokenizer(r'\w+')
# Snowball stemmer configured for Portuguese.
stemmer = SnowballStemmer('portuguese')

def removeUnWantedWords(text):
  """Blank out digits and Portuguese month names in ``text``.

  Every digit is replaced by a space (which removes numbers, dates and page
  numbers), then every month name that appears as a whole word is replaced
  by a space. The result is stripped of leading/trailing whitespace.

  Args:
    text: Raw document text (``str``).

  Returns:
    The cleaned text. Inner runs of spaces are not collapsed; callers are
    expected to tokenize the result.
  """
  # Raw strings keep the regex escapes (\d, \b) out of Python's own
  # string-escape processing.
  cleanText = re.sub(r'\d', " ", text)

  # Month names, with and without the cedilla in "março".
  month = r"(?:janeiro|fevereiro|mar[cç]o|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)"

  # Word boundaries (instead of mandatory surrounding whitespace) also catch
  # months at the start/end of the text, next to punctuation, and two months
  # in a row. IGNORECASE is needed because this runs before lower-casing.
  cleanText = re.sub(r'\b' + month + r'\b', " ", cleanText, flags=re.IGNORECASE)
  return cleanText.strip()

def normalize_token_corpus(corpus, 
                           stem=False,
                           stop_words_IF=True,
                           tokenize=True,
                           nbWords=0):
    """Clean, tokenize and optionally filter/stem every document of ``corpus``.

    Each text goes through ``removeUnWantedWords``, is lower-cased and split
    with the module-level ``tokenizer``.

    Args:
        corpus: Iterable of raw text documents.
        stem: When true, stem every token with the module-level ``stemmer``.
        stop_words_IF: When true, drop tokens found in the module-level
            ``stop_words`` list.
        tokenize: When true each document becomes a list of tokens; otherwise
            the tokens are joined back into one space-separated string.
        nbWords: Maximum number of tokens kept per document; ``0`` keeps all.

    Returns:
        A list with one entry per document (token list or string, depending
        on ``tokenize``).
    """
    # Build the lookup once per call: membership tests on a set are O(1),
    # whereas scanning the stop-word list for every token is O(len(list)).
    stop_word_set = set(stop_words) if stop_words_IF else None

    normalized_token_corpus = []
    for text in corpus:
        # Remove numbers/dates/page numbers, then tokenize the lower-cased text.
        tokens = tokenizer.tokenize(removeUnWantedWords(text).lower())

        # Pre-truncate to 2*nbWords so the filtering/stemming below handles
        # fewer tokens while still leaving room for dropped stop words.
        if nbWords != 0:
            tokens = tokens[:2 * nbWords]

        if stop_words_IF:
            tokens = [tok for tok in tokens if tok not in stop_word_set]

        if stem:
            tokens = [stemmer.stem(tok) for tok in tokens]

        # Final cut to the first nbWords tokens.
        if nbWords != 0:
            tokens = tokens[:nbWords]

        normalized_token_corpus.append(tokens if tokenize else " ".join(tokens))

    return normalized_token_corpus

In [0]:
#TF_IDF
def tf_idf(corpus,
           ngram_range=(1, 1),
           min_df=0.0,
           max_df=1.0):
  """Fit a ``TfidfVectorizer`` on ``corpus`` and vectorize it.

  Args:
    corpus: Iterable of text documents.
    ngram_range: ``(min_n, max_n)`` n-gram sizes passed to the vectorizer.
    min_df: Minimum document frequency passed to the vectorizer.
    max_df: Maximum document frequency passed to the vectorizer.

  Returns:
    ``(vectorizer, feature_matrix)``: the fitted vectorizer and the TF-IDF
    matrix cast to ``float``.
  """
  vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=max_df, min_df=min_df)
  tfidf_weights = vectorizer.fit_transform(corpus)
  return vectorizer, tfidf_weights.astype(float)

def display_features(features, feature_names):
    """Print ``features`` as a table whose columns are named by ``feature_names``."""
    table = pd.DataFrame(features, columns=feature_names)
    print(table)

In [0]:
def feature_engineering(feature_eng_method, corpus, min_df, max_df, n1, n2, dimension_reduction=True, reduction_method='pca'):
  """Vectorize ``corpus`` and optionally project the features to 2 dimensions.

  Args:
    feature_eng_method: Feature extraction method; only ``'tf_idf'`` is supported.
    corpus: Iterable of text documents.
    min_df: Minimum document frequency for the TF-IDF vectorizer.
    max_df: Maximum document frequency for the TF-IDF vectorizer.
    n1: Lower bound of the n-gram range.
    n2: Upper bound of the n-gram range.
    dimension_reduction: When truthy, reduce the feature matrix with
      ``reduction_method``; otherwise return the TF-IDF matrix unchanged.
    reduction_method: ``'pca'`` (``pca_reduction``) or ``'tsne'`` (``tsne_reduction``).

  Returns:
    ``(vectorizer, feature_matrix)``.

  Raises:
    ValueError: If ``feature_eng_method`` is not supported, or if reduction
      is requested with an unknown ``reduction_method``.
  """
  # Validate up front: previously an unknown method left `vectorizer` unbound
  # and an unknown reduction silently returned None.
  if feature_eng_method != 'tf_idf':
    raise ValueError("Unsupported feature_eng_method: %r (expected 'tf_idf')" % (feature_eng_method,))
  if dimension_reduction and reduction_method not in ('pca', 'tsne'):
    raise ValueError("Unsupported reduction_method: %r (expected 'pca' or 'tsne')" % (reduction_method,))

  vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=(n1, n2))
  feature_matrix = vectorizer.fit_transform(corpus).astype(float)

  if dimension_reduction:
    if reduction_method == 'pca':
      feature_matrix = pca_reduction(feature_matrix)
    else:
      feature_matrix = tsne_reduction(feature_matrix)

  return vectorizer, feature_matrix

In [0]:
def pca_reduction(feature_matrix,convert_to_array='yes'):
  """Project ``feature_matrix`` onto 2 principal components.

  Args:
    feature_matrix: Feature matrix; must expose ``.toarray()`` when
      ``convert_to_array`` is ``'yes'``.
    convert_to_array: ``'yes'`` to call ``.toarray()`` on the input before
      fitting; any other value passes the input to PCA as is.

  Returns:
    The result of ``PCA(n_components=2, random_state=0).fit_transform``.
  """
  reducer = PCA(n_components=2, random_state=0)
  pca_input = feature_matrix.toarray() if convert_to_array=='yes' else feature_matrix
  return reducer.fit_transform(pca_input)

def tsne_reduction(feature_matrix, convert_to_array='yes'):
  """Embed ``feature_matrix`` in 2 dimensions with t-SNE.

  Side effect: switches numpy printing to ``suppress=True`` globally.

  Args:
    feature_matrix: Feature matrix; must expose ``.toarray()`` when
      ``convert_to_array`` is ``'yes'``.
    convert_to_array: ``'yes'`` to call ``.toarray()`` on the input before
      fitting; any other value passes the input to t-SNE as is.

  Returns:
    The result of ``TSNE(n_components=2, random_state=0).fit_transform``.
  """
  embedder = TSNE(n_components=2, random_state=0)
  np.set_printoptions(suppress=True)
  tsne_input = feature_matrix
  if convert_to_array=='yes':
    tsne_input = feature_matrix.toarray()
  return embedder.fit_transform(tsne_input)

In [0]:
# Word2Vector
# Create the model with:
def w2v_createModel(corpus_token):
  """Train a gensim ``Word2Vec`` model on the tokenized corpus.

  Args:
    corpus_token: Tokenized documents handed to ``Word2Vec``.

  Returns:
    The trained model, built with min_count=2, window=30, size=500,
    sample=1e-3 and iter=100.
  """
  # NOTE(review): `size`/`iter` look like the gensim 3.x keyword names - confirm
  # the pinned gensim version before upgrading.
  hyperparams = dict(min_count=2, window=30, size=500, sample=1e-3, iter=100)
  return gensim.models.Word2Vec(corpus_token, **hyperparams)

def w2v_applyModel(model, corpus_token):  # historical name - kept for backward compatibility.
  """Alias of ``applyModel``: forwards ``model`` and ``corpus_token`` unchanged and returns its result."""
  return applyModel(model, corpus_token)

def applyModel(model, corpus_token):  
  """Vectorize every sentence of ``corpus_token`` with ``sent_vectorizer``.

  Args:
    model: Word-vector lookup passed through to ``sent_vectorizer``.
    corpus_token: Iterable of tokenized sentences.

  Returns:
    A list with one ``sent_vectorizer`` result per sentence, in input order.
  """
  return [sent_vectorizer(sentence, model) for sentence in corpus_token]

def sent_vectorizer(sent, model):
    """Average the vectors of the words of ``sent`` that ``model`` knows.

    Args:
        sent: Iterable of tokens.
        model: Mapping-like word-vector lookup; ``model[word]`` must raise
            ``KeyError`` for unknown words.

    Returns:
        A numpy array holding the mean of the found word vectors. When no
        word is found an EMPTY array is returned (not a zero vector), so the
        result has a different length from regular sentence vectors.

    Raises:
        Any error other than ``KeyError`` raised by the lookup or by the
        vector addition propagates to the caller.
    """
    sent_vec = []
    numw = 0
    for w in sent:
        try:
            word_vec = model[w]
        except KeyError:
            # Out-of-vocabulary word: skip it. Only KeyError is tolerated so
            # real failures (shape mismatch, interrupts, ...) are not hidden.
            continue
        sent_vec = word_vec if numw == 0 else np.add(sent_vec, word_vec)
        numw += 1

    if numw == 0:
        # No known word: return the empty array explicitly instead of
        # relying on dividing an empty array by zero.
        return np.asarray(sent_vec)
    return np.asarray(sent_vec) / numw



In [0]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Mean of the vectors of the ``words`` that belong to ``vocabulary``.

    Args:
        words: Iterable of tokens.
        model: Word-vector lookup indexed as ``model[word]``.
        vocabulary: Container of known words used for the membership test.
        num_features: Length of the returned vector.

    Returns:
        A float64 array of length ``num_features``: the average of the
        vectors of the known words, or all zeros when none is known.
    """
    running_sum = np.zeros((num_features,), dtype="float64")
    known_words = [word for word in words if word in vocabulary]

    # Accumulate one vector at a time, in input order.
    for word in known_words:
        running_sum = np.add(running_sum, model[word])

    if not known_words:
        return running_sum
    return np.divide(running_sum, float(len(known_words)))
    
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per tokenized sentence of ``corpus``.

    Args:
        corpus: Iterable of tokenized sentences.
        model: Model exposing ``model.wv.index2word`` (its vocabulary) and
            the lookups used by ``average_word_vectors``.
        num_features: Length of each sentence vector.

    Returns:
        ``np.array`` of the per-sentence results of ``average_word_vectors``.
    """
    known_words = set(model.wv.index2word)
    sentence_vectors = []
    for tokenized_sentence in corpus:
        sentence_vectors.append(
            average_word_vectors(tokenized_sentence, model, known_words, num_features)
        )
    return np.array(sentence_vectors)

In [0]:
def clean_features_W2V(lfeature, lLabels):  #Delete Vectors with small size.
  """Drop every feature vector (and its label) shorter than the longest one.

  The two lists are filtered IN PLACE and the same list objects are
  returned, so the caller's lists are modified.

  Args:
    lfeature: List of feature vectors (anything supporting ``len``).
    lLabels: List of labels, aligned index by index with ``lfeature``.

  Returns:
    ``(lfeature, lLabels)`` after removal of the entries whose vector length
    differs from the maximum length found.
  """
  # Remember the size BEFORE popping; the inputs are mutated below, so
  # measuring afterwards would always report the new length.
  original_len = len(lfeature)
  # `default=0` keeps an empty input from crashing in max().
  maxLen = max((len(x) for x in lfeature), default=0)

  # Walk backwards so pop() does not shift the indices still to be visited.
  for i in range(original_len - 1, -1, -1):
    if len(lfeature[i]) != maxLen:
      lfeature.pop(i)
      lLabels.pop(i)

  print("Length Original: ", original_len, " New Length: ", len(lfeature) )
  return lfeature, lLabels

In [0]:
def printTokensInfo(c, ct, cts, ctw):
  """Print the length and the first 100 items of the first element of each input.

  Args:
    c: Raw corpus.
    ct: Tokenized corpus.
    cts: Tokenized and stemmed corpus.
    ctw: Word2Vec representation of the corpus.
  """
  print('# To see the first element LENGTH')
  # str() lets every line work whether the first element is a string or a
  # list of tokens/values.
  print("Corpus    : " + str(len(c[0])) + " " + str(c[0][:100]))
  print("Token     : " + str(len(ct[0])) + " " + str(ct[0][:100]))
  print("Token Stem: " + str(len(cts[0])) + " " + str(cts[0][:100]))
  # Report the W2V input itself (this line used to print `cts` again).
  print("Token W2V : " + str(len(ctw[0])) + " " + str(ctw[0][:100]) )

In [0]:
def binarize_Label_Organize( llabels ):
  """Turn 'MEDIUM-LOW'/'HIGH' labels into a flat list of binary values.

  Args:
    llabels: Sequence of labels drawn from ``['MEDIUM-LOW', 'HIGH']``.

  Returns:
    A Python list holding the first column of ``label_binarize``'s output,
    one value per input label (presumably 1 for 'HIGH' and 0 for
    'MEDIUM-LOW' - confirm against scikit-learn's binary-case behaviour).
  """
  binarized_rows = label_binarize(llabels, classes=['MEDIUM-LOW', 'HIGH'])
  return [row[0] for row in binarized_rows]

In [0]:
#Glove
def createGloveDict(txtFile=DATASETS_FOLDER + "glove_s300.txt", resultFile=DATASETS_FOLDER +'GloveDict.pkl'): 
  """Convert a GloVe text file into a pickled ``{word: vector}`` dictionary.

  Every non-empty line is split on whitespace: the first field is the word,
  the remaining fields are parsed as a float32 vector. Lines whose numeric
  part cannot be parsed are skipped.

  Args:
    txtFile: Path of the GloVe text file to read.
    resultFile: Path of the pickle file to write.
  """
  #Load from TXT file
  glove_model = {}
  # Honour the `txtFile` argument (the path used to be hard-coded). The
  # encoding is explicit so non-ASCII words do not depend on the platform default.
  with open(txtFile, 'r', encoding='utf-8') as f:
      for line in f:
          values = line.split()
          if not values:
              # Blank line: nothing to parse.
              continue
          try:
              glove_model[values[0]] = np.asarray(values[1:], "float32")
          except ValueError:
              # Malformed row (non-numeric vector component): skip it.
              continue

  #Save to Pickle File
  with open(resultFile, 'wb') as handle:
    pickle.dump(glove_model, handle)

# Oversampling

In [0]:
def overSamplingRISKs(lcorpus, llabels, percentage=0.3):
  """Oversample the "HIGH" risk documents with replacement.

  The "HIGH" rows are resampled (``random_state=27``) to
  ``round(n_other * percentage / (1 - percentage))`` rows, which makes
  ``percentage`` their share of the returned data; the other rows are kept
  as they are.

  Args:
    lcorpus: Sequence of documents.
    llabels: Sequence of risk labels aligned with ``lcorpus``.
    percentage: Target fraction of "HIGH" rows in the output.

  Returns:
    ``(lcorpus1, llabels1)``: the non-HIGH rows followed by the resampled
    HIGH rows, as two lists. The new class counts are printed.
  """
  # Pair texts with their labels so both are resampled together.
  frame = pd.DataFrame(list(zip(lcorpus, llabels)),
                       columns =['ConstText', 'RISK'])

  other_rows = frame[frame['RISK'] != "HIGH"]
  high_rows = frame[frame['RISK'] == "HIGH"]

  # Number of HIGH rows needed to reach the requested share.
  target_high = int(round( len(other_rows)*percentage/(1-percentage) ))
  high_upsampled = resample(high_rows,
                            replace=True,
                            n_samples=target_high,
                            random_state=27)

  balanced = pd.concat([other_rows, high_upsampled])

  # Rebuild the corpus/label lists from the combined frame.
  lcorpus1 = balanced['ConstText'].to_list()
  llabels1 = balanced['RISK'].to_list()

  # Show the new class counts.
  print( balanced['RISK'].value_counts() )

  return lcorpus1, llabels1

# Train Model

In [0]:
def get_metrics(true_labels, predicted_labels):
  """Compute, print and return accuracy, precision, recall and F1.

  Precision, recall and F1 use ``average='binary'``; every score is rounded
  to 2 decimals.

  Args:
    true_labels: Ground-truth labels.
    predicted_labels: Labels predicted by a model.

  Returns:
    ``(accuracy, precision, recall, f1)``.
  """
  acc = np.round(metrics.accuracy_score(true_labels, predicted_labels), 2)

  binary_scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
  pre, rec, f1s = [
    np.round(scorer(true_labels, predicted_labels, average='binary'), 2)
    for scorer in binary_scorers
  ]

  print ('Accuracy:', acc, ' Precision:', pre, ' Recall:', rec, ' F1 Score:', f1s )
  return acc, pre, rec, f1s

def train_predict_evaluate_model(classifier, train_features, train_labels1, test_features, test_labels1):
  """Fit ``classifier`` and score it on the training set, then on the test set.

  Args:
    classifier: Estimator exposing ``fit`` and ``predict``.
    train_features: Training feature matrix.
    train_labels1: Training labels.
    test_features: Test feature matrix.
    test_labels1: Test labels.

  Returns:
    An 8-tuple: accuracy, precision, recall, F1 on the training data followed
    by the same four scores on the test data (each pair of lines is printed
    by ``get_metrics``).
  """
  classifier.fit(train_features, train_labels1)
  train_scores = get_metrics(train_labels1, classifier.predict(train_features))
  test_scores = get_metrics(test_labels1, classifier.predict(test_features))
  return tuple(train_scores) + tuple(test_scores)


In [0]:
# LSTM
def train_LSTM(train, train_labels, test, test_labels):
  """Train a bidirectional LSTM classifier with two output classes.

  Args:
    train: Training inputs - assumed to be shaped (samples, n_features, 1),
      matching the ``input_shape`` given to the LSTM; TODO confirm with callers.
    train_labels: Integer class labels for ``train``.
    test: Validation inputs, shaped like ``train``.
    test_labels: Integer class labels for ``test``.

  Returns:
    The fitted ``keras.Sequential`` model (50 epochs, ``test`` used as
    validation data).
  """
  labels = np.asarray(train_labels )
  labels_test = np.asarray(test_labels) 

  # Take the sequence length from the data instead of relying on a global
  # `n_features` defined in some other notebook cell.
  n_features = np.shape(train)[1]

  #MODEL
  emdedding_size = 32

  model = keras.Sequential([
    keras.layers.Bidirectional( keras.layers.LSTM(units=emdedding_size, input_shape=(n_features,1)) ),
    keras.layers.Dense(emdedding_size, activation='relu'),
    keras.layers.Dense(units=2, activation='softmax')
  ]) 

  # The last layer already applies softmax, so the loss receives
  # probabilities: from_logits must be False (True would treat the softmax
  # output as raw logits).
  model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

  model.fit(train, labels, epochs=50, validation_data=(test, labels_test))

  return model


def train_LSTM_w2v(w2v_model, train, train_labels, test, test_labels):
  """Train a bidirectional LSTM on top of frozen Word2Vec embeddings.

  Args:
    w2v_model: Trained Word2Vec model; its ``wv.syn0`` matrix initialises the
      (non-trainable) embedding layer.
    train: 2-D array of training inputs - presumably word indices into the
      Word2Vec vocabulary, shaped (samples, sequence_length); verify against caller.
    train_labels: Integer class labels for ``train``.
    test: Validation inputs, shaped like ``train``.
    test_labels: Integer class labels for ``test``.

  Returns:
    The fitted ``keras.Sequential`` model (50 epochs, ``test`` used as
    validation data).
  """
  labels = np.asarray(train_labels )
  labels_test = np.asarray(test_labels)

  #MODEL
  n_cols = train.shape[1]
  pretrained_weights = w2v_model.wv.syn0
  vocab_size, emdedding_size = pretrained_weights.shape

  model = keras.Sequential([
    keras.layers.Embedding( input_dim = vocab_size,
        output_dim = emdedding_size,
        weights=[pretrained_weights],
        # Sequence length of the inputs; must agree with input_shape (it used
        # to be set to the embedding size).
        input_length=n_cols,
        input_shape=(n_cols, ), 
        trainable=False,
        ), 
    keras.layers.Bidirectional( keras.layers.LSTM(units=emdedding_size, dropout=0.2, recurrent_dropout=0.2) ),
    keras.layers.Dense(emdedding_size, activation='relu'),
    keras.layers.Dense(units=2, activation='softmax')
  ]) 

  # The output layer applies softmax, so the loss must be told it receives
  # probabilities rather than logits.
  model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

  model.fit(train, labels, epochs=50, validation_data=(test, labels_test))

  return model

def predict_evaluate_LSTM(model, train_features, train_labels, test_features, test_labels):
  """Score a trained Keras classifier on the training and the test data.

  The predicted class of each sample is the argmax of the model output row.

  Args:
    model: Trained model whose ``predict`` returns one score row per sample.
    train_features: Training inputs.
    train_labels: Training labels.
    test_features: Test inputs.
    test_labels: Test labels.

  Returns:
    An 8-tuple: accuracy, precision, recall, F1 on the training data followed
    by the same four scores on the test data (printed by ``get_metrics``).
  """
  # Use the function's own arguments: the body used to read `train`, `test`
  # and `labels`, which only exist if leaked from another notebook cell.
  train_predictions = np.argmax(model.predict(train_features), axis=1)
  acc, pre, rec, f1s = get_metrics(train_labels, train_predictions)

  test_predictions = np.argmax(model.predict(test_features), axis=1)
  accTest, preTest, recTest, f1sTest = get_metrics(test_labels, test_predictions)

  return acc, pre, rec, f1s, accTest, preTest, recTest, f1sTest 

In [0]:
# Train/Execute the ML Models
def collect_Metrics_ML_Models(FEATURE_TYPE, FEATURE_TRAIN, LABELS_TRAIN, FEATURE_TEST, LABELS_TEST):
  """Train and evaluate the classical ML classifiers on one feature set.

  Runs, in order: Naive Bayes (on min-max scaled features), Logistic
  Regression, SVM (``SGDClassifier``), RandomForest and a small MLP. Each
  model is fitted and scored through ``train_predict_evaluate_model``.
  The LSTM models are not part of this loop (see ``predict_evaluate_LSTM``).

  Args:
    FEATURE_TYPE: Name of the feature set, copied into the 'Features' column.
    FEATURE_TRAIN: Training feature matrix.
    LABELS_TRAIN: Training labels.
    FEATURE_TEST: Test feature matrix.
    LABELS_TEST: Test labels.

  Returns:
    A ``pandas.DataFrame`` with one row per classifier and the TRAIN/TEST
    accuracy, precision, recall and F1 score columns.
  """
  # Naive Bayes needs non-negative inputs, so it gets min-max scaled features.
  # The scaler is fitted on the training data only and then applied to BOTH
  # splits (the test split used to be passed unscaled). Test values below the
  # training minimum would scale to negatives, hence the clip at 0.
  scaler = MinMaxScaler()
  scaler.fit(FEATURE_TRAIN)
  feature_matrix_TRAIN_NB = scaler.transform( FEATURE_TRAIN )
  feature_matrix_TEST_NB = np.clip(scaler.transform( FEATURE_TEST ), 0.0, None)

  # (table name, printed banner, estimator, train features, test features)
  experiments = [
    ('Naive Bayes', 'Naive Bayes:', MultinomialNB(alpha=0.01),
     feature_matrix_TRAIN_NB, feature_matrix_TEST_NB),
    ('Logistic Regression', 'Logistic Regression:', LogisticRegression(max_iter=1000),
     FEATURE_TRAIN, FEATURE_TEST),
    ('SVM', 'SVM:', SGDClassifier(),
     FEATURE_TRAIN, FEATURE_TEST),
    ('RandomForest', 'RandomForest:', RandomForestClassifier(n_estimators=10),
     FEATURE_TRAIN, FEATURE_TEST),
    ('NN', 'NN: ', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), max_iter=200, random_state=1),
     FEATURE_TRAIN, FEATURE_TEST),
  ]

  l_metrics=[]
  for method, banner, classifier, train_X, test_X in experiments:
    print(banner)
    acc, pre, rec, f1s, accTest, preTest, recTest, f1sTest = train_predict_evaluate_model(classifier, train_X, LABELS_TRAIN, test_X, LABELS_TEST )
    l_metrics.append([FEATURE_TYPE, method, acc, pre, rec, f1s, accTest, preTest, recTest, f1sTest])

  return pd.DataFrame(l_metrics,columns=['Features','MLMethod','TRAIN-Accuracy','TRAIN-Precision','TRAIN-Recall','TRAIN-F1Score', 'TEST-Accuracy','TEST-Precision','TEST-Recall','TEST-F1Score'])