In [0]:
#Libraries
import numpy as np 
import pandas as pd
import re
import nltk

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection  import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.utils import resample

from sklearn.preprocessing import label_binarize

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE 
from sklearn.decomposition import TruncatedSVD

import gensim

import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Text preprocessing: cleaning and normalization**

In [0]:
# Shared resources for the word-level preprocessing functions in this cell.
# Portuguese stop-word list from NLTK (the corpus is downloaded in the imports cell).
stop_words = nltk.corpus.stopwords.words('portuguese')
# Regex word tokenizer; not referenced by the functions visible in this section.
tokenizer = RegexpTokenizer(r'\w+')
# Snowball stemmer for Portuguese, used by normalize().
stemmer = SnowballStemmer('portuguese')
#word_token_final_list_st =[]

_MONTH = r"(janeiro|fevereiro|marco|abril|maio|junho|julho|agosto|setembro|outubro|novembro|dezembro)"

# Ordered (pattern, replacement) rules applied by cleantext(). The order matters:
# later rules rely on what earlier rules have already removed or collapsed.
_CLEAN_RULES = [(re.compile(pattern), replacement) for pattern, replacement in (
  # parenthesised fragments
  (r'\(.*?\)', ''),
  # dates such as "5 de marco de 2019", from the most to the least complete form
  (r'(\s\d{1,}\sde\s' + _MONTH + r'\sde\s\d{4})', ' '),
  (r'(\d{1,}\sde\s' + _MONTH + r'\sde\s\d{4})', ' '),
  (r'(\sde\s' + _MONTH + r'\sde\s\d{4})', ' '),
  (r'(\s' + _MONTH + r'\sde\s\d{4})', ' '),
  # the word "mil", but only when it is not part of a longer word
  # (a bare 'mil' pattern also destroyed "familia", "militar", ...)
  (r'(?<![^\W\d_])mil(?![^\W\d_])', ' '),
  # currency amounts introduced by "r$"
  (r'r\$[\s+]?[\d+]?[\.]?[\,]?[\d{1,}]?[\.]?[\d{1,}]?[\.]?[\,]?[\d{1,}]?', ' '),
  (r'\(.*?\)', ' '),
  (r'\sde[\s{1,}]?de\s', ' '),
  # numbered references ("no 12/2019", "n. 8.666/93", ...) and list markers
  (r'\sno[\s]?[\d]{1,}\/[\d]{1,}', ' '),
  (r'\s\d\.\s', ' '),
  (r'\s\w\.\s', ' '),
  (r'\sno[\s]?[\d]{1,}[^\w]?[\d]{1,}[\S\W\D]?[\d]{1,}', ' '),
  (r'\sn[\.]?[\s]?[\d]{1,}[^\w]?[\d]{1,}[\S\W\D]?[\d]{1,}', ' '),
  # The replacements below used to be written as '\sde\s\lei\sart\s' etc.
  # "\s" is not valid in a replacement template (re.error on Python >= 3.7,
  # literal backslashes before that), so plain spaces are used instead.
  (r'de[\s][\.][\s]lei[\s][\W][\D]de[\s][\W][\D]art[\.]', ' de lei art '),
  (r'[\s][\.][\s]lei[\s][\W][\D]de[\s][\W][\D]art[\.]', ' lei de art '),
  (r'(\s[a-zA-Z]\)\s)|(\s[a-zA-Z]\-\s)', ' '),
  # URLs, digits and every symbol that the character class does not whitelist
  (r'(http[s]?...\w*.?\w*\.?\w*.*?\s)|[^\w\s{1}\.\?\!]|[\d]', ' '),
  # roman numerals
  (r'(\si{1,}[v]?[\W])|(\svi{1,}[\W])|(\six[\W])|(xi{1,}[\W])|(x{1,}[\W])', ' '),
  # lone "v": the original non-raw '\s\v\s' matched a vertical-tab character
  (r'\sv\s', ' '),
  (r'\sde\s[\W]', ' '),
  (r'\s[\W]\d{1,}[\W]\d{1,}[\W]', ' '),
  (r'\sarea\s{1,}[\W]', ' '),
  (r'\scplc\sdepcon\su\spgf\sagu\s', ' '),
  (r'\slei\sn\.o\s', ' '),
  (r'(\sm2\s)|(\szm\s)', ' '),
  # normalise runs of dots and the whitespace around them
  (r'\.\.+', '.'),
  (r'\.+', '.'),
  (r'\s\.+', '.'),
  (r'\s\.', '.'),
  (r'\s\.\s{1,}', '.'),
  (r'(\.em\s)|(\sem\.)', ' em '),
  (r'\s{2,}', ' '),
  (r'\.{2,}', ' '),
  (r'\s\.\s{1,}', '.'),
  (r'\.\s\.', '.'),
  # kilometre markers
  (r'(\skm\sa\s)|(\se\skm\s)|(\sao\skm\s)|(\ssendo\skm\s)|(\sao\skm)|(\se\skm)|(\sao\skm)', ' '),
)]

def cleantext(text):
  """Remove noise from a document by applying a fixed sequence of regex rules.

  The rules drop parenthesised text, written-out dates, "r$" amounts,
  numbered references, URLs, digits, most punctuation, roman numerals and a
  few domain-specific tokens, then normalise dots and whitespace.

  The patterns are all lower-case and the month names are unaccented, so the
  text is expected to be lower-cased by the caller (normalize() does this).

  Parameters
  ----------
  text : str
    Text to clean.

  Returns
  -------
  str
    The cleaned text with leading and trailing whitespace stripped.
  """
  cleaned = text
  for pattern, replacement in _CLEAN_RULES:
    cleaned = pattern.sub(replacement, cleaned)
  return cleaned.strip()

def tokenize_word(cleaned_list):
  """Tokenize a cleaned text with the spaCy pipeline ``nlp``.

  ``nlp`` is not defined in this section and must already exist in the
  notebook namespace (Portuguese models: https://spacy.io/models/pt).
  Tokens expose token.text, token.pos_ and token.dep_; only the text is used.
  Background on why lemmatization is avoided for Portuguese:
  https://lars76.github.io/nlp/lemmatize-portuguese/

  Parameters
  ----------
  cleaned_list : str
    Text to tokenize (despite the name, a single string is passed to ``nlp``).

  Returns
  -------
  list of str
    Stripped token texts, skipping tokens whose string form is a single space.
  """
  token_list = []
  for token in nlp(cleaned_list):
    if str(token) == ' ':
      continue
    token_list.append(token.text.strip())
  return token_list

def normalize(corpus,stem=True,no_stem=True):
  """Clean, tokenize and stop-word-filter every document, with optional stemming.

  Each document is truncated to 1,000,000 characters, lower-cased, cleaned with
  cleantext(), tokenized with tokenize_word() and stripped of stop words.

  Parameters
  ----------
  corpus : iterable of str
    Documents to normalize.
  stem, no_stem : bool
    Select which outputs are computed:
    * stem=True,  no_stem=False -> only the stemmed text is built
    * stem=False, no_stem=True  -> only the non-stemmed text is built
    * any other combination     -> both are built

  Returns
  -------
  tuple (list of str, list of str)
    (stemmed texts, non-stemmed texts), one entry per document. A text that
    was not requested is filled with the placeholder string 'NaN'.
  """
  single_char_word = re.compile(r'(\s\w\s)|(^\w\s)')

  def _drop_single_char_words(text):
    # A one-letter word between two words must leave a separator behind.
    # Replacing it with '' (the previous behaviour) fused its neighbours,
    # e.g. "casa a porta" -> "casaporta".
    return single_char_word.sub(lambda m: ' ' if m.group(1) else '', text)

  # '==' rather than truthiness keeps the original flag semantics.
  only_stemmed = stem==True and no_stem==False
  only_non_stemmed = stem==False and no_stem==True

  # Set membership is O(1); stop_words itself is a list.
  stop_word_set = set(stop_words)

  word_token_final_list_stemmed = []
  word_token_final_list_non_stemmed = []
  for document in corpus:
    # NOTE(review): the 1,000,000 cap presumably mirrors spaCy's default
    # nlp.max_length - confirm.
    document = document[0:1000000].lower()
    tokens = tokenize_word(cleantext(document))
    kept_tokens = [tok for tok in tokens if tok not in stop_word_set]

    if only_non_stemmed:
      word_token_final_list_stemmed.append('NaN')
    else:
      stemmed_text = ' '.join(stemmer.stem(tok) for tok in kept_tokens)
      word_token_final_list_stemmed.append(_drop_single_char_words(stemmed_text))

    if only_stemmed:
      word_token_final_list_non_stemmed.append('NaN')
    else:
      plain_text = ' '.join(kept_tokens)
      word_token_final_list_non_stemmed.append(_drop_single_char_words(plain_text))

  return word_token_final_list_stemmed, word_token_final_list_non_stemmed

Sentence tokenizer

In [0]:
from nltk.tokenize import sent_tokenize
# Re-creates the same Portuguese stop-word list that the word-level
# preprocessing cell already defines; the functions in this cell do not use it.
stop_words = nltk.corpus.stopwords.words('portuguese')
# The tokenizer and stemmer from the word-level cell are not needed here.
#tokenizer = RegexpTokenizer(r'\w+')
#stemmer = SnowballStemmer('portuguese')
#word_token_final_list_st =[]

def cleantext1(text):
  """Clean a document for the sentence-level pipeline.

  This function was a line-for-line copy of cleantext(); it now delegates to
  it so the two pipelines cannot drift apart.

  Parameters
  ----------
  text : str
    Text to clean (expected to be lower-cased by the caller).

  Returns
  -------
  str
    The cleaned, stripped text, exactly as returned by cleantext().
  """
  return cleantext(text)

def tokenize_word1(cleaned_list):
  """Tokenize a cleaned text for the sentence-level pipeline.

  This function was an exact copy of tokenize_word(); it now delegates to it
  so there is a single tokenization implementation.

  Parameters
  ----------
  cleaned_list : str
    Text to tokenize.

  Returns
  -------
  list of str
    Token texts, exactly as returned by tokenize_word().
  """
  return tokenize_word(cleaned_list)

def normalize1(corpus):
  """Clean every document and re-join its sentences with single spaces.

  Each document is truncated to 1,000,000 characters, lower-cased, cleaned
  with cleantext1(), split into sentences and joined back with ' '. Stop
  words are kept and no stemming is applied.

  Parameters
  ----------
  corpus : iterable of str
    Documents to normalize.

  Returns
  -------
  list of str
    One normalized text per document.
  """
  word_token_final_list_sent_tokenized = []
  for document in corpus:
    document = document[0:1000000].lower()
    cleaned_text = cleantext1(document)
    # The corpus is Portuguese (see the stop-word list and stemmer), so use the
    # Portuguese Punkt model instead of sent_tokenize's English default.
    sentences = sent_tokenize(cleaned_text, language='portuguese')
    word_token_final_list_sent_tokenized.append(' '.join(sentences))
  return word_token_final_list_sent_tokenized

In [0]:
def pca_reduction(feature_matrix,convert_to_array='yes'):
  """Project a feature matrix onto 2 principal components.

  Parameters
  ----------
  feature_matrix
    Matrix to reduce. With convert_to_array='yes' it must provide
    ``.toarray()``, which is called before fitting.
  convert_to_array : str
    'yes' (default) to call ``.toarray()`` first; any other value passes the
    matrix to PCA unchanged.

  Returns
  -------
  The result of ``PCA(n_components=2, random_state=0).fit_transform``.
  """
  reducer = PCA(n_components=2, random_state=0)
  pca_input = feature_matrix.toarray() if convert_to_array=='yes' else feature_matrix
  return reducer.fit_transform(pca_input)

def tsne_reduction(feature_matrix, convert_to_array='yes'):
  """Embed a feature matrix in 2 dimensions with t-SNE.

  Side effect: sets NumPy's global print options to ``suppress=True``.

  Parameters
  ----------
  feature_matrix
    Matrix to reduce. With convert_to_array='yes' it must provide
    ``.toarray()``, which is called before fitting.
  convert_to_array : str
    'yes' (default) to call ``.toarray()`` first; any other value passes the
    matrix to t-SNE unchanged.

  Returns
  -------
  The result of ``TSNE(n_components=2, random_state=0).fit_transform``.
  """
  embedder = TSNE(n_components=2, random_state=0)
  np.set_printoptions(suppress=True)
  tsne_input = feature_matrix.toarray() if convert_to_array=='yes' else feature_matrix
  return embedder.fit_transform(tsne_input)

def svd(feature_matrix, convert_to_array='yes'):
  """Reduce a feature matrix to 2 components with truncated SVD.

  Side effect: sets NumPy's global print options to ``suppress=True``.

  Parameters
  ----------
  feature_matrix
    Matrix to reduce. With convert_to_array='yes' it must provide
    ``.toarray()``, which is called before fitting.
  convert_to_array : str
    'yes' (default) to call ``.toarray()`` first; any other value passes the
    matrix to TruncatedSVD unchanged.

  Returns
  -------
  The result of ``TruncatedSVD(n_components=2, random_state=0).fit_transform``.
  """
  # Named "reducer" so the local does not shadow this function's own name.
  reducer = TruncatedSVD(n_components=2, random_state=0)
  np.set_printoptions(suppress=True)
  svd_input = feature_matrix.toarray() if convert_to_array=='yes' else feature_matrix
  return reducer.fit_transform(svd_input)

**TFIDF**

In [0]:
def feature_engineering(feature_eng_method, corpus, min_df, max_df, n1, n2, dimension_reduction=True, reduction_method='pca'):
  """Vectorize a corpus with TF-IDF and optionally reduce it to 2 dimensions.

  Parameters
  ----------
  feature_eng_method : str
    Only 'tf_idf' is supported.
  corpus : iterable of str
    Documents to vectorize.
  min_df, max_df
    Passed to TfidfVectorizer.
  n1, n2 : int
    Lower and upper bound of the n-gram range.
  dimension_reduction : bool
    When truthy, the matrix is reduced with ``reduction_method``.
  reduction_method : str
    'pca', 'tsne' or 'svd'. Ignored when dimension_reduction is falsy.

  Returns
  -------
  tuple
    (fitted vectorizer, feature matrix). The matrix is the TF-IDF matrix cast
    to float, or its 2-component reduction.

  Raises
  ------
  ValueError
    If feature_eng_method or reduction_method is not supported. These cases
    previously ended in an UnboundLocalError or a silent ``None`` return.
  """
  if feature_eng_method != 'tf_idf':
    raise ValueError(
      "Unsupported feature_eng_method %r; expected 'tf_idf'" % (feature_eng_method,))
  # Validate before fitting so a typo does not cost a full vectorizer fit.
  if dimension_reduction and reduction_method not in ('pca', 'tsne', 'svd'):
    raise ValueError(
      "Unsupported reduction_method %r; expected 'pca', 'tsne' or 'svd'" % (reduction_method,))

  vectorizer = TfidfVectorizer(min_df=min_df,max_df=max_df,ngram_range=(n1,n2))
  feature_matrix = vectorizer.fit_transform(corpus).astype(float)

  if not dimension_reduction:
    return vectorizer, feature_matrix

  if reduction_method == 'pca':
    feature_matrix = pca_reduction(feature_matrix)
  elif reduction_method == 'tsne':
    feature_matrix = tsne_reduction(feature_matrix)
  else:
    feature_matrix = svd(feature_matrix)
  return vectorizer, feature_matrix

**W2V**

Average w2v and TF-IDF w2v

In [0]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the embedding vectors of the in-vocabulary words of a document.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model
        Either an object exposing its vectors as ``model.wv`` (e.g. a gensim
        Word2Vec model) or any mapping that supports ``model[word]``.
    vocabulary : container of str
        Words that have a vector; all other words are skipped.
    num_features : int
        Dimensionality of the vectors.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``: the mean of the matched
        word vectors, or all zeros when no word is in the vocabulary.
    """
    # Indexing a Word2Vec model directly (model[word]) stops working in
    # gensim 4; the vectors live on model.wv. Plain mappings have no "wv"
    # attribute and are used as they are.
    vectors = getattr(model, 'wv', model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    matched_words = 0

    for word in words:
        if word in vocabulary:
            matched_words += 1
            feature_vector = np.add(feature_vector, vectors[word])

    # Guard against division by zero when nothing matched.
    if matched_words:
        feature_vector = np.divide(feature_vector, matched_words)

    return feature_vector
    
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word-vector per tokenized document.

    Parameters
    ----------
    corpus : iterable of iterable of str
        Tokenized documents.
    model
        Object exposing keyed vectors as ``model.wv`` (e.g. a gensim Word2Vec
        model), or the keyed vectors themselves.
    num_features : int
        Dimensionality of the vectors.

    Returns
    -------
    numpy.ndarray
        Array built from the per-document vectors returned by
        average_word_vectors().
    """
    keyed_vectors = getattr(model, 'wv', model)
    # gensim 4 renamed index2word to index_to_key; support both spellings.
    vocab_words = getattr(keyed_vectors, 'index_to_key', None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    vocabulary = set(vocab_words)

    # Pass the keyed vectors (indexable by word in every gensim version)
    # rather than the model object itself.
    features = [average_word_vectors(tokenized_sentence, keyed_vectors, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)


Unsupervised 

In [0]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from nltk.cluster import KMeansClusterer , euclidean_distance
from sklearn.cluster import DBSCAN
from sklearn.cluster import Birch
from sklearn.cluster import MiniBatchKMeans

def ml_unsupervised(method,feature_matrix,numb_cluster=3):
  """Cluster a feature matrix with the selected algorithm.

  Parameters
  ----------
  method : str
    'kmeans_scikit'   -> sklearn KMeans (max_iter=100, random_state=12)
    'kmeans_nltk'     -> nltk KMeansClusterer (euclidean distance, 25 repeats)
    'minibatchkmeans' -> sklearn MiniBatchKMeans (random_state=12)
  feature_matrix
    Samples to cluster, passed unchanged to the chosen clusterer.
  numb_cluster : int
    Number of clusters (default 3).

  Returns
  -------
  tuple
    (fitted clusterer, cluster assignment of every sample).

  Raises
  ------
  ValueError
    If ``method`` is not one of the supported names. Previously the function
    returned ``None`` in that case, which only failed later when the caller
    unpacked the result.
  """
  if method=='kmeans_scikit':
    km = KMeans(n_clusters=numb_cluster,max_iter=100,random_state=12)
    km.fit(feature_matrix)
    return km, km.labels_

  if method=='kmeans_nltk':
    # nltk.cluster.util.cosine_distance can be swapped in for cosine-based clustering.
    km = KMeansClusterer(numb_cluster, distance=euclidean_distance, repeats=25,avoid_empty_clusters=True)
    clusters = km.cluster(feature_matrix, assign_clusters=True)
    return km, clusters

  if method =='minibatchkmeans':
    mb = MiniBatchKMeans(n_clusters=numb_cluster,random_state=12)
    mb.fit(feature_matrix)
    return mb, mb.labels_

  raise ValueError(
    "Unsupported clustering method %r; expected 'kmeans_scikit', "
    "'kmeans_nltk' or 'minibatchkmeans'" % (method,))

    





Metrics

In [0]:
def get_metrics(true_labels, predicted_labels):
  """Compute, print and return accuracy, precision, recall and F1.

  Precision, recall and F1 use ``average='binary'``; every score is rounded
  to 4 decimals.

  Parameters
  ----------
  true_labels, predicted_labels
    Ground-truth and predicted labels, passed to sklearn.metrics.

  Returns
  -------
  tuple
    (accuracy, precision, recall, f1).
  """
  acc = np.round(metrics.accuracy_score(true_labels,predicted_labels),4)
  binary_scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
  pre, rec, f1s = (
    np.round(scorer(true_labels,predicted_labels,average='binary'),4)
    for scorer in binary_scorers
  )

  print ('Accuracy:', acc, ' Precision:', pre, ' Recall:', rec, ' F1 Score:', f1s )
  return acc, pre, rec, f1s

def metrics_dataframe(cols):
  """Score several prediction columns against the reference labels.

  Reads two notebook-level globals: ``binarized_risk_label`` (true labels)
  and ``df`` (frame holding one prediction column per name in ``cols``).
  Precision, recall and F1 use ``average='binary'``; every score is rounded
  to 2 decimals.

  Parameters
  ----------
  cols : iterable
    Column names of ``df`` to evaluate.

  Returns
  -------
  tuple of lists
    (Type, Acc, Precision, Recall, f1) - parallel lists with one entry per
    column, where Type holds the column names.
  """
  Type, Acc, Precision, Recall, f1 = [], [], [], [], []

  for ele in cols:
    true_labels = binarized_risk_label
    predicted_labels = df[ele]
    Type.append(ele)
    Acc.append(np.round(metrics.accuracy_score(true_labels,predicted_labels),2))
    Precision.append(np.round(metrics.precision_score(true_labels,predicted_labels,average='binary'),2))
    Recall.append(np.round(metrics.recall_score(true_labels,predicted_labels,average='binary'),2))
    f1.append(np.round(metrics.f1_score(true_labels,predicted_labels,average='binary'),2))

  return Type,Acc,Precision,Recall,f1



In [0]:
def custom_plot(feature_matrix, cluster_name):
  """Scatter-plot 2-D features, coloured by risk label.

  Reads the notebook-level DataFrame ``df``: row i of ``df`` is matched with
  row i of ``feature_matrix``; the label comes from column 'LABEL' and the
  cluster assignment from column ``cluster_name``.

  Marker rules (unchanged from the original conditions):
  * blue '+' : 'Sem impacto negativo', 'Médio', 'Baixo', 'Falha Média' in any
               cluster, and 'Falha Formal' only when the cluster is 0
  * red 'o'  : 'Alto' in any cluster, and 'Muito Alto' only when the cluster is 1
  * every other row is not drawn

  Parameters
  ----------
  feature_matrix
    2-D array indexed as ``feature_matrix[i, 0]`` / ``feature_matrix[i, 1]``.
  cluster_name : str
    Name of the ``df`` column holding the cluster ids.
  """
  # NOTE(review): in the original conditions "and" bound tighter than "or", so
  # the cluster check only ever applied to the last label of each chain. The
  # grouping below reproduces that behaviour explicitly. If the intent was
  # "(any low-risk label) and cluster == 0", the parentheses must be moved.
  low_risk_any_cluster = ('Sem impacto negativo', 'Médio', 'Baixo', 'Falha Média')

  fig = plt.figure()
  fig.set_size_inches(5,5)
  ax1 = fig.add_subplot(1,1,1)
  for i in range(0, len(feature_matrix)):
    row = df.iloc[i]
    label = row['LABEL']
    if label in low_risk_any_cluster or (label=='Falha Formal' and row[cluster_name]==0):
      ax1.plot(feature_matrix[i,0],feature_matrix[i,1],c='b',marker='+')
    elif label=='Alto' or (label=='Muito Alto' and row[cluster_name]==1):
      ax1.plot(feature_matrix[i,0],feature_matrix[i,1],c='r',marker='o')
  # Previously plt.show() was at column 0, i.e. it ran when the cell was
  # defined rather than when the plot was drawn.
  plt.show()

In [0]:
def overSamplingRISKs(lcorpus, llabels):
  """Oversample the "HIGH" risk class until it matches all other rows in size.

  Rows labelled "HIGH" are resampled with replacement (random_state=27) to as
  many rows as there are non-"HIGH" rows, then appended after the non-"HIGH"
  rows. The resulting class counts are printed.

  Parameters
  ----------
  lcorpus : iterable
    Documents.
  llabels : iterable
    Risk label of each document, paired with ``lcorpus`` by position.

  Returns
  -------
  tuple (list, list)
    (documents, labels) of the rebalanced data set.
  """
  paired = pd.DataFrame(list(zip(lcorpus, llabels)),
                        columns =['ConstText', 'RISK'])

  # Split into the class to upsample and everything else.
  other_rows = paired[paired.RISK!="HIGH"]
  high_rows = paired[paired.RISK=="HIGH"]

  # Draw with replacement until "HIGH" has as many rows as the rest.
  high_upsampled = resample(high_rows,
                            replace=True,
                            n_samples=len(other_rows),
                            random_state=27)

  rebalanced = pd.concat([other_rows, high_upsampled])

  # Report the new class distribution.
  print( rebalanced.RISK.value_counts() )

  return rebalanced.ConstText.to_list(), rebalanced.RISK.to_list()