# 1.Importing libraries

In [1]:
import torch
import pandas as pd 
import random 
import time
import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np 
from torch.utils.data import TensorDataset,Subset
from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.nn import functional as F

# 2.Enabling Gpu 


In [2]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


# 3.Organising Train Data

In [3]:
# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




In [4]:
data = pd.read_csv('../input/datasetfinal/dataset_ideacao_final.csv')

In [5]:
class preprocess_nlp(object):
    
    def __init__(self, texts, stopwords=False , lemma=False, stem=True, wordcloud=False):#, numeric='tfidf'):
        
        self.texts = texts
        self.stopwords = stopwords
        self.lemma = lemma
        self.stem = stem
        self.wordcloud = wordcloud
        #self.numeric = numeric
        self.new_texts = None
        self.stopwords_list = list()
        
    def clean_text(self):

        new_texts = list()

        for text in self.texts:

            text = text.lower()
            text = re.sub('@[^\s]+', '', text)
            text = unidecode(text)
            text = re.sub('<[^<]+?>','', text)
            text = ''.join(c for c in text if not c.isdigit())
            text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', text)
            text = ''.join(c for c in text if c not in punctuation)
            new_texts.append(text)
        
        self.new_texts = new_texts

    def create_stopwords(self):
        
        stop_words = list(set(stopwords.words('portuguese') + list(STOP_WORDS)))
        
        for word in stop_words:

            self.stopwords_list.append(unidecode(word))
       
    
    def add_stopword(self, word):
        
        self.stopwords_list += [word]
        

    def remove_stopwords(self):

        new_texts = list()

        for text in self.new_texts:

            new_text = ''

            for word in word_tokenize(text):

                if word.lower() not in self.stopwords_list:

                    new_text += ' ' + word

            new_texts.append(new_text)

        self.new_texts = new_texts


    def extract_lemma(self):
        
        nlp = spacy.load("pt")
        new_texts = list()

        for text in self.texts:

            new_text = ''

            for word in nlp(text):

                new_text += ' ' + word.lemma_

            new_texts.append(new_text)
        
        self.new_texts = new_texts
    

    def extract_stem(self):

        stemmer = nltk.stem.SnowballStemmer('portuguese')
        new_texts = list()

        for text in self.texts:

            new_text = ''

            for word in word_tokenize(text):

                new_text += ' ' + stemmer.stem(word)

            new_texts.append(new_text)

        self.new_texts = new_texts
    

    def word_cloud(self):

        all_words = ' '.join([text for text in self.new_texts])
        word_cloud = WordCloud(width= 800, height= 500,
                               max_font_size = 110, background_color="white",
                               collocations = False).generate(all_words)
        plt.figure(figsize=(20,10))
        plt.imshow(word_cloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        wordcloud.to_file("wordcloud.png")
        

    def countvectorizer(self):

        vect = CountVectorizer()
        text_vect = vect.fit_transform(self.new_texts)

        return text_vect
    

    def tfidfvectorizer(self):

        vect = TfidfVectorizer(max_features=50)
        text_vect = vect.fit_transform(self.new_texts)

        return text_vect
    
    
    def preprocess(self):

        self.clean_text()
        
        if self.stopwords == True:
            self.create_stopwords()
            self.remove_stopwords()
            
        #if self.lemma == True:
            #self.extract_lemma()
        
        #if self.stem == True:
            #self.extract_stem() 
        
        #if self.wordcloud == True:
            #self.word_cloud()
        
        #if self.numeric == 'tfidf':
            #text_vect = self.tfidfvectorizer()
        #elif self.numeric == 'count':
            #text_vect = self.countvectorizer()
        #else:
            #print('metodo nao mapeado!')
            #exit()
            
        return self.new_texts

In [6]:
import re
from unidecode import unidecode
from string import punctuation
from spacy.lang.pt.stop_words import STOP_WORDS
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stoplist = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Pré processamento
prepro = preprocess_nlp(data['text'])
data['text'] = prepro.preprocess()

In [8]:
data

Unnamed: 0,text,target
0,aquela vontade de acabar com a minha vida voltou,1
1,to triste e com vontade de acabar com a minha ...,1
2,corinthians ta querendo acabar com minha vida ...,0
3,alguem poderia por favor me dar um tiro a acab...,1
4,taylor tu vai acabar com a minha vida mulher,0
...,...,...
3783,ai tu quer me matar kkkkk,0
3784,lembra que eu falei que hoje ia ser bom mais u...,1
3785,oi vou me matar em breve,1
3786,vou tocar quarta pra eu n me matar pq nur,0


In [9]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)

In [10]:
sentences = train.text.values
labels = train.target.values

In [11]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
  
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  se eu nao chegar do medico a tempo de ver a champions eu vou me matar
Token IDs: tensor([  101, 10126, 14444, 10132, 10133, 53319, 10149, 48091,   169, 12238,
        10104, 16719,   169, 27745, 14444, 12556, 10138, 10911, 42655,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [12]:
batch_size = 64 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
#helper function to get train and val data loaders for each fold 
def get_data_loaders(dataset,train_indexes,val_indexes):
    train_tensor = Subset(dataset,train_indexes)
    val_tensor = Subset(dataset,val_indexes)
    train_dataloader = DataLoader(
            train_tensor, 
            sampler = RandomSampler(train_tensor), 
            batch_size = batch_size
        )

    val_dataloader = DataLoader(
            val_tensor, 
            sampler = SequentialSampler(val_tensor), 
            batch_size = batch_size 
        )
    return train_dataloader,val_dataloader


In [13]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 4. Organising test data for predictions

In [14]:
#df = pd.read_csv("../input/nlp-getting-started/test.csv")
df = test
sentences = df.text.values
input_ids = []
attention_masks = []
for sent in sentences:

    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Set the batch size.  
batch_size = 16  
# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# 5. Training Loop

In [15]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
def get_bert_model():
    model = BertForSequenceClassification.from_pretrained(
      "bert-base-multilingual-cased", 
      num_labels = 2,           
      output_attentions = False, 
      output_hidden_states = False, 
    )
    # Tell pytorch to run this model on the GPU.
    model.cuda()
    return model

In [16]:
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [17]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [18]:
# Set the seed value all over the place to make this reproducible.
seed_val = 1000
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [19]:
total_folds = 5
current_fold = 0
epochs = 2
all_folds_preds = []
fold=StratifiedKFold(n_splits=total_folds, shuffle=True, random_state=1000)

training_stats = []

In [20]:
# Measure the total training time for the whole run.
total_t0 = time.time()
#for each fold..
for train_index, test_index in fold.split(train,train['target']):
    model = get_bert_model()
    optimizer = AdamW(model.parameters(),lr = 3e-5,eps = 1e-8)
    current_fold = current_fold+1
    train_dataloader,validation_dataloader = get_data_loaders(dataset,train_index,test_index)
    print("")
    print('================= Fold {:} / {:} ================='.format(current_fold,total_folds))
    # For each epoch...
    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0
        model.train()
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()        

            loss, logits = model(b_input_ids, 
                              token_type_ids=None, 
                              attention_mask=b_input_mask, 
                              labels=b_labels)


            total_train_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            #update weights
            optimizer.step()


        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)            

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables 
        total_f1_score = 0
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0
        total_precision_score = 0
        total_recall_score = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:


            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():        
                (loss, logits) = model(b_input_ids, 
                                        token_type_ids=None, 
                                        attention_mask=b_input_mask,
                                        labels=b_labels)

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)
            total_f1_score += f1_score(np.argmax(logits,axis=1),label_ids)
            total_precision_score += precision_score(np.argmax(logits,axis=1),label_ids)
            total_recall_score += recall_score(np.argmax(logits,axis=1),label_ids)

        # Report the final accuracy and f1_score for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        
        avg_precision_score = total_precision_score / len(validation_dataloader)
        print("  Precision_score: {0:.2f}".format(avg_precision_score))
        
        avg_recall_score = total_recall_score / len(validation_dataloader)
        print("  Recall_score: {0:.2f}".format(avg_recall_score))
        
        avg_f1_score = total_f1_score / len(validation_dataloader)
        print("  F1_score: {0:.2f}".format(avg_f1_score))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
          {
              'epoch': epoch_i + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Valid. Accur.': avg_val_accuracy,
              'Precision_score': avg_precision_score,
              'Recall_score': avg_recall_score,
              'f1_score' : avg_f1_score,
              'Training Time': training_time,
              'Validation Time': validation_time,
              'fold' : current_fold
              
          }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    # ========================================
    # Predicting and saving predictions for all folds
    # ========================================

    print("")
    print("now predicting for this fold")

    # Put model in evaluation mode
    model.eval()
    # Tracking variables 
    predictions  = []
    # Predict 
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask = batch
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, 
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()


        predictions.append(logits)

    stack = np.vstack(predictions)
    final_preds = F.softmax(torch.from_numpy(stack))[:,1].numpy()
    all_folds_preds.append(final_preds)
print("Completed")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




Training...

  Average training loss: 0.35
  Training epcoh took: 0:00:23

Running Validation...
  Accuracy: 0.88
  Precision_score: 0.89
  Recall_score: 0.74
  F1_score: 0.79
  Validation Loss: 0.35
  Validation took: 0:00:01

Training...

  Average training loss: 0.24
  Training epcoh took: 0:00:22

Running Validation...
  Accuracy: 0.91
  Precision_score: 0.79
  Recall_score: 0.88
  F1_score: 0.81
  Validation Loss: 0.28
  Validation took: 0:00:01

Training complete!
Total training took 0:01:28 (h:mm:ss)

now predicting for this fold






Training...

  Average training loss: 0.36
  Training epcoh took: 0:00:22

Running Validation...
  Accuracy: 0.91
  Precision_score: 0.81
  Recall_score: 0.89
  F1_score: 0.83
  Validation Loss: 0.25
  Validation took: 0:00:01

Training...

  Average training loss: 0.23
  Training epcoh took: 0:00:22

Running Validation...


  _warn_prf(average, modifier, msg_start, len(result))


  Accuracy: 0.89
  Precision_score: 0.66
  Recall_score: 0.96
  F1_score: 0.77
  Validation Loss: 0.31
  Validation took: 0:00:01

Training complete!
Total training took 0:02:22 (h:mm:ss)

now predicting for this fold


Training...

  Average training loss: 0.37
  Training epcoh took: 0:00:22

Running Validation...
  Accuracy: 0.88
  Precision_score: 0.88
  Recall_score: 0.74
  F1_score: 0.79
  Validation Loss: 0.33
  Validation took: 0:00:01

Training...

  Average training loss: 0.24
  Training epcoh took: 0:00:22

Running Validation...
  Accuracy: 0.92
  Precision_score: 0.84
  Recall_score: 0.85
  F1_score: 0.83
  Validation Loss: 0.27
  Validation took: 0:00:01

Training complete!
Total training took 0:03:16 (h:mm:ss)

now predicting for this fold


Training...

  Average training loss: 0.40
  Training epcoh took: 0:00:22

Running Validation...
  Accuracy: 0.89
  Precision_score: 0.91
  Recall_score: 0.77
  F1_score: 0.81
  Validation Loss: 0.35
  Validation took: 0:00:01

Trainin

In [21]:
pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('fold')
df_stats

Unnamed: 0_level_0,epoch,Training Loss,Valid. Loss,Valid. Accur.,Precision_score,Recall_score,f1_score,Training Time,Validation Time
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,0.35,0.35,0.88,0.89,0.74,0.79,0:00:23,0:00:01
1,2,0.24,0.28,0.91,0.79,0.88,0.81,0:00:22,0:00:01
2,1,0.36,0.25,0.91,0.81,0.89,0.83,0:00:22,0:00:01
2,2,0.23,0.31,0.89,0.66,0.96,0.77,0:00:22,0:00:01
3,1,0.37,0.33,0.88,0.88,0.74,0.79,0:00:22,0:00:01
3,2,0.24,0.27,0.92,0.84,0.85,0.83,0:00:22,0:00:01
4,1,0.4,0.35,0.89,0.91,0.77,0.81,0:00:22,0:00:01
4,2,0.25,0.37,0.9,0.87,0.79,0.82,0:00:22,0:00:01
5,1,0.37,0.27,0.9,0.86,0.82,0.83,0:00:22,0:00:01
5,2,0.25,0.26,0.9,0.73,0.93,0.8,0:00:23,0:00:01


# 6. Setting File Submission


In [22]:
to_submit =np.mean(all_folds_preds,0)

In [23]:
sub=pd.DataFrame()
sub['text'] = df['text']
sub['target'] = to_submit
sub['target'] = sub['target'].apply(lambda x: 1 if x>0.5 else 0)
sub.head()

Unnamed: 0,text,target
185,para de fingir q ta melhor sem mim besta e dei...,0
2787,quer me matar,0
1424,quero morrer,1
1932,ta querendo de tristeza me matar a essa hora,0
2817,so um video do maicon e do kotaka pra me alegr...,0


In [24]:
sub.to_csv('bert_base_12_2e-5_64.csv',index=False)