<a href="https://colab.research.google.com/github/eduartheinen/foursquare-tips/blob/master/foursquare_tips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TODO:
1.   **GridSearch over features and params**
2.   **BERT**
3.   **Bi-LSTM**
4.   **Visualization of model/config results**
5.   **ROC**
6.   **LIME**
7.   **Explain each model**

In [1]:
# Environment setup: upgrade spaCy, the packaging tools and XGBoost in place.
# NOTE(review): no versions are pinned, so a fresh run may resolve newer releases
# than the ones recorded in the saved output of this cell.
!pip3 install -U spacy setuptools wheel xgboost # transformers
# Download the small Portuguese spaCy pipeline that is loaded later with spacy.load().
!python -m spacy download pt_core_news_sm # comment this line after first run

Requirement already up-to-date: spacy in /usr/local/lib/python3.7/dist-packages (3.0.3)
Requirement already up-to-date: setuptools in /usr/local/lib/python3.7/dist-packages (53.1.0)
Requirement already up-to-date: wheel in /usr/local/lib/python3.7/dist-packages (0.36.2)
Requirement already up-to-date: xgboost in /usr/local/lib/python3.7/dist-packages (1.3.3)
2021-02-25 19:04:20.000173: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [2]:
# uncomment only if using BERT
# !wget https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/bert-base-portuguese-cased_pytorch_checkpoint.zip
# !wget https://neuralmind-ai.s3.us-east-2.amazonaws.com/nlp/bert-base-portuguese-cased/vocab.txt -P bert_checkpoint/
# !unzip bert-base-portuguese-cased_pytorch_checkpoint.zip -d bert_checkpoint/


In [3]:
import re
import string
import spacy
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold

# pytorch
# from torch.utils.data import Dataset, DataLoader


# Data Preprocessing

In [160]:
class FoursquareTipsDataset():
    """Text dataset that precomputes several feature views of the tips.

    The constructor cleans the ``texto`` column of ``df``, keeps the ``rotulo``
    column as labels and builds four feature matrices: bag-of-words counts,
    TF-IDF, and a TruncatedSVD (LSA) projection of each of them.
    ``feature_type()`` selects which view ``__getitem__`` returns.

    ``preprocess`` relies on a module-level spaCy pipeline named ``nlp``; it
    must be defined before the class is instantiated.
    """

    # Feature views that feature_type() accepts; 'bow' is the initial one.
    FEATURE_TYPES = ('bow', 'tfidf', 'svd_bow', 'svd_tfidf')

    def __init__(self, df, ngram_range=None):
        """Preprocess ``df.texto`` and fit every feature view.

        Args:
            df: DataFrame with a ``texto`` (raw text) and a ``rotulo`` (label) column.
            ngram_range: ``(min_n, max_n)`` forwarded to both vectorizers.
                ``None`` (the default) means unigrams only, i.e. ``(1, 1)``.
        """
        # The vectorizers expect a tuple, so translate the None default explicitly.
        if ngram_range is None:
            ngram_range = (1, 1)

        # extracting lemmas and POS tags with spacy even though we are not using them yet
        self.sentences, self.terms, self.lemmas, self.pos = self.preprocess(df.texto)
        self.labels = df.rotulo.reset_index(drop=True)
        self.feature_type_ = 'bow'

        # bag of words
        self.count_vectorizer = CountVectorizer(ngram_range=ngram_range)
        self.bow = self.count_vectorizer.fit_transform(self.sentences)

        # tfidf
        self.tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_df=0.99)
        self.tfidf = self.tfidf_vectorizer.fit_transform(self.sentences)

        # SVD/LSA
        print('fitting bow_lsa')
        self.svd_bow = self.fit_svd(self.bow)
        print('fitting tfidf_lsa')
        self.svd_tfidf = self.fit_svd(self.tfidf)

        # for easy indexing
        self.sentences = pd.DataFrame(self.sentences)
        self.lemmas = pd.DataFrame(self.lemmas)
        self.pos = pd.DataFrame(self.pos)

    def feature_type(self, feature_type):
        """Select the feature view returned by ``__getitem__``.

        Raises:
            ValueError: if ``feature_type`` is not one of ``FEATURE_TYPES``.
                Previously a typo silently fell back to bag-of-words.
        """
        if feature_type not in self.FEATURE_TYPES:
            raise ValueError(
                f'unknown feature type {feature_type!r}; expected one of {self.FEATURE_TYPES}')
        self.feature_type_ = feature_type

    @staticmethod
    def preprocess(reviews):
        """Clean and tokenize every review.

        Each text has URLs removed, punctuation replaced by spaces, is
        lower-cased, has digits removed and runs of spaces collapsed. It is then
        run through the module-level spaCy pipeline ``nlp`` and stop words are
        dropped.

        Args:
            reviews: iterable of raw strings.

        Returns:
            Tuple ``(sentences, terms, lemmas, pos)`` of parallel lists: the
            cleaned text re-joined with single spaces, the kept spaCy tokens,
            their lemmas and their POS tags.
        """
        # Built once: the translation table does not depend on the sentence.
        punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

        sentences, terms, lemmas, pos = [], [], [], []

        for raw_text in tqdm(reviews):
            text = re.sub(r'http\S+', '', raw_text)  # removes urls before punctuation
            text = text.translate(punctuation_to_space)  # change punctuations to spaces
            text = text.lower()
            text = re.sub(r'\d+', '', text)  # removes numbers
            text = re.sub(' +', ' ', text)  # removes double spaces

            # spacy processing -- nlp(text) -- adds properties to words,
            # like "lemma_", "pos_" and "is_stop" for stop_words.
            tokens = [w for w in nlp(text) if not w.is_stop]
            terms.append(tokens)
            lemmas.append([w.lemma_ for w in tokens])
            pos.append([w.pos_ for w in tokens])

            # sklearn count/tfidf vectorizers require raw text
            sentences.append(' '.join(w.text for w in tokens))

        return sentences, terms, lemmas, pos

    def fit_svd(self, data, n_components_grid=None, min_variance=0.98):
        """Project ``data`` with the smallest TruncatedSVD that explains enough variance.

        Candidates are tried in order and the first projection whose summed
        ``explained_variance_ratio_`` exceeds ``min_variance`` is returned. Each
        candidate is fitted once, so the returned projection is the one whose
        variance was reported.

        Args:
            data: feature matrix to reduce.
            n_components_grid: iterable of component counts to try; defaults to
                ``range(1400, 2000, 100)``.
            min_variance: explained-variance threshold that stops the search.

        Returns:
            The transformed matrix. If no candidate reaches the threshold, the
            projection of the last candidate is returned and a message is printed
            (previously ``None`` was returned silently).

        Raises:
            ValueError: if ``n_components_grid`` is empty.
        """
        if n_components_grid is None:
            n_components_grid = range(1400, 2000, 100)

        reduced = None
        explained = 0.0
        c = None
        for c in n_components_grid:
            svd = TruncatedSVD(n_components=c, n_iter=10)
            reduced = svd.fit_transform(data)
            explained = svd.explained_variance_ratio_.sum()
            if explained > min_variance:
                print(f'{c} components explained {explained:.4f} of feature variance.')
                return reduced

        if reduced is None:
            raise ValueError('n_components_grid must contain at least one value')

        print(f'no candidate reached {min_variance:.2f}; '
              f'keeping {c} components ({explained:.4f} of feature variance).')
        return reduced

    def __getitem__(self, i):
        """Return ``(features, labels)`` for index, slice or index array ``i``.

        Sparse views (bow, tfidf) are converted to dense arrays. Any other value
        of ``feature_type_`` falls through to bag-of-words.
        """
        labels = self.labels.iloc[i]

        if self.feature_type_ == 'svd_tfidf':
            return self.svd_tfidf[i], labels

        if self.feature_type_ == 'svd_bow':
            return self.svd_bow[i], labels

        if self.feature_type_ == 'tfidf':
            return self.tfidf[i].toarray(), labels

        return self.bow[i].toarray(), labels

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.sentences)

### Load and Process Dataset

In [161]:
# Portuguese spaCy pipeline; FoursquareTipsDataset.preprocess reads it as a global.
nlp = spacy.load('pt_core_news_sm')

# Base URL of the CSV files in the project repository.
path = 'https://raw.githubusercontent.com/eduartheinen/foursquare-tips/master/data/'

# Training split, with every row that has a missing value dropped.
df = pd.read_csv(f'{path}tips_scenario2_train.csv').dropna(how='any')

# Builds all feature views using unigrams and bigrams.
data = FoursquareTipsDataset(df, ngram_range=(1, 2))

100%|██████████| 1788/1788 [00:18<00:00, 97.59it/s]


fitting bow_lsa
1600 components explained 0.9912 of feature variance.
fitting tfidf_lsa
1700 components explained 0.9875 of feature variance.


### New Possibilities

In [7]:
# NILC-ICMC-USP repository of Portuguese word embeddings: http://www.nilc.icmc.usp.br/embeddings
# embeddings_path = "http://143.107.183.175:22980/download.php?file=embeddings/glove/glove_s100.zip"

# BERT pre-trained on Portuguese
# https://github.com/neuralmind-ai/portuguese-bert

# Feature Engineering

In [149]:
import plotly.express as px


# Bar chart with the number of samples labelled -1 ('negativo') and 1 ('positivo').
label_names = {-1: 'negativo', 1: 'positivo'}
tmp = pd.DataFrame({
    'class': list(label_names.values()),
    'samples': [len(data.labels[data.labels == label]) for label in label_names],
})
fig = px.bar(tmp, x='class', y='samples', color='class')
fig.show()

### *Class-Balanced Loss Based on Effective Number of Samples*

In [111]:
# Class-balanced weights: each class is weighted by (1 - beta) / (1 - beta ** n_c),
# where n_c is its sample count, then the weights are rescaled to sum to num_classes.
n_rows, n_cols = data.bow.shape
N = n_rows * n_cols  # beta is derived from the size of the bag-of-words matrix
beta = (N - 1) / N

# Only labels -1 and 1 are weighted; set num_classes = 3 and add label 0 to the
# tuple below to include it as well.
num_classes = 2
samples_per_class = [len(data.labels[data.labels == label]) for label in (-1, 1)]

effective_num = 1.0 - np.power(beta, samples_per_class)
weights = (1.0 - beta) / np.array(effective_num)
weights = weights / np.sum(weights) * num_classes

print(samples_per_class)
print(weights)

# Label 0 keeps a neutral weight of 1.
class_weights = {-1: weights[0], 0: 1, 1: weights[1]}
class_weights

[414, 1161]
[1.47428118 0.52571882]


{-1: 1.4742811750507787, 0: 1, 1: 0.5257188249492214}

In [159]:
# 10-fold cross-validation splitter shared by the model cells that call kf.split().
# NOTE(review): no shuffle/random_state is passed, so folds follow the row order of
# the dataset -- confirm the CSV is not sorted by label.
kf = KFold(n_splits=10)
# Last expression of the cell: shows the number of splits.
kf.get_n_splits(data.bow)

(1788, 19684)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import ComplementNB
# Cross-validated accuracy of Complement Naive Bayes on the two sparse feature views.
features = ['bow', 'tfidf']
scores = {name: [] for name in features}

for f in features:
  # Select the view once, then score it on every fold.
  data.feature_type(f)
  for train_index, test_index in kf.split(data.bow):
    x_train, y_train = data[train_index]
    x_test, y_test = data[test_index]

    cnb = ComplementNB().fit(x_train, y_train)
    scores[f].append(cnb.score(x_test, y_test))

cnb_scores = scores
nb_bow, nb_tfidf = (np.mean(scores[name]) for name in ('bow', 'tfidf'))
print(f'Naive Bayes: bow {nb_bow:.4f}, tfidf {nb_tfidf:.4f}')

# Logistic Regression

In [150]:
from sklearn.linear_model import LogisticRegression

# Hold-out accuracy of logistic regression on every feature view.
features = ['bow', 'svd_bow', 'tfidf', 'svd_tfidf']
scores = {name: [] for name in features}

# Positional split: the first 75% of the rows train the model and the remaining
# 25% test it. A 10-fold loop over kf and a class-weighted variant
# (C=1, class_weight=class_weights) are currently disabled.
split_at = int(len(data)*0.75)
train_index = test_index = split_at

for f in features:
  data.feature_type(f)
  x_train, y_train = data[:train_index]
  x_test, y_test = data[test_index:]

  lr_cv = LogisticRegression(C=1e3, n_jobs=-1, max_iter=500)
  lr_cv.fit(x_train, y_train)
  accuracy = lr_cv.score(x_test, y_test)
  scores[f].append(accuracy)

lr_scores = scores
scores

KeyboardInterrupt: ignored

# Support Vector Classification

In [158]:
from sklearn.svm import LinearSVC

# Cross-validated accuracy of a linear SVM on every feature view.
features = ['bow', 'svd_bow', 'tfidf', 'svd_tfidf']
scores = {name: [] for name in features}

# A single 75/25 hold-out split and a class-weighted variant
# (class_weight=class_weights) are currently disabled.
for train_index, test_index in kf.split(data.bow):
  for f in features:
    data.feature_type(f)
    x_train, y_train = data[train_index]
    x_test, y_test = data[test_index]

    scv = LinearSVC(C=1)
    scv.fit(x_train, y_train)
    fold_accuracy = scv.score(x_test, y_test)
    scores[f].append(fold_accuracy)

svc_scores = scores

# Mean accuracy over the folds, one line per feature view.
for f, fold_scores in scores.items():
  tmp = np.mean(fold_scores)
  print(f'{f}:{tmp}')

bow:0.7265049274998431
svd_bow:0.6839997489172054
tfidf:0.7131473228297031
svd_tfidf:0.710906408888331


# XGBoost

In [155]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder


# Hold-out accuracy of a default XGBClassifier on every feature view.
features = ['bow', 'svd_bow', 'tfidf', 'svd_tfidf']

scores = {f:[] for f in features}
# Placeholders for a class-weighted variant that is not run yet.
scores.update({f+'_weighted':[] for f in features})

# The raw labels include negative values (e.g. -1). Newer XGBoost releases only
# accept class labels encoded as 0..n_classes-1, and the install cell does not pin
# a version, so encode them up front. Accuracy is unaffected by the relabelling.
label_encoder = LabelEncoder().fit(data.labels)

# Positional split: first 75% of the rows for training, remaining 25% for testing.
train_index = test_index = int(len(data)*0.75)
# for train_index, test_index in kf.split(data.bow):
for f in features:
  data.feature_type(f)
  x_train, y_train = data[:train_index]
  x_test, y_test = data[test_index:]

  xgb = XGBClassifier()
  xgb.fit(x_train, label_encoder.transform(y_train))
  scores[f].append(xgb.score(x_test, label_encoder.transform(y_test)))

# Keep a named copy, like the other model cells do, and display the result.
xgb_scores = scores
scores







KeyboardInterrupt: ignored

# Bi-directional LSTM

In [164]:
import torch
import torch.nn as nn


class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM followed by a linear layer and log-softmax.

    Args:
        tag_to_ix: accepted for interface compatibility; not used by this class.
        input_dim: size of each input feature vector.
        hidden_dim: total hidden size; each direction uses ``hidden_dim // 2`` units.
        batch_size: default batch size used by ``init_hidden()``.
        num_classes: number of output labels.
    """

    def __init__(self, tag_to_ix, input_dim, hidden_dim, batch_size, num_classes):
        # Zero-argument super(): the previous call named a class (BiLSTM_CRF)
        # that does not exist in this notebook.
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.num_classes = num_classes

        # GloVe, but contains only vectors that refer to words on the dataset vocabulary
        # self.word_embeds = nn.Embedding.from_pretrained(embedding_vectors)
        self.lstm = nn.LSTM(input_dim, hidden_dim // 2, num_layers=1,
                            bidirectional=True, batch_first=True)

        # Maps the output of the LSTM into label space. The LSTM emits
        # 2 * (hidden_dim // 2) features, which equals hidden_dim when it is even.
        self.hidden2label = nn.Linear(2 * (hidden_dim // 2), self.num_classes)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None, device=None):
        """Create a Xavier-initialised ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: batch dimension of the state; defaults to ``self.batch_size``.
            device: where to allocate the state; defaults to the device of the
                module's parameters.

        Returns:
            Tuple of two tensors shaped ``(2, batch_size, hidden_dim // 2)``.
        """
        batch_size = self.batch_size if batch_size is None else batch_size
        if device is None:
            device = next(self.parameters()).device

        shape = (2, batch_size, self.hidden_dim // 2)
        return (torch.nn.init.xavier_uniform_(torch.zeros(shape, device=device)),
                torch.nn.init.xavier_uniform_(torch.zeros(shape, device=device)))

    def forward(self, sentence):
        """Return log-probabilities over the labels for every time step.

        Args:
            sentence: float tensor; assumed to be a padded batch shaped
                ``(batch, seq_len, input_dim)`` because the LSTM is batch-first.

        Returns:
            Tensor shaped ``(batch, seq_len, num_classes)``.
        """
        # Size the state from the actual batch so a short last batch works too.
        self.hidden = self.init_hidden(batch_size=sentence.size(0), device=sentence.device)
        lstm_out, _ = self.lstm(sentence, self.hidden)

        lstm_feats = self.hidden2label(lstm_out)
        # Normalise explicitly over the label dimension.
        labels = nn.functional.log_softmax(lstm_feats, dim=-1)

        return labels

In [None]:
def collate(batch):
    """Stack a list of samples into batch tensors.

    Each sample is indexed positionally: element 0 holds the inputs, element 1
    the targets and element 2 the original text.

    Returns:
        ``(inputs, target, text)`` where the first two are ``LongTensor`` objects
        built from the per-sample rows and ``text`` is a plain list.
    """
    input_rows, target_rows, text = [], [], []
    for sample in batch:
        input_rows.append(sample[0])
        target_rows.append(sample[1])
        text.append(sample[2])

    # Rows must share a length; packing of padded sequences is not applied here.
    inputs = torch.LongTensor(input_rows)
    target = torch.LongTensor(target_rows)
    return inputs, target, text


# DataLoader is used here but never imported: the only import of it in the
# notebook is commented out.
from torch.utils.data import DataLoader

# NOTE(review): train_dataset, valid_dataset and batch_size are not defined in any
# visible cell -- they must be created before this cell can run.
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate)

In [165]:
def training_epoch(model, optimizer, train_loader):  # criterion, scheduler
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: module exposing ``neg_log_likelihood(inputs, target)`` that returns
            a scalar loss tensor.
            NOTE(review): the BiLSTM class in this notebook does not define that
            method -- confirm which model is meant to be trained here.
        optimizer: optimizer stepping the model's parameters.
        train_loader: iterable yielding ``(inputs, target, text)`` batches.

    Returns:
        Mean batch loss as a float, or ``nan`` when the loader yields no batches.
    """
    model.train()

    # Run on the device the model lives on instead of relying on a global
    # `device` variable, which this notebook never defines.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    losses = []
    progress_bar = tqdm(train_loader, desc='Training', leave=False)
    for inputs, target, text in progress_bar:
        inputs = inputs.to(run_device)
        target = target.to(run_device)

        # Clean old gradients
        model.zero_grad()

        # Forwards pass
        loss = model.neg_log_likelihood(inputs, target)

        # Perform gradient descent, backwards pass
        loss.backward()

        # Take a step in the right direction
        optimizer.step()

        losses.append(loss.item())

    # An empty loader used to raise ZeroDivisionError.
    if not losses:
        return float('nan')
    return sum(losses) / len(losses)


def validate_epoch(model, valid_loader):  # criterion, scheduler
    """Compute the mean per-batch F-score of ``model`` on ``valid_loader``.

    Label 0 is treated as the negative class. For every batch:

    * gold positives are the positions where ``target != 0``;
    * predicted positives are the positions where the prediction is ``!= 0``;
    * true positives are gold positives whose prediction equals the target.

    F is ``2 * TP / (gold positives + predicted positives)``, which equals
    ``2 * p * r / (p + r)``.

    Two defects of the previous version are fixed. Precision was divided by the
    total number of positions instead of the predicted positives. Batches without
    any true positive were skipped instead of counting as 0, which inflated the mean.

    Args:
        model: callable returning ``(score, best_path)`` for a batch of inputs.
            NOTE(review): the BiLSTM class in this notebook returns a single
            tensor -- confirm which model is meant to be validated here.
        valid_loader: iterable yielding ``(inputs, target, text)`` batches.

    Returns:
        Mean F-score over the scored batches, or ``0.0`` if none could be scored.
        Batches with neither gold nor predicted positives are skipped because F
        is undefined there.
    """
    model.eval()

    # Run on the model's own device instead of relying on a global `device`.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    mean_F = []

    with torch.no_grad():
        progress_bar = tqdm(valid_loader, desc='Validating', leave=False)
        for inputs, target, text in progress_bar:
            inputs = inputs.to(run_device)
            target = target.to(run_device)

            # Forwards pass
            score, best_path = model(inputs)

            # Calculating the F-Score
            gold = target.reshape(-1)
            predicted = torch.as_tensor(best_path, device=gold.device).reshape(-1)

            gold_positive = gold != 0
            predicted_positive = predicted != 0
            true_positive = int((gold_positive & (predicted == gold)).sum())

            denominator = int(gold_positive.sum()) + int(predicted_positive.sum())
            if denominator == 0:
                continue
            mean_F.append(2 * true_positive / denominator)

    if not mean_F:
        return 0.0
    return float(np.mean(mean_F))
    

In [None]:
# os and optim are used below but are not imported by any earlier cell.
import os
from torch import optim

# NOTE(review): BiLSTM_CRF, train_data, embedding_dim, hidden_size, batch_size,
# learning_rate, weight_decay, device and plot_losses are not defined in any visible
# cell; the class defined in this notebook is BiLSTM and takes different arguments.
# This cell cannot run until those names exist.
model = BiLSTM_CRF(vocab_size=len(train_data.vocab.stoi),
                   tag_to_ix=train_data.entity_tags.stoi, embedding_vectors=train_data.vocab.vectors,
                   embedding_dim=embedding_dim, hidden_dim=hidden_size, batch_size=batch_size).to(device)

# Only parameters that require gradients are handed to the optimizer.
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                        lr=learning_rate, weight_decay=weight_decay)

# NOTE(review): `path` is assigned earlier in the notebook as the GitHub raw-data URL,
# so these joins do not yield writable local paths -- confirm the intended directory.
loss_file = os.path.join(path, 'losses.csv')
train_losses = []
best_Fs = []
best_F = 0
epoch = 0

# resume training
checkpoint_file = os.path.join(path, 'ner-bilstm-crf-feb20.pth')

# check if saved model exists
resume_training = False
if os.path.isfile(checkpoint_file) and resume_training:
    checkpoint = torch.load(checkpoint_file)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    epoch = checkpoint['epoch']
    train_losses = checkpoint['train_losses']
    best_Fs = checkpoint['best_Fs']
    # A checkpoint may carry an empty history, and np.max fails on an empty
    # sequence, so fall back to the initial best score.
    best_F = np.max(best_Fs) if len(best_Fs) else 0

    plot_losses(train_losses, 'training epoch #', 'neg_log_likelihood_loss', 'Training Loss')
    plot_losses(best_Fs, 'validation epoch #', 'F score', 'F score')

In [None]:
# training epochs
for epoch in range(max_epochs):
    train_loss = training_epoch(model, optimizer, train_loader)  # train_loader
    new_F = validate_epoch(model, valid_loader)

    if new_F > best_F:
        best_F = new_F

    if epoch % 10 == 0:
      torch.save({
          'epoch': epoch,
          'model_state_dict': model.state_dict(),
          'optimizer_state_dict': optimizer.state_dict(),
          'train_losses': train_losses,
          'best_Fs': best_Fs
      }, checkpoint_file)
        

    # with open(loss_file, 'a', newline='\n') as csvfile:
    #     writer = csv.writer(csvfile)
    #     writer.writerow([epoch + 1, train_loss, new_F])

    tqdm.write(
        f'epoch #{epoch + 1:3d}\ttrain_loss: {train_loss:.6f}\tcurrent_F: {new_F:.6f}\tbest_F: {best_F:.6f} \n',
    )

    train_losses.append(train_loss)
    best_Fs.append(new_F)

    epoch += 1

plot_losses(train_losses, 'training epoch #', 'neg_log_likelihood_loss', 'Training Loss')
plot_losses(best_Fs, 'validation epoch #', 'F score', 'F score')

In [None]:
# Write a final checkpoint with the model/optimizer state and the training history.
checkpoint_file = os.path.join(path, 'ner-bilstm-crf-feb18.pth')
model.eval()

final_state = dict(
    epoch=epoch,
    model_state_dict=model.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
    train_losses=train_losses,
    best_Fs=best_Fs,
)
torch.save(final_state, checkpoint_file)

In [None]:
from IPython.display import HTML
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Training loss (left axis) and validation F-score (right axis) per epoch.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# One entry per curve: the series, its legend name, its axis title and its axis.
curves = [
    {'series': train_losses, 'name': "nlll",
     'axis_title': "<b>neg-log-likelihood-loss</b>", 'secondary': False},
    {'series': best_Fs, 'name': "f-score",
     'axis_title': "<b>f-score</b>", 'secondary': True},
]

for curve in curves:
    epochs_axis = list(range(len(curve['series'])))
    fig.add_trace(
        go.Scatter(x=epochs_axis, y=curve['series'], name=curve['name']),
        secondary_y=curve['secondary'],
    )

fig.update_layout(title_text="Training Loss and Validation F-score")
fig.update_xaxes(title_text="train-validation epochs")

for curve in curves:
    fig.update_yaxes(title_text=curve['axis_title'], secondary_y=curve['secondary'])

fig.show()