In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be addressed by path.
drive.mount('/content/drive')

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from torchsummary import summary
from torch.optim import Adam
from torch import cuda

import torch.nn as nn
import torch

import pandas as pd
import numpy as np

import joblib
import nltk

import re

import warnings
# NOTE(review): 'ignore' with no category/module filter silences every warning
# for the rest of the session, not only the noisy ones.
warnings.filterwarnings('ignore')

In [None]:
# --- Experiment configuration -------------------------------------------------
# Directory and file prefix of the training split.
train_dir = "path/to/data"
train_prefix = "train-prefix"

# Directory and file prefix shared by the dev and test splits.
eval_dir = "path/to/data"
eval_prefix = "eval-prefix"

# Directory that receives the result CSVs written further down.
results_dir = "path/to/results"

# Read the three feature tables; each file is named "<prefix>-<split>-features.csv".
train_dataset, dev_dataset, test_dataset = (
    pd.read_csv(f'{folder}/{prefix}-{split}-features.csv')
    for folder, prefix, split in (
        (train_dir, train_prefix, 'train'),
        (eval_dir, eval_prefix, 'dev'),
        (eval_dir, eval_prefix, 'test'),
    )
)

### Data Preparation

In [None]:
# Drop the label/text/POS/sequence-length columns; every other column
# (label_bool included, it is used below) is kept.
non_feature_columns = ['label', 'text', 'POS-tagged', 'seq_len']
train_feature_based = train_dataset.drop(columns=non_feature_columns)
train_feature_based.head()

In [None]:
# Correlation of each numeric column with label_bool, largest first.
# The first entry is skipped - presumably label_bool's own self-correlation; TODO confirm.
train_label_correlations = train_feature_based.corr(numeric_only=True)['label_bool']
print(train_label_correlations.sort_values(ascending=False)[1:])

In [None]:
# Same preparation for the dev split: drop the non-feature columns, then list
# the correlations with label_bool (largest first, first entry skipped).
dev_feature_based = dev_dataset.drop(columns=['label', 'text', 'POS-tagged', 'seq_len'])
dev_label_correlations = dev_feature_based.corr(numeric_only=True)['label_bool']
print(dev_label_correlations.sort_values(ascending=False)[1:])
dev_feature_based.head()

In [None]:
# Same preparation for the test split: drop the non-feature columns, then list
# the correlations with label_bool (largest first, first entry skipped).
test_feature_based = test_dataset.drop(columns=['label', 'text', 'POS-tagged', 'seq_len'])
test_label_correlations = test_feature_based.corr(numeric_only=True)['label_bool']
print(test_label_correlations.sort_values(ascending=False)[1:])
test_feature_based.head()

### Model Training

In [None]:
def train_and_evaluate(input, labels, model, random_state=42):
    """Hold out 20% of the data, fit ``model`` on the rest and score the hold-out.

    Parameters
    ----------
    input : feature container forwarded to ``train_test_split``.
    labels : targets aligned with ``input``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    random_state : seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)`` computed on
        the held-out 20%.
    """
    fit_features, held_out_features, fit_labels, held_out_labels = train_test_split(
        input, labels, test_size=0.2, random_state=random_state
    )
    model.fit(fit_features, fit_labels)
    predicted = model.predict(held_out_features)
    scorers = (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
    return tuple(scorer(held_out_labels, predicted) for scorer in scorers)

In [None]:
def train_and_evaluate_split(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on an explicit training split and score it on an explicit test split.

    Parameters
    ----------
    X_train, y_train : data passed to ``model.fit``.
    X_test, y_test : data the fitted model is scored on.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)`` for ``X_test``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
    return tuple(scorer(y_test, predicted) for scorer in scorers)

#### Feature-Based Classifier (Multinomial Naive Bayes or Logistic Regression)

In [None]:
# Classifier choice: "mb" selects Multinomial Naive Bayes, anything else Logistic Regression.
method = "lr"
if method == "mb":
    classifier = MultinomialNB()
else:
    classifier = LogisticRegression()

# Evaluation split: "dev" selects the dev features, anything else the test features.
evaluation = "test"
test = dev_feature_based if evaluation == "dev" else test_feature_based

# label_bool is the target; every remaining column is a feature.
X_train = train_feature_based.drop(columns=['label_bool'])
y_train = train_feature_based['label_bool']
X_test = test.drop(columns=['label_bool'])
y_test = test['label_bool']

metrics = train_and_evaluate_split(X_train, X_test, y_train, y_test, classifier)

In [None]:
# Print the five values returned by train_and_evaluate_split, in the order it returns them.
metric_captions = ('Accuracy: ', 'Precision: ', 'Recall: ', 'F1: ', 'Confusion Matrix: ')
for position, caption in enumerate(metric_captions):
    print(caption, metrics[position])

In [None]:
# Print up to 10 misclassified samples from a 20% hold-out of the training features.
# NOTE(review): this cell referenced `feature_based` / `dataset`, which are not
# defined anywhere in the notebook; the training frames are assumed to be the
# intended data - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature_based.drop(columns=['label_bool']),
    train_feature_based['label_bool'],
    test_size=0.2,
    random_state=42,
)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# np.where yields *positions* inside the shuffled hold-out, not row labels of
# the full frame, so translate them through y_test.index before the lookup.
wrong_positions = np.where(y_test.to_numpy() != y_pred)[0][:10]
missclassified = y_test.index[wrong_positions]
for i in missclassified:
    print('Text: ', train_dataset.loc[i, 'text'])
    print('Label: ', train_dataset.loc[i, 'label'])
    print('------------------------')

#### N-Gram Model

In [None]:
def _normalise_text(text):
    """Lower-case ``text``, remove parenthesised spans, then strip punctuation."""
    text = text.lower()
    text = re.sub(r'\([^)]*\)', '', text)
    return re.sub(r'[^\w\s]', '', text)


# NOTE(review): this cell copied `dataset`, which is not defined anywhere in the
# notebook; the training frame is assumed to be the intended source - confirm.
filtered_dataset = train_dataset.copy()
filtered_dataset['text'] = filtered_dataset['text'].apply(_normalise_text)
filtered_dataset.head()

In [None]:
def get_ngrams(train_df, eval_df, ngram_range=(1, 6), max_features=10000, show=False):
    """Vectorize the ``text`` column once per maximum n-gram length.

    For every ``n`` from 1 up to ``ngram_range[1]`` a ``CountVectorizer`` with
    ``ngram_range=(1, n)`` is fitted on ``train_df['text']`` and then applied to
    ``eval_df['text']``. Only the upper bound of ``ngram_range`` is read; the
    lower bound is always 1.

    Parameters
    ----------
    train_df, eval_df : containers indexable by ``'text'``.
    ngram_range : pair whose second element is the largest ``n`` to build.
    max_features : forwarded to ``CountVectorizer``.
    show : when true, print the shape of each training matrix.

    Returns
    -------
    list of (train_matrix, eval_matrix) tuples, one per ``n``, in increasing ``n``.
    """
    matrices_per_n = []
    largest_n = ngram_range[1]

    for upper in range(1, largest_n + 1):
        counter = CountVectorizer(ngram_range=(1, upper), max_features=max_features)
        # Vocabulary is learned from the training text only, then reused for eval.
        train_matrix = counter.fit_transform(train_df['text'])
        eval_matrix = counter.transform(eval_df['text'])
        if show:
            print(f"{upper}-grams train shape:", train_matrix.shape)
        matrices_per_n.append((train_matrix, eval_matrix))

    return matrices_per_n

def get_ngrams_results(ngrams, train_labels, eval_labels, show=False, max_features=10000, classifier=None):
    """Train and score ``classifier`` once per n-gram feature set.

    Parameters
    ----------
    ngrams : sequence of ``(train_matrix, eval_matrix)`` pairs, as returned by
        ``get_ngrams``; entry ``i`` is reported as ``max_ngram == i + 1``.
    train_labels, eval_labels : targets for the train / eval matrices.
    show, max_features : accepted for call compatibility; not used here.
    classifier : estimator to fit. A fresh ``MultinomialNB`` is created when
        omitted (rather than sharing one instance built at definition time).

    Returns
    -------
    pandas.DataFrame
        One row per n-gram setting with accuracy, precision, recall, f1, the
        four confusion-matrix cells and ``max_ngram``.
    """
    if classifier is None:
        classifier = MultinomialNB()

    rows = []
    for max_ngram, (train_matrix, eval_matrix) in enumerate(ngrams, start=1):
        accuracy, precision, recall, f1, matrix = train_and_evaluate_split(
            train_matrix, eval_matrix, train_labels, eval_labels, classifier
        )
        # sklearn's confusion_matrix puts true classes on rows and predictions on
        # columns, so for labels {0, 1} the layout is [[TN, FP], [FN, TP]].
        true_negative, false_positive = matrix[0][0], matrix[0][1]
        false_negative, true_positive = matrix[1][0], matrix[1][1]
        rows.append([accuracy, precision, recall, f1,
                     true_positive, false_positive, false_negative, true_negative, max_ngram])

    return pd.DataFrame(rows, columns=['accuracy', 'precision', 'recall', 'f1', 'true_positive', 'false_positive', 'false_negative', 'true_negative', 'max_ngram'])

In [None]:
# Run settings for the word n-gram experiment.
method = "mb"        # "mb" -> MultinomialNB, anything else -> LogisticRegression
evaluation = "test"  # "dev" -> dev split, anything else -> test split

if method == "mb":
    classifier = MultinomialNB()
else:
    classifier = LogisticRegression()

if evaluation == "dev":
    eval_dataset = dev_dataset
else:
    eval_dataset = test_dataset

In [None]:
# Count matrices for max n-gram sizes 1..6 (get_ngrams' default range): fitted on train, applied to eval.
ngrams = get_ngrams(train_df=train_dataset, eval_df=eval_dataset, show=True)

In [None]:
# Fit and score the selected classifier once per n-gram setting.
df = get_ngrams_results(
    ngrams,
    train_labels=train_dataset['label_bool'],
    eval_labels=eval_dataset['label_bool'],
    show=True,
    max_features=10000,
    classifier=classifier,
)
df.head(10)

In [None]:
# Persist the n-gram results; the file name encodes prefixes, eval split and classifier key.
ngram_results_csv = f'{results_dir}/{train_prefix}-{eval_prefix}-{evaluation}-ngram-{method}-results.csv'
df.to_csv(ngram_results_csv, index=False)

#### N-Gram Model with POS Tagging

In [None]:
# Load the serialized POS tagger (a Brill tagger, judging by the file name) from Google Drive.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load files from a trusted source. The absolute path also hard-codes one
# user's Drive layout; consider deriving it from a configurable directory.
pt_pos_tagger = joblib.load('/content/drive/MyDrive/PTvsBR/POS_tagger_brill.pkl')

def tag_sentence(sentence):
    """Lower-case ``sentence``, tokenize it as Portuguese and tag the tokens.

    Returns whatever ``pt_pos_tagger.tag`` produces for the token list.
    """
    tokens = nltk.word_tokenize(sentence.lower(), language='portuguese')
    return pt_pos_tagger.tag(tokens)

In [None]:
# Replace the raw text with the POS-tagged string, turning the "@@@" separators
# into spaces so the vectorizer sees whitespace-delimited items.
train_pos_tagged = train_dataset.copy()
train_pos_tagged['text'] = train_pos_tagged['POS-tagged'].apply(
    lambda tagged: tagged.replace("@@@", ' ')
)
train_pos_tagged.head()

In [None]:
# Dev split: text column becomes the POS-tagged string with "@@@" turned into spaces.
dev_pos_tagged = dev_dataset.copy()
dev_pos_tagged['text'] = dev_pos_tagged['POS-tagged'].apply(
    lambda tagged: tagged.replace("@@@", ' ')
)
dev_pos_tagged.head()

In [None]:
# Test split: text column becomes the POS-tagged string with "@@@" turned into spaces.
test_pos_tagged = test_dataset.copy()
test_pos_tagged['text'] = test_pos_tagged['POS-tagged'].apply(
    lambda tagged: tagged.replace("@@@", ' ')
)
test_pos_tagged.head()

In [None]:
# Run settings for the POS-tagged n-gram experiment.
method = "mb"        # "mb" -> MultinomialNB, anything else -> LogisticRegression
evaluation = "test"  # "dev" -> dev split, anything else -> test split

if method == "mb":
    classifier = MultinomialNB()
else:
    classifier = LogisticRegression()

if evaluation == "dev":
    eval_pos_tagged = dev_pos_tagged
else:
    eval_pos_tagged = test_pos_tagged

In [None]:
# Count matrices over the POS-tagged text: fitted on train, applied to the chosen eval split.
pos_tagged_ngrams = get_ngrams(train_df=train_pos_tagged, eval_df=eval_pos_tagged, show=True)

In [None]:
# Fit and score the selected classifier once per POS n-gram setting.
df = get_ngrams_results(
    pos_tagged_ngrams,
    train_labels=train_pos_tagged['label_bool'],
    eval_labels=eval_pos_tagged['label_bool'],
    show=True,
    max_features=10000,
    classifier=classifier,
)
df.head(10)

In [None]:
# Persist the POS n-gram results; the file name encodes prefixes, eval split and classifier key.
pos_ngram_results_csv = f'{results_dir}/{train_prefix}-{eval_prefix}-{evaluation}-pos-ngram-{method}-results.csv'
df.to_csv(pos_ngram_results_csv, index=False)


#### Adaptive Version

In [None]:
# Maximum number of select-and-fit rounds performed on each chunk.
ADAPTIVE_TRAIN_ITERATIONS = 4

def adaptive_train(X_train, y_train, model, size, show=False):
  """Incrementally train ``model`` chunk by chunk, feeding it its most confident samples first.

  The first ``size`` rows are fitted directly via ``partial_fit`` with
  ``classes=[0, 1]``. Every following full chunk of ``size`` rows goes through
  up to ``ADAPTIVE_TRAIN_ITERATIONS`` rounds; in each round the ``size // 10``
  rows with the largest ``|p0 - p1]`` gap... are removed from the chunk and
  passed to ``partial_fit`` together with their labels from ``y_train``.

  Rows past the last full multiple of ``size`` are never visited, because the
  chunk loop runs over ``range(1, len(y_train) // size)``.

  Parameters
  ----------
  X_train : sliceable matrix whose slices expose ``toarray()``.
  y_train : sliceable labels aligned with ``X_train``.
  model : estimator exposing ``partial_fit`` and ``predict_proba``.
  size : chunk length.
  show : when true, print progress messages.

  Returns
  -------
  The same ``model`` object, after the incremental updates.
  """
  model.partial_fit(X_train[:size], y_train[:size], classes=[0, 1])

  picks_per_round = size // 10
  confidence_gap = lambda probabilities: abs(probabilities[0] - probabilities[1])
  chunk_count = len(y_train) // size

  for chunk in range(1, chunk_count):
    start, stop = chunk * size, (chunk + 1) * size
    if show:
      print(f"Training subset {chunk} ({start}:{stop}/{len(y_train)})...")
    pending_X = list(X_train[start:stop].toarray())
    pending_y = list(y_train[start:stop])

    for _ in range(ADAPTIVE_TRAIN_ITERATIONS):
      if not pending_y:
        if show:
          print("All elements processed")
        break
      chosen = top_indexes(model.predict_proba(pending_X), confidence_gap, picks_per_round)
      if not chosen:
        if show:
          print("Not enough confidence.")
        break
      # Pop from the highest position down so the remaining positions stay valid.
      X_removed, y_removed = [], []
      for position in reversed(chosen):
        X_removed.append(pending_X.pop(position))
        y_removed.append(pending_y.pop(position))
      model.partial_fit(X_removed, y_removed)
  return model


def top_indexes(subset, criteria, n):
    """Return, in ascending order, the positions of the ``n`` best-scoring items.

    Items are ranked by ``criteria(item)`` from highest to lowest; the sort is
    stable, so earlier items win ties. Fewer than ``n`` positions are returned
    when ``subset`` is shorter than ``n``.
    """
    ranked = sorted(enumerate(subset), key=lambda pair: criteria(pair[1]), reverse=True)
    best_positions = [position for position, _ in ranked[:n]]
    best_positions.sort()
    return best_positions


def evaluate(X_test, y_test, model):
  """Score an already-fitted ``model`` on ``X_test`` / ``y_test``.

  Returns ``(accuracy, precision, recall, f1, confusion_matrix)``.
  """
  predicted = model.predict(X_test)
  scorers = (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
  return tuple(scorer(y_test, predicted) for scorer in scorers)

def get_model(ngrams, y, size=512, model_key="mb", show=False):
  """Adaptively train one fresh estimator per n-gram feature set.

  Parameters
  ----------
  ngrams : sequence of ``(train_matrix, eval_matrix)`` pairs; only the train
      matrix of each pair is used here.
  y : training labels, forwarded to ``adaptive_train``.
  size : chunk length forwarded to ``adaptive_train``.
  model_key : ``"mb"`` builds a ``MultinomialNB``; any other value builds a
      ``LogisticRegression``.
      NOTE(review): ``adaptive_train`` calls ``partial_fit``, which scikit-learn's
      ``LogisticRegression`` does not provide, so keys other than ``"mb"`` are
      expected to fail - confirm and pick an incremental estimator if needed.
  show : when true, print progress messages.

  Returns
  -------
  list of trained estimators, in the order of ``ngrams``.
  """
  def fresh_estimator():
    return MultinomialNB() if model_key == "mb" else LogisticRegression()

  models = []
  for max_ngram, ngram_pair in enumerate(ngrams, start=1):
    if show:
      print(f"Training ngrams with max size {max_ngram}")
    models.append(adaptive_train(ngram_pair[0], y, fresh_estimator(), size, show=show))

  return models


def adaptive_results(ngrams, y, models):
  """Score each adaptively trained model on the eval matrix of its n-gram setting.

  Parameters
  ----------
  ngrams : sequence of ``(train_matrix, eval_matrix)`` pairs; only the eval
      matrix of each pair is used here.
  y : evaluation labels.
  models : fitted estimators, aligned with ``ngrams``.

  Returns
  -------
  pandas.DataFrame
      One row per n-gram setting with accuracy, precision, recall, f1, the
      four confusion-matrix cells and ``max_ngram`` (position + 1).
  """
  rows = []
  for position, ngram_pair in enumerate(ngrams):
    accuracy, precision, recall, f1, matrix = evaluate(ngram_pair[1], y, models[position])
    # sklearn's confusion_matrix puts true classes on rows and predictions on
    # columns, so for labels {0, 1} the layout is [[TN, FP], [FN, TP]].
    true_negative, false_positive = matrix[0][0], matrix[0][1]
    false_negative, true_positive = matrix[1][0], matrix[1][1]
    rows.append([accuracy, precision, recall, f1,
                 true_positive, false_positive, false_negative, true_negative, position + 1])

  return pd.DataFrame(rows, columns=['accuracy', 'precision', 'recall', 'f1', 'true_positive', 'false_positive', 'false_negative', 'true_negative', 'max_ngram'])

In [None]:
from sklearn.utils import shuffle

# Adaptive experiment on raw text: choose the eval split, shuffle the training
# rows with a fixed seed and rebuild the n-gram matrices on the shuffled frame.
evaluation = "dev"
if evaluation == "dev":
  eval_dataset = dev_dataset
else:
  eval_dataset = test_dataset

# Fresh 0..n-1 index so positional chunking in adaptive_train lines up with the labels.
train_shuffled = shuffle(train_dataset.copy(), random_state=42).reset_index(drop=True)
ngrams = get_ngrams(train_shuffled, eval_dataset, show=True)

In [None]:
# Sweep the adaptive chunk size, halving it each round (at most 10 rounds,
# starting from 16384), and write one result CSV per size.
method = "mb"
size = 1024 * 16

saved_models = []

for i in range(10):
  # Pass `method` through so the trained estimator matches the key written
  # into the result file name (get_model would otherwise always default to "mb").
  models = get_model(ngrams, train_shuffled['label_bool'], size=size, model_key=method, show=False)
  saved_models.append(models)
  df = adaptive_results(ngrams, eval_dataset['label_bool'], models)
  df.to_csv(f'{results_dir}/{train_prefix}-{eval_prefix}-{evaluation}-adaptive-{size}-splits-ngram-{method}-results.csv', index=False)
  # Safety stop for smaller starting sizes; sizes of 16 or below are the last ones run.
  if size <= 16:
    break
  size = size // 2

#### Adaptive w/ POS Tagging

In [None]:
from sklearn.utils import shuffle

# Adaptive experiment on POS-tagged text: choose the eval split, shuffle the
# training rows with a fixed seed and rebuild the n-gram matrices.
evaluation = "dev"
if evaluation == "dev":
  eval_pos_tagged = dev_pos_tagged
else:
  eval_pos_tagged = test_pos_tagged

# Fresh 0..n-1 index so positional chunking in adaptive_train lines up with the labels.
train_shuffled = shuffle(train_pos_tagged.copy(), random_state=42).reset_index(drop=True)
pos_ngrams = get_ngrams(train_shuffled, eval_pos_tagged, show=True)

In [None]:
# Sweep the adaptive chunk size on the POS-tagged n-grams, halving it each round
# (at most 10 rounds, starting from 16384), and write one result CSV per size.
# Appends to `saved_models`, which the raw-text sweep cell creates.
method = "mb"
size = 1024 * 16

for i in range(10):
  # Pass `method` through so the trained estimator matches the key written
  # into the result file name (get_model would otherwise always default to "mb").
  models = get_model(pos_ngrams, train_shuffled['label_bool'], size=size, model_key=method, show=False)
  saved_models.append(models)
  df = adaptive_results(pos_ngrams, eval_pos_tagged['label_bool'], models)
  df.to_csv(f'{results_dir}/{train_prefix}-{eval_prefix}-{evaluation}-pos-adaptive-{size}-splits-ngram-{method}-results.csv', index=False)
  # Safety stop for smaller starting sizes; sizes of 16 or below are the last ones run.
  if size <= 16:
    break
  size = size // 2

### N-Grams with Neural Networks

In [None]:
class Net(nn.Module):
    """Feed-forward classifier: two Linear+BatchNorm1d+ReLU blocks, then Linear+Sigmoid.

    Parameters
    ----------
    input_size : width of the input feature vector.
    hidden_size : width of both hidden blocks.
    num_classes : width of the output layer; outputs pass through a Sigmoid,
        so every value lies in [0, 1].
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = self._hidden_block(input_size, hidden_size)
        self.l2 = self._hidden_block(hidden_size, hidden_size)
        self.l3 = nn.Sequential(nn.Linear(hidden_size, num_classes), nn.Sigmoid())

    @staticmethod
    def _hidden_block(in_features, out_features):
        """Build one Linear -> BatchNorm1d -> ReLU stage."""
        return nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
        )

    def forward(self, x):
        """Run ``x`` through l1, l2 and l3 in order and return the sigmoid output."""
        return self.l3(self.l2(self.l1(x)))

In [None]:
# Print a layer-by-layer summary of a sample network.
# torchsummary's `summary` defaults to device="cuda" and builds its probe input
# on the GPU whenever CUDA is available, so a model left on the CPU fails on a
# GPU runtime. Put the model and the probe on the same device explicitly.
summary_device = 'cuda' if cuda.is_available() else 'cpu'
model = Net(1000, 100, 1).to(summary_device)
summary(model, (1000,), device=summary_device)

In [None]:
# Hyper-parameters shared by every n-gram neural network trained below.
learning_rate, batch_size, epochs = 1e-3, 256, 5
# Binary cross-entropy on probabilities; Net ends in a Sigmoid.
criterion = nn.BCELoss()
# Train on the GPU when one is visible, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, device='cpu'):
    """Train ``model`` for ``epochs`` epochs and track the mean loss per epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. Targets are reshaped to a column and
    cast to float32 before being handed to ``criterion``; inputs are cast to
    float. The model itself is not moved - only the batches are sent to
    ``device``.

    Parameters
    ----------
    model : torch module to train (modified in place).
    criterion : loss callable taking ``(prediction, target)``.
    optimizer : optimizer bound to ``model``'s parameters.
    train_loader, val_loader : iterables of ``(X, y)`` batches.
    epochs : number of epochs to run.
    device : device the batches are moved to.

    Returns
    -------
    tuple
        ``(model, train_loss_history, val_loss_history)``; each history holds
        one batch-averaged loss per epoch.

    Raises
    ------
    ZeroDivisionError
        If either loader yields no batches.
    """
    def batch_loss(features, targets):
        # Shared forward pass for the training and validation loops.
        features = features.to(device)
        targets = targets.to(device).unsqueeze(1).to(torch.float32)
        return criterion(model(features.float()), targets)

    train_loss_history, val_loss_history = [], []

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        step_losses = []
        for X, y in train_loader:
            optimizer.zero_grad()
            loss = batch_loss(X, y)
            loss.backward()
            optimizer.step()
            step_losses.append(loss.item())
        train_loss_history.append(sum(step_losses) / len(step_losses))

        # ---- validation pass (no gradients, eval-mode layers) ----
        model.eval()
        step_losses = []
        with torch.no_grad():
            for X, y in val_loader:
                step_losses.append(batch_loss(X, y).item())
        val_loss_history.append(sum(step_losses) / len(step_losses))

        print('Epoch: {} - Train Loss: {:.6f} - Val Loss: {:.6f}'.format(epoch+1, train_loss_history[-1], val_loss_history[-1]))

    return model, train_loss_history, val_loss_history

In [None]:
# Build one (train, validation) dataset/loader pair per maximum n-gram size.
# get_ngrams requires both a training and an evaluation frame and returns
# (train_matrix, eval_matrix) tuples, so the matrices are used directly.
# NOTE(review): this cell referenced an undefined `dataset` and re-split it
# 80/20; the dev split is used as validation data here - confirm that choice.
untagged_ngrams = get_ngrams(train_dataset, dev_dataset, show=True, max_features=10000)

train_targets = torch.from_numpy(train_dataset['label_bool'].values).long()
val_targets = torch.from_numpy(dev_dataset['label_bool'].values).long()

train_datasets = []
val_datasets = []
for train_matrix, val_matrix in untagged_ngrams:
    # The sparse count matrices are densified because TensorDataset needs dense tensors.
    train_datasets.append(TensorDataset(torch.from_numpy(train_matrix.toarray()).float(), train_targets))
    val_datasets.append(TensorDataset(torch.from_numpy(val_matrix.toarray()).float(), val_targets))

# Shuffle only the training batches; validation order stays fixed.
train_loaders = [DataLoader(tensors, batch_size=batch_size, shuffle=True) for tensors in train_datasets]
val_loaders = [DataLoader(tensors, batch_size=batch_size, shuffle=False) for tensors in val_datasets]

print(train_loaders[0].dataset.tensors[0].shape)
print(val_loaders[0].dataset.tensors[0].shape)

In [None]:
# Train one network per n-gram setting and keep its model and loss curves.
models, train_losses, val_losses = [], [], []
separator = "-"*100

for position, train_loader in enumerate(train_loaders):
    print(f'Ngram {position+1}:')
    # Input width is the feature count of this n-gram setting; 10 hidden units, 1 output.
    feature_count = train_loader.dataset.tensors[0].shape[1]
    model = Net(feature_count, 10, 1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    _, train_loss_history, val_loss_history = train_model(
        model, criterion, optimizer, train_loader, val_loaders[position],
        epochs=epochs, device=device,
    )
    models.append(model)
    train_losses.append(train_loss_history)
    val_losses.append(val_loss_history)
    print()
    print(separator)
    print()

In [None]:
import matplotlib.pyplot as plt

# One figure per n-gram setting: training and validation loss per epoch.
for ngram_size, (train_curve, val_curve) in enumerate(zip(train_losses, val_losses), start=1):
    plt.plot(train_curve, label=f'ngram {ngram_size}')
    plt.plot(val_curve, label=f'ngram {ngram_size} val')
    plt.legend()
    plt.show()

In [None]:
# Run a model over a validation loader and compute accuracy, precision, recall and F1.

def get_metrics(model, val_loader, device='cpu'):
    """Score ``model`` on every batch of ``val_loader``.

    The model is put in eval mode and run without gradients. Its outputs are
    squeezed on dimension 1 and thresholded at 0.5 (strictly greater -> 1) to
    obtain class predictions; targets are cast to float32.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` over all batches combined.
    """
    model.eval()
    score_chunks, target_chunks = [], []
    with torch.no_grad():
        for X, y in val_loader:
            scores = model(X.to(device).float())
            score_chunks.append(scores.squeeze(1).cpu().numpy())
            target_chunks.append(y.to(device).to(torch.float32).cpu().numpy())

    y_true = np.concatenate(target_chunks)
    y_pred = np.where(np.concatenate(score_chunks) > 0.5, 1, 0)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [None]:
# Score every trained network on its own validation loader and tabulate the results.
metrics = [
    [*get_metrics(models[position], loader, device=device), position + 1]
    for position, loader in enumerate(val_loaders)
]
metrics_df = pd.DataFrame(metrics, columns=['accuracy', 'precision', 'recall', 'f1', 'ngram'])
metrics_df.head(10)

In [None]:
# Persist the neural-network metrics.
# NOTE(review): the file name says "pos", but the networks are trained on
# `untagged_ngrams` - confirm the intended name before comparing result files.
nn_results_csv = f'{results_dir}/pos-ngram-nn-results.csv'
metrics_df.to_csv(nn_results_csv, index=False)

## Error Analysis

In [None]:
# Error analysis: unigram+bigram Naive Bayes on the POS-tagged training text,
# scored on a 20% hold-out, printing the raw text of every wrong prediction.
# NOTE(review): this cell referenced `pos_tagged` / `dataset`, which are not
# defined anywhere in the notebook; the training frames are assumed - confirm.
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
bigrams = bigram_vectorizer.fit_transform(train_pos_tagged['text'])
classifier = MultinomialNB()

X_train, X_test, y_train, y_test = train_test_split(bigrams, train_pos_tagged['label_bool'], test_size=0.2, random_state=42)
# Same row count, test_size and seed as the split above, so aux_X_test holds the
# untagged text of exactly the rows in X_test, in the same order.
_, aux_X_test, _, _ = train_test_split(train_dataset['text'], train_dataset['label_bool'], test_size=0.2, random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)


def label_to_text(label):
    """Map the boolean label to its variety tag: 1 -> 'PT', anything else -> 'BR'."""
    return 'PT' if label == 1 else 'BR'


# Positions (within the hold-out) where prediction and truth disagree.
wrong = np.where(y_pred != y_test.to_numpy())[0]
missclassified = []
for i in wrong:
    print(f"Predicted: {label_to_text(y_pred[i])} - True: {label_to_text(y_test.iloc[i])}")
    print(aux_X_test.iloc[i])
    print()
    missclassified.append([aux_X_test.iloc[i], y_pred[i], y_test.iloc[i]])

missclassified_df = pd.DataFrame(missclassified, columns=['text', 'predicted', 'true'])
missclassified_df.head(10)