In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from torchsummary import summary
from torch.optim import Adam
from torch import cuda

import torch.nn as nn
import torch

import pandas as pd
import numpy as np

import joblib
import nltk

import re

import warnings
warnings.filterwarnings('ignore')

In [10]:
train_dir = "/content/drive/MyDrive/PTvsBR/data/full-length"
train_prefix = "tedtalks-1k-full-length"

eval_dir = "/content/drive/MyDrive/PTvsBR/data/1k-4sent"
eval_prefix = "tedtalks-1k-4sent"

results_dir = "full-length-4-iterations"

train_dataset = pd.read_csv(f'{train_dir}/{train_prefix}-train-features.csv')
dev_dataset = pd.read_csv(f'{eval_dir}/{eval_prefix}-dev-features.csv')
test_dataset = pd.read_csv(f'{eval_dir}/{eval_prefix}-test-features.csv')

### Data Preparation

In [11]:
train_feature_based = train_dataset.drop(columns=['label','text','POS-tagged','seq_len'])
train_feature_based.head()

Unnamed: 0,label_bool,pt_pt_second_person_hints,pt_pt_second_person_hints_bool,pt_br_second_person_hints,pt_br_second_person_hints_bool,pt_pt_pronoun_position_hints,pt_pt_pronoun_position_hints_bool,pt_br_pronoun_position_hints,pt_br_pronoun_position_hints_bool,gerund_count,...,a_plus_infinitive_count,a_plus_infinitive_count_bool,count_acute_accent,count_circumflex_accent,count_article_before_possessive_pronoun,count_article_before_possessive_pronoun_bool,count_portuguese_words,count_brazilian_words,count_uncontracted_words,count_uncontracted_words_bool
0,0,8,1,6,1,0,0,13,1,13,...,3,1,65,11,0,0,0,1,1,1
1,0,39,1,36,1,3,1,62,1,12,...,7,1,248,64,1,1,1,1,9,1
2,0,1,1,24,1,4,1,30,1,11,...,0,0,90,20,6,1,0,1,2,1
3,0,16,1,12,1,3,1,34,1,16,...,4,1,160,21,3,1,0,1,8,1
4,0,38,1,18,1,1,1,73,1,24,...,6,1,204,45,1,1,1,3,14,1


In [12]:
print(train_feature_based.corr(numeric_only=True)['label_bool'].sort_values(ascending=False)[1:])

pt_pt_pronoun_position_hints                    0.641433
a_plus_infinitive_count                         0.586236
count_article_before_possessive_pronoun         0.523353
count_article_before_possessive_pronoun_bool    0.452572
count_portuguese_words                          0.357739
pt_pt_pronoun_position_hints_bool               0.352184
a_plus_infinitive_count_bool                    0.174723
pt_pt_second_person_hints                       0.098025
pt_pt_second_person_hints_bool                  0.036343
count_acute_accent                              0.020159
count_uncontracted_words_bool                  -0.043707
count_uncontracted_words                       -0.105651
pt_br_second_person_hints_bool                 -0.174049
gerund_count_bool                              -0.228643
count_brazilian_words                          -0.268588
count_circumflex_accent                        -0.400293
pt_br_pronoun_position_hints                   -0.423105
pt_br_second_person_hints      

In [None]:
dev_feature_based = dev_dataset.drop(columns=['label','text','POS-tagged','seq_len'])
print(dev_feature_based.corr(numeric_only=True)['label_bool'].sort_values(ascending=False)[1:])
dev_feature_based.head()

pt_pt_pronoun_position_hints_bool               0.345190
pt_pt_pronoun_position_hints                    0.326815
a_plus_infinitive_count                         0.248535
a_plus_infinitive_count_bool                    0.247474
count_article_before_possessive_pronoun_bool    0.200895
count_article_before_possessive_pronoun         0.194943
count_portuguese_words                          0.115562
pt_pt_second_person_hints_bool                  0.071942
pt_pt_second_person_hints                       0.069528
count_acute_accent                              0.053721
count_uncontracted_words                       -0.048396
count_uncontracted_words_bool                  -0.054647
count_brazilian_words                          -0.077166
count_circumflex_accent                        -0.203455
pt_br_pronoun_position_hints_bool              -0.208446
pt_br_second_person_hints_bool                 -0.245130
pt_br_pronoun_position_hints                   -0.251705
pt_br_second_person_hints      

Unnamed: 0,label_bool,pt_pt_second_person_hints,pt_pt_second_person_hints_bool,pt_br_second_person_hints,pt_br_second_person_hints_bool,pt_pt_pronoun_position_hints,pt_pt_pronoun_position_hints_bool,pt_br_pronoun_position_hints,pt_br_pronoun_position_hints_bool,gerund_count,...,a_plus_infinitive_count,a_plus_infinitive_count_bool,count_acute_accent,count_circumflex_accent,count_article_before_possessive_pronoun,count_article_before_possessive_pronoun_bool,count_portuguese_words,count_brazilian_words,count_uncontracted_words,count_uncontracted_words_bool
0,0,0,0,1,1,0,0,1,1,0,...,0,0,5,1,0,0,0,0,0,0
1,0,0,0,1,1,0,0,3,1,1,...,0,0,6,2,0,0,0,0,0,0
2,0,1,1,0,0,0,0,1,1,2,...,1,1,9,0,0,0,0,0,1,1
3,0,1,1,0,0,0,0,1,1,0,...,0,0,6,0,0,0,0,0,1,1
4,0,2,1,0,0,0,0,0,0,0,...,0,0,4,2,0,0,0,0,0,0


In [None]:
test_feature_based = test_dataset.drop(columns=['label','text','POS-tagged','seq_len'])
print(test_feature_based.corr(numeric_only=True)['label_bool'].sort_values(ascending=False)[1:])
test_feature_based.head()

pt_pt_pronoun_position_hints_bool               0.355261
pt_pt_pronoun_position_hints                    0.332280
a_plus_infinitive_count                         0.259438
a_plus_infinitive_count_bool                    0.258653
count_article_before_possessive_pronoun_bool    0.220368
count_article_before_possessive_pronoun         0.214633
count_portuguese_words                          0.097597
pt_pt_second_person_hints                       0.055611
pt_pt_second_person_hints_bool                  0.054436
count_acute_accent                              0.045891
count_uncontracted_words                       -0.040505
count_uncontracted_words_bool                  -0.042888
count_brazilian_words                          -0.059353
count_circumflex_accent                        -0.174879
pt_br_pronoun_position_hints_bool              -0.177841
pt_br_second_person_hints_bool                 -0.188993
pt_br_second_person_hints                      -0.211762
pt_br_pronoun_position_hints   

Unnamed: 0,label_bool,pt_pt_second_person_hints,pt_pt_second_person_hints_bool,pt_br_second_person_hints,pt_br_second_person_hints_bool,pt_pt_pronoun_position_hints,pt_pt_pronoun_position_hints_bool,pt_br_pronoun_position_hints,pt_br_pronoun_position_hints_bool,gerund_count,...,a_plus_infinitive_count,a_plus_infinitive_count_bool,count_acute_accent,count_circumflex_accent,count_article_before_possessive_pronoun,count_article_before_possessive_pronoun_bool,count_portuguese_words,count_brazilian_words,count_uncontracted_words,count_uncontracted_words_bool
0,0,0,0,0,0,0,0,0,0,0,...,0,0,4,3,0,0,0,0,0,0
1,0,0,0,1,1,0,0,1,1,0,...,0,0,8,0,0,0,0,0,1,1
2,0,2,1,0,0,0,0,1,1,2,...,0,0,9,1,0,0,0,0,2,1
3,0,1,1,0,0,0,0,0,0,3,...,0,0,10,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,0,0,1,...,0,0,5,0,0,0,0,0,0,0


### Model Training

In [None]:
def train_and_evaluate(input, labels, model, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(input, labels, test_size=0.2, random_state=random_state)
    # print number of test samples per class
    # print("Number of test samples per class: ", np.bincount(y_train))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

In [None]:
def train_and_evaluate_split(X_train, X_test, y_train, y_test, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

#### Multinomial Naive Bayes

In [None]:
method = "lr"
classifier = MultinomialNB() if method == "mb" else LogisticRegression()
evaluation = "test"

test = dev_feature_based if evaluation == "dev" else test_feature_based

X_train, X_test, y_train, y_test = train_feature_based.drop(columns=['label_bool']), test.drop(columns=['label_bool']), train_feature_based['label_bool'], test['label_bool']

metrics = train_and_evaluate_split(X_train, X_test, y_train, y_test, classifier)

In [None]:
print('Accuracy: ', metrics[0])
print('Precision: ', metrics[1])
print('Recall: ', metrics[2])
print('F1: ', metrics[3])
print('Confusion Matrix: ', metrics[4])

Accuracy:  0.7471241033969414
Precision:  0.7328072153325818
Recall:  0.784813522940703
F1:  0.7579192848351364
Confusion Matrix:  [[5191 2133]
 [1604 5850]]


In [None]:
# print 5 missclassified samples
X_train, X_test, y_train, y_test = train_test_split(feature_based.drop(columns=['label_bool']), dataset['label_bool'], test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
missclassified = np.where(y_test != y_pred)[0][:10]
# print the missclassified samples
for i in missclassified:
    print('Text: ', dataset['text'][i])
    print('Label: ', dataset['label'][i])
    print('------------------------')

Text:  Pat Mitchell: Qual a história desse broche? Madeleine Albright: Significa romper barreiras. PM: Oh. Eu diria que foi uma boa escolha para o TEDWomen (TEDMulheres). MA: Gasto a maior parte do tempo quando me levanto pela manhã tentando imaginar o que vai acontecer. E essa história de broches não teria acontecido se não fosse pelo Saddam Hussein. Vou contar-lhe o que aconteceu. Fui para as Nações Unidas como embaixadora. E foi logo após a Guerra do Golfo. Eu era uma embaixadora instruída. E o cessar-fogo havia sido traduzido em uma série de resoluções por sanções, e minhas instruções eram de dizer coisas terríveis sobre Saddam Hussein constantemente, o que ele merecia -- ele havia invadido outro país. E então, de repente, um poema apareceu nos jornais de Bagdad comparando-me a várias coisas, mas entre elas uma inigualável serpente. E, por acaso, eu possuia um broche de serpente. Então resolvi usá-lo quando falávamos sobre o Iraque. (Risos) E quando fui encontrar-me com a imprensa,

#### N-Gram Model

In [None]:
filtered_dataset = dataset.copy()
filtered_dataset.text = filtered_dataset.text.apply(lambda x: x.lower())
filtered_dataset.text = filtered_dataset.text.apply(lambda x: re.sub(r'\([^)]*\)', '', x))
filtered_dataset.text = filtered_dataset.text.apply(lambda x: re.sub(r'[^\w\s]', '', x))
filtered_dataset.head()

Unnamed: 0,text,label,label_bool,seq_len,pt_pt_second_person_hints,pt_pt_second_person_hints_bool,pt_br_second_person_hints,pt_br_second_person_hints_bool,pt_pt_pronoun_position_hints,pt_pt_pronoun_position_hints_bool,...,a_plus_infinitive_count,a_plus_infinitive_count_bool,count_acute_accent,count_circumflex_accent,count_article_before_possessive_pronoun,count_article_before_possessive_pronoun_bool,count_portuguese_words,count_brazilian_words,count_uncontracted_words,count_uncontracted_words_bool
0,é realmente uma grande honra ter a oportunidad...,BR,0,18,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,eu fui muito aplaudido por esta conferência e ...,BR,0,25,0,0,0,0,0,0,...,1,1,1,1,0,0,0,0,0,0
2,e quero dizer sinceramente porque eu preciso ...,BR,0,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,e hoje tenho que tirar meus sapatos ou botas p...,BR,0,13,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,eu vou contar uma rápida história para mostra...,BR,0,17,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0


In [None]:
def get_ngrams(train_df, eval_df, ngram_range=(1, 6), max_features=10000, show=False):
    ngrams = []

    for n in range(1, ngram_range[1] + 1):
        vectorizer = CountVectorizer(ngram_range=(1, n), max_features=max_features)
        train_ngram = vectorizer.fit_transform(train_df['text'])
        eval_ngram = vectorizer.transform(eval_df['text'])
        if show:
            print(f"{n}-grams train shape:", train_ngram.shape)
        ngrams.append((train_ngram, eval_ngram))

    return ngrams

def get_ngrams_results(ngrams, train_labels, eval_labels, show=False, max_features=10000, classifier=MultinomialNB()):
    results = []
    for i in range(len(ngrams)):
        metrics = train_and_evaluate_split(ngrams[i][0], ngrams[i][1], train_labels, eval_labels, classifier)
        metrics = [metrics[0], metrics[1], metrics[2], metrics[3], metrics[4][0][0], metrics[4][0][1], metrics[4][1][0], metrics[4][1][1], i+1]
        results.append(metrics)

    return pd.DataFrame(results, columns=['accuracy', 'precision', 'recall', 'f1', 'true_positive', 'false_positive', 'false_negative', 'true_negative', 'max_ngram'])

In [None]:
method = "mb"
classifier = MultinomialNB() if method == "mb" else LogisticRegression()
evaluation = "test"
eval_dataset = dev_dataset if evaluation == "dev" else test_dataset

In [None]:
ngrams = get_ngrams(train_dataset, eval_dataset, show=True)

1-grams train shape: (1819, 10000)
2-grams train shape: (1819, 10000)
3-grams train shape: (1819, 10000)
4-grams train shape: (1819, 10000)
5-grams train shape: (1819, 10000)
6-grams train shape: (1819, 10000)


In [None]:
df = get_ngrams_results(ngrams, train_dataset['label_bool'], eval_dataset['label_bool'], show=True, max_features=10000, classifier=classifier)
df.head(10)

Unnamed: 0,accuracy,precision,recall,f1,true_positive,false_positive,false_negative,true_negative,max_ngram
0,0.760815,0.734328,0.815563,0.772816,15768,6556,4098,18121,1
1,0.766652,0.730588,0.843107,0.782825,15416,6908,3486,18733,2
2,0.763105,0.725938,0.843557,0.780341,15248,7076,3476,18743,3
3,0.762836,0.725723,0.843242,0.780082,15243,7081,3483,18736,4
4,0.763083,0.726068,0.843152,0.780242,15256,7068,3485,18734,5
5,0.76315,0.725994,0.843557,0.780373,15250,7074,3476,18743,6


In [None]:
df.to_csv(f'/content/drive/MyDrive/PTvsBR/results/{train_prefix}-{eval_prefix}-{evaluation}-ngram-{method}-results.csv', index=False)

#### N-Gram Model with POS Tagging

In [None]:
pt_pos_tagger = joblib.load('/content/drive/MyDrive/PTvsBR/POS_tagger_brill.pkl')

def tag_sentence(sentence):
    sentence = sentence.lower()
    sentence = nltk.word_tokenize(sentence, language='portuguese')
    return pt_pos_tagger.tag(sentence)

In [None]:
train_pos_tagged = train_dataset.copy()
train_pos_tagged['text'] = train_pos_tagged['POS-tagged'].apply(lambda x: ' '.join(x.split("@@@")))
train_pos_tagged.head()

Unnamed: 0,text,label,label_bool,seq_len,POS-tagged,pt_pt_second_person_hints,pt_pt_second_person_hints_bool,pt_br_second_person_hints,pt_br_second_person_hints_bool,pt_pt_pronoun_position_hints,...,a_plus_infinitive_count,a_plus_infinitive_count_bool,count_acute_accent,count_circumflex_accent,count_article_before_possessive_pronoun,count_article_before_possessive_pronoun_bool,count_portuguese_words,count_brazilian_words,count_uncontracted_words,count_uncontracted_words_bool
0,com_PREP todas_PROADJ as_ART preocupações_N le...,BR,0,634,com_PREP@@@todas_PROADJ@@@as_ART@@@preocupaçõe...,8,1,6,1,0,...,3,1,65,11,0,0,0,1,1,1
1,eu_PROPESS tenho_V um_ART sósia_N ._. (_( riso...,BR,0,2549,eu_PROPESS@@@tenho_V@@@um_ART@@@sósia_N@@@._.@...,39,1,36,1,3,...,7,1,248,64,1,1,1,1,9,1
2,"bem_IN ,_, há_V muita_PROADJ coisa_N para_PREP...",BR,0,995,"bem_IN@@@,_,@@@há_V@@@muita_PROADJ@@@coisa_N@@...",1,1,24,1,4,...,0,0,90,20,6,1,0,1,2,1
3,estou_V aqui_ADV para_PREP apresentar_V minhas...,BR,0,2295,estou_V@@@aqui_ADV@@@para_PREP@@@apresentar_V@...,16,1,12,1,3,...,4,1,160,21,3,1,0,1,8,1
4,este_PROADJ celular_ADJ começou_V sua_PROADJ t...,BR,0,2513,este_PROADJ@@@celular_ADJ@@@começou_V@@@sua_PR...,38,1,18,1,1,...,6,1,204,45,1,1,1,3,14,1


In [None]:
dev_pos_tagged = dev_dataset.copy()
dev_pos_tagged['text'] = dev_pos_tagged['POS-tagged'].apply(lambda x: ' '.join(x.split("@@@")))
dev_pos_tagged.head()

Unnamed: 0,text,label,label_bool,seq_len,POS-tagged,pt_pt_second_person_hints,pt_pt_second_person_hints_bool,pt_br_second_person_hints,pt_br_second_person_hints_bool,pt_pt_pronoun_position_hints,...,a_plus_infinitive_count,a_plus_infinitive_count_bool,count_acute_accent,count_circumflex_accent,count_article_before_possessive_pronoun,count_article_before_possessive_pronoun_bool,count_portuguese_words,count_brazilian_words,count_uncontracted_words,count_uncontracted_words_bool
0,cerca_PREP de_PREP dois_NUM anos_N e_KC meio_N...,BR,0,14,cerca_PREP@@@de_PREP@@@dois_NUM@@@anos_N@@@e_K...,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,"que_PRO-KS-REL é_V sobre_PREP samantha_NPROP ,...",BR,0,13,que_PRO-KS-REL@@@é_V@@@sobre_PREP@@@samantha_N...,0,0,0,0,0,...,0,0,2,1,0,0,0,0,0,0
2,e_KC porque_ADV ela_PROPESS não_ADV pode_VAUX ...,BR,0,27,e_KC@@@porque_ADV@@@ela_PROPESS@@@não_ADV@@@po...,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,"como_KS músico_N ,_, engenheiro_N e_KC criado_...",BR,0,18,"como_KS@@@músico_N@@@,_,@@@engenheiro_N@@@e_KC...",0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
4,"seu_PROADJ nome_N é_V aiva_V ,_, e_KC ela_PROP...",BR,0,28,"seu_PROADJ@@@nome_N@@@é_V@@@aiva_V@@@,_,@@@e_K...",0,0,1,1,0,...,0,0,3,1,0,0,0,0,0,0


In [None]:
test_pos_tagged = test_dataset.copy()
test_pos_tagged['text'] = test_pos_tagged['POS-tagged'].apply(lambda x: ' '.join(x.split("@@@")))
test_pos_tagged.head()

Unnamed: 0,text,label,label_bool,seq_len,POS-tagged,pt_pt_second_person_hints,pt_pt_second_person_hints_bool,pt_br_second_person_hints,pt_br_second_person_hints_bool,pt_pt_pronoun_position_hints,...,a_plus_infinitive_count,a_plus_infinitive_count_bool,count_acute_accent,count_circumflex_accent,count_article_before_possessive_pronoun,count_article_before_possessive_pronoun_bool,count_portuguese_words,count_brazilian_words,count_uncontracted_words,count_uncontracted_words_bool
0,"agora_ADV ,_, você_PROPESS mudaria_V suas_PROA...",BR,0,13,"agora_ADV@@@,_,@@@você_PROPESS@@@mudaria_V@@@s...",0,0,1,1,0,...,0,0,0,2,0,0,0,0,0,0
1,"acredite_V ou_KC não_ADV ,_, as_ART pessoas_N ...",BR,0,24,"acredite_V@@@ou_KC@@@não_ADV@@@,_,@@@as_ART@@@...",0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
2,"em_PREP 1843_N ,_, lady_NPROP ada_N lovelace_N...",BR,0,36,"em_PREP@@@1843_N@@@,_,@@@lady_NPROP@@@ada_N@@@...",0,0,0,0,0,...,0,0,3,1,0,0,0,0,0,0
3,"de_PREP acordo_PREP com_PREP lovelace_NPROP ,_...",BR,0,17,de_PREP@@@acordo_PREP@@@com_PREP@@@lovelace_NP...,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,"o_ART teste_N lovelace_NPROP ,_, formalizado_P...",BR,0,13,"o_ART@@@teste_N@@@lovelace_NPROP@@@,_,@@@forma...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
method = "mb"
classifier = MultinomialNB() if method == "mb" else LogisticRegression()
evaluation = "test"
eval_pos_tagged = dev_pos_tagged if evaluation == "dev" else test_pos_tagged

In [None]:
pos_tagged_ngrams = get_ngrams(train_pos_tagged, eval_pos_tagged, show=True)

In [None]:
df = get_ngrams_results(pos_tagged_ngrams, train_pos_tagged['label_bool'], eval_pos_tagged['label_bool'], show=True, max_features=10000, classifier=classifier)
df.head(10)

Unnamed: 0,accuracy,precision,recall,f1,true_positive,false_positive,false_negative,true_negative,max_ngram
0,0.903302,0.877459,0.939496,0.907418,6346,978,451,7003,1
1,0.909325,0.874816,0.957204,0.914158,6303,1021,319,7135,2
2,0.904723,0.869831,0.95385,0.909905,6260,1064,344,7110,3
3,0.902084,0.866772,0.95224,0.907499,6233,1091,356,7098,4
4,0.901001,0.865974,0.950899,0.906452,6227,1097,366,7088,5
5,0.901069,0.86617,0.950765,0.906498,6229,1095,367,7087,6


In [None]:
df.to_csv(f'/content/drive/MyDrive/PTvsBR/results/{train_prefix}-{eval_prefix}-{evaluation}-pos-ngram-{method}-results.csv', index=False)


#### Adaptive Version

In [None]:
ADAPTIVE_TRAIN_ITERATIONS = 4

def adaptive_train(X_train, y_train, model, size, show=False):
  model.partial_fit(X_train[:size], y_train[:size], classes=[0, 1])
  # Iterate over subsets using groupby
  a = size // 10
  for i in range(1, len(y_train) // size):
    if show:
      print(f"Training subset {i} ({i*size}:{(i+1)*size}/{len(y_train)})...")
    X_subset, y_subset = list(X_train[i*size:(i+1)*size].toarray()), list(y_train[i*size:(i+1)*size])
    for iteration in range(ADAPTIVE_TRAIN_ITERATIONS):
      if len(y_subset) == 0:
        if show:
          print("All elements processed")
        break
      predictions = model.predict_proba(X_subset)
      indexes = top_indexes(predictions, lambda x: abs(x[0] - x[1]), a)
      if len(indexes) == 0:
        if show:
          print("Not enough confidence.")
        break
      X_removed = [X_subset.pop(index) for index in reversed(indexes)]
      y_removed = [y_subset.pop(index) for index in reversed(indexes)]
      model.partial_fit(X_removed, y_removed)
  return model


def top_indexes(subset, criteria, n):
    subset = [(i, item) for i, item in enumerate(subset)]
    subset = sorted(subset, key=lambda x: criteria(x[1]), reverse=True)
    return list(sorted([i for i, _ in subset[:n]]))


def evaluate(X_test, y_test, model):
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

def get_model(ngrams, y, size=512, model_key="mb", show=False):
  models = []
  for i in range(len(ngrams)):
    if show:
      print(f"Training ngrams with max size {i+1}")
    new_model = adaptive_train(ngrams[i][0], y, MultinomialNB() if model_key=="mb" else LogisticRegression(), size, show=show)
    models.append(new_model)

  return models


def adaptive_results(ngrams, y, models):
  results = []
  for i in range(len(ngrams)):
    metrics = evaluate(ngrams[i][1], y, models[i])
    metrics = [metrics[0], metrics[1], metrics[2], metrics[3], metrics[4][0][0], metrics[4][0][1], metrics[4][1][0], metrics[4][1][1], i+1]
    results.append(metrics)

  return pd.DataFrame(results, columns=['accuracy', 'precision', 'recall', 'f1', 'true_positive', 'false_positive', 'false_negative', 'true_negative', 'max_ngram'])

In [None]:
from sklearn.utils import shuffle

evaluation = "dev"
eval_dataset = dev_dataset if evaluation=="dev" else test_dataset
train_shuffled = shuffle(train_dataset.copy(), random_state=42)
train_shuffled.reset_index(inplace=True, drop=True)
ngrams = get_ngrams(train_shuffled, eval_dataset, show=True)

1-grams train shape: (53316, 10000)
2-grams train shape: (53316, 10000)
3-grams train shape: (53316, 10000)
4-grams train shape: (53316, 10000)
5-grams train shape: (53316, 10000)
6-grams train shape: (53316, 10000)


In [None]:
method = "mb"
size = 1024 * 16

saved_models = []

for i in range(10):
  models = get_model(ngrams, train_shuffled['label_bool'], size=size, show=False)
  saved_models.append(models)
  df = adaptive_results(ngrams, eval_dataset['label_bool'], models)
  df.to_csv(f'/content/drive/MyDrive/PTvsBR/results/{results_dir}/{train_prefix}-{eval_prefix}-{evaluation}-adaptive-{size}-splits-ngram-{method}-results.csv', index=False)
  if size <= 16:
    break
  size = size // 2

#### Adaptive w/ POS Tagging

In [None]:
from sklearn.utils import shuffle

evaluation = "dev"
eval_pos_tagged = dev_pos_tagged if evaluation=="dev" else test_pos_tagged
train_shuffled = shuffle(train_pos_tagged.copy(), random_state=42)
train_shuffled.reset_index(inplace=True, drop=True)
pos_ngrams = get_ngrams(train_shuffled, eval_pos_tagged, show=True)

1-grams train shape: (53316, 10000)
2-grams train shape: (53316, 10000)
3-grams train shape: (53316, 10000)
4-grams train shape: (53316, 10000)
5-grams train shape: (53316, 10000)
6-grams train shape: (53316, 10000)


In [None]:
method = "mb"
size = 1024 * 16

for i in range(10):
  models = get_model(pos_ngrams, train_shuffled['label_bool'], size=size, show=False)
  saved_models.append(models)
  df = adaptive_results(pos_ngrams, eval_pos_tagged['label_bool'], models)
  df.to_csv(f'/content/drive/MyDrive/PTvsBR/results/{results_dir}/{train_prefix}-{eval_prefix}-{evaluation}-pos-adaptive-{size}-splits-ngram-{method}-results.csv', index=False)
  if size <= 16:
    break
  size = size // 2

#### N-Grams with Neural Networks

In [None]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.l1 = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU()
        )
        self.l2 = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU()
        )
        self.l3 = nn.Sequential(
            nn.Linear(hidden_size, num_classes),
            nn.Sigmoid()
        )

    def forward(self, x):
        out = self.l1(x)
        out = self.l2(out)
        out = self.l3(out)
        return out

In [None]:
model = Net(1000, 100, 1)
summary(model, (1000,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 10]         100,010
       BatchNorm1d-2                   [-1, 10]              20
              ReLU-3                   [-1, 10]               0
            Linear-4                   [-1, 10]             110
       BatchNorm1d-5                   [-1, 10]              20
              ReLU-6                   [-1, 10]               0
            Linear-7                    [-1, 1]              11
           Sigmoid-8                    [-1, 1]               0
Total params: 100,171
Trainable params: 100,171
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.04
Forward/backward pass size (MB): 0.00
Params size (MB): 0.38
Estimated Total Size (MB): 0.42
----------------------------------------------------------------


In [None]:
learning_rate = 1e-3
batch_size = 256
criterion = nn.BCELoss()
epochs = 5
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
def train_model(model, criterion, optimizer, train_loader, val_loader, epochs=10, device='cpu'):
    train_loss_history = []
    val_loss_history = []
    for epoch in range(epochs):
        model.train()
        number_of_batches = 0
        train_loss_history.append(0)

        for X, y in train_loader:
            X = X.to(device)
            y = y.to(device).unsqueeze(1).to(torch.float32)
            optimizer.zero_grad()
            y_pred = model(X.float())
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

            train_loss_history[-1] += loss.item()
            number_of_batches += 1

        train_loss_history[-1] /= number_of_batches

        model.eval()
        number_of_batches = 0
        val_loss_history.append(0)
        with torch.no_grad():
            for X, y in val_loader:
                X = X.to(device)
                y = y.to(device).unsqueeze(1).to(torch.float32)
                y_pred = model(X.float())
                loss = criterion(y_pred, y)
                val_loss_history[-1] += loss.item()
                number_of_batches += 1

        val_loss_history[-1] /= number_of_batches
        print('Epoch: {} - Train Loss: {:.6f} - Val Loss: {:.6f}'.format(epoch+1, train_loss_history[-1], val_loss_history[-1]))

    return model, train_loss_history, val_loss_history

In [None]:
train_datasets = []
val_datasets = []
untagged_ngrams = get_ngrams(dataset, show=True, max_features=10000)
for i in range(len(untagged_ngrams)):
    X_train, X_val, y_train, y_val = train_test_split(untagged_ngrams[i], dataset['label_bool'], test_size=0.2, random_state=42)
    train_datasets.append(TensorDataset(torch.from_numpy(X_train.toarray()).float(), torch.from_numpy(y_train.values).long()))
    val_datasets.append(TensorDataset(torch.from_numpy(X_val.toarray()).float(), torch.from_numpy(y_val.values).long()))

train_loaders = []
val_loaders = []
for i in range(len(train_datasets)):
    train_loaders.append(DataLoader(train_datasets[i], batch_size=batch_size, shuffle=True))
    val_loaders.append(DataLoader(val_datasets[i], batch_size=batch_size, shuffle=False))

print(train_loaders[0].dataset.tensors[0].shape)
print(val_loaders[0].dataset.tensors[0].shape)

(14628, 22)
(14628, 343)
(14628, 4067)
(14628, 10000)
(14628, 10000)
(14628, 10000)
torch.Size([11702, 22])
torch.Size([2926, 22])


In [None]:
models = []
train_losses = []
val_losses = []
for i in range(len(train_loaders)):
    print(f'Ngram {i+1}:')
    model = Net(train_loaders[i].dataset.tensors[0].shape[1], 10, 1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    _, train_loss_history, val_loss_history = train_model(model, criterion, optimizer,
                                                          train_loaders[i], val_loaders[i], epochs=epochs, device=device)
    models.append(model)
    train_losses.append(train_loss_history)
    val_losses.append(val_loss_history)
    print()
    print("-"*100)
    print()

Ngram 1:
Epoch: 1 - Train Loss: 0.702200 - Val Loss: 0.684011
Epoch: 2 - Train Loss: 0.677422 - Val Loss: 0.671201
Epoch: 3 - Train Loss: 0.664212 - Val Loss: 0.661673
Epoch: 4 - Train Loss: 0.655630 - Val Loss: 0.656022
Epoch: 5 - Train Loss: 0.650367 - Val Loss: 0.652641

----------------------------------------------------------------------------------------------------

Ngram 2:
Epoch: 1 - Train Loss: 0.677902 - Val Loss: 0.655199
Epoch: 2 - Train Loss: 0.641968 - Val Loss: 0.634802
Epoch: 3 - Train Loss: 0.624921 - Val Loss: 0.630987
Epoch: 4 - Train Loss: 0.613532 - Val Loss: 0.628821
Epoch: 5 - Train Loss: 0.606799 - Val Loss: 0.630357

----------------------------------------------------------------------------------------------------

Ngram 3:
Epoch: 1 - Train Loss: 0.667001 - Val Loss: 0.657250
Epoch: 2 - Train Loss: 0.620409 - Val Loss: 0.650926
Epoch: 3 - Train Loss: 0.580085 - Val Loss: 0.658806
Epoch: 4 - Train Loss: 0.536740 - Val Loss: 0.681644
Epoch: 5 - Train Loss: 0.

In [None]:
import matplotlib.pyplot as plt

for i in range(len(train_losses)):
    plt.plot(train_losses[i], label=f'ngram {i+1}')
    plt.plot(val_losses[i], label=f'ngram {i+1} val')
    plt.legend()
    plt.show()

In [None]:
# per model, run the val set and get accuracy, precision, recall, f1

def get_metrics(model, val_loader, device='cpu'):
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for X, y in val_loader:
            X = X.to(device)
            y = y.to(device).unsqueeze(1).to(torch.float32)
            y_pred.append(model(X.float()).squeeze(1).cpu().numpy())
            y_true.append(y.squeeze(1).cpu().numpy())
    y_pred = np.concatenate(y_pred)
    y_true = np.concatenate(y_true)
    y_pred = np.where(y_pred > 0.5, 1, 0)
    return accuracy_score(y_true, y_pred), precision_score(y_true, y_pred), recall_score(y_true, y_pred), f1_score(y_true, y_pred)

In [None]:
metrics = []
for i in range(len(val_loaders)):
    accuracy, precision, recall, f1 = get_metrics(models[i], val_loaders[i], device=device)
    metrics.append([accuracy, precision, recall, f1, i+1])
metrics_df = pd.DataFrame(metrics, columns=['accuracy', 'precision', 'recall', 'f1', 'ngram'])
metrics_df.head(10)

Unnamed: 0,accuracy,precision,recall,f1,ngram
0,0.623035,0.609406,0.648818,0.628494,1
1,0.638414,0.641369,0.599444,0.619698,2
2,0.596036,0.574679,0.684979,0.625,3
3,0.573137,0.553846,0.675939,0.608832,4
4,0.55434,0.546592,0.546592,0.546592,5
5,0.584757,0.576213,0.586231,0.581179,6


In [None]:
metrics_df.to_csv('ngrams/pos-ngram-nn-results.csv', index=False)

## Error Analysis

In [None]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10000)
bigrams = bigram_vectorizer.fit_transform(pos_tagged['text'])
classifier = MultinomialNB()

X_train, X_test, y_train, y_test = train_test_split(bigrams, pos_tagged['label_bool'], test_size=0.2, random_state=42)
_, aux_X_test, _, _ = train_test_split(dataset['text'], dataset['label_bool'], test_size=0.2, random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# get wrong predictions and print text
wrong = np.where(y_pred != y_test)[0]
label_to_text = lambda x: 'PT' if x == 1 else 'BR'
missclassified = []
for i in wrong:
    # print example from aux_X_test
    print(f"Predicted: {label_to_text(y_pred[i])} - True: {label_to_text(y_test.iloc[i])}")
    print(aux_X_test.iloc[i])
    print()
    missclassified.append([aux_X_test.iloc[i], y_pred[i], y_test.iloc[i]])

missclassified_df = pd.DataFrame(missclassified, columns=['text', 'predicted', 'true'])
missclassified_df.head(10, )