In [None]:
# Mount Google Drive at /content/drive. `google.colab` is only importable
# inside a Colab runtime, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import joblib

# Load the serialized POS tagger from disk. joblib.load unpickles the file,
# which can execute arbitrary code, so only point it at a trusted artifact.
# NOTE(review): `pt_pos_tagger` and `stopwords` are not referenced in the
# cells visible here; confirm they are still needed.
pt_pos_tagger = joblib.load('path/to/pos/tagger/POS_tagger_brill.pkl')

import warnings
# Silences every warning category for the rest of the session, including
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Folder and file-name prefix of the corpus. Both names are reused later when
# the feature table is written back to disk, so they stay notebook-level.
dir = "path/to/data"
prefix = "prefix"

# Read the ';'-separated raw file, discard rows with any missing value and
# renumber the surviving rows from 0.
raw_path = f'{dir}/{prefix}-raw.csv'
dataset = pd.read_csv(raw_path, sep=';').dropna().reset_index(drop=True)

# Numeric version of the label: 1 for 'PT', 0 for 'BR'. Any other label value
# is mapped to NaN by Series.map.
label_codes = {'PT': 1, 'BR': 0}
dataset['label_bool'] = dataset['label'].map(label_codes)

### Length Based Filtering

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return len(text.split())


# Word count of every text in the dataset.
dataset["seq_len"] = dataset["text"].apply(_word_count)

# Histogram of the word counts; only the 0-150 range is drawn, adjust the
# range to zoom in or out.
axes = dataset["seq_len"].hist(bins=100, range=[0, 150])
axes.set_xlabel("Sequence length")
axes.set_ylabel("Number of entries")

In [None]:
# Keep only texts with at least 10 words and renumber the rows from 0.
long_enough = dataset["seq_len"] >= 10
dataset = dataset[long_enough].reset_index(drop=True)

# Report how many samples remain, overall and per label.
pt_rows = dataset[dataset["label"] == "PT"]
br_rows = dataset[dataset["label"] == "BR"]
print("Number of samples:", len(dataset))
print("Number of PT samples:", len(pt_rows))
print("Number of BR samples:", len(br_rows))

In [None]:
# Re-plot the word-count histogram after the length filter, using the same
# bins and range as the earlier plot so the two are comparable.
dataset["seq_len"].hist(bins=100, range=[0, 150])

### Quality Based Filtering

In [None]:
# Drop every sample whose text contains one of the markers below.
# ';' and ':' are deliberately NOT in the list: filtering on them is disabled.
# Markers are matched as literal substrings (regex=False), so characters such
# as '|', '$' and '\' need no regex escaping and no invalid string escapes
# like "\|" or "\$" are required.
NOISE_MARKERS = [
    "|", "#", "_", "$", "@", "&", "/", "\\",
    "Clique", "cookie", "clique", "Cookie",
]

# Accumulate a single "contains any marker" mask instead of re-slicing the
# frame once per marker.
has_noise = pd.Series(False, index=dataset.index)
for marker in NOISE_MARKERS:
    has_noise |= dataset["text"].str.contains(marker, regex=False)

# Keep the clean rows and renumber them from 0.
dataset = dataset[~has_noise].reset_index(drop=True)

# Report how many samples remain, overall and per label.
print("Number of samples:", len(dataset))
print("Number of PT samples:", len(dataset[dataset["label"] == "PT"]))
print("Number of BR samples:", len(dataset[dataset["label"] == "BR"]))

In [None]:
# remove any entry with the same text for both languages
dataset = dataset.drop_duplicates(subset=['text'], keep=False)
print("Number of samples:", len(dataset))
print("Number of PT samples:", len(dataset[dataset["label"] == "PT"]))
print("Number of BR samples:", len(dataset[dataset["label"] == "BR"]))

### Features

In [None]:
# Download the NLTK 'punkt' resource.
# NOTE(review): presumably required by the tokenization inside the `features`
# module used in the next cell; confirm against features.py.
import nltk
nltk.download('punkt')

In [None]:
# `features` is a project-local module whose source is not shown here.
import features
# Rebind `dataset` to whatever get_features returns.
# NOTE(review): presumably the same rows plus numeric feature columns (a
# `count_uncontracted_words` column is read further down); confirm against
# features.py.
dataset = features.get_features(dataset)

In [None]:
print("Features correlation with label:")
print()

# Correlation of every numeric column with the binary label, sorted from the
# most positive to the most negative.
label_corr = dataset.corr(numeric_only=True)["label_bool"]
ranked_corr = label_corr.sort_values(ascending=False)

# Skip the first entry of the ranking (presumably label_bool's correlation
# with itself, which sorts to the top).
print(ranked_corr.iloc[1:])

In [None]:
# Write the filtered dataset, including the feature columns, to
# '<dir>/<prefix>-features.csv' in the same folder as the raw file; the row
# index is not written.
dataset.to_csv(f'{dir}/{prefix}-features.csv', index=False)

In [None]:
# Show the first two rows whose count_uncontracted_words feature is positive.
print("Examples with count_uncontracted_words > 0:")
print()
uncontracted_examples = dataset[dataset["count_uncontracted_words"] > 0]
print(uncontracted_examples[["text", "count_uncontracted_words", "label"]].head(2))

# Print one text together with the output of features.tag_sentence for it.
# The row is selected by POSITION (.iloc): a plain `dataset["text"][4]` is a
# label lookup, which raises KeyError (or silently picks another row) when the
# index has gaps left by earlier filtering.
example_text = dataset["text"].iloc[4]
print(example_text)
print(features.tag_sentence(example_text))