In [None]:
from google.colab import drive
import torch
from torchsummary import summary
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device in use: {device}")
import sklearn
import pandas
import re
import tqdm
import numpy as np
import torch.utils.data as data_utils
from tqdm import tqdm
!pip install boto3 sentencepiece sacremoses transformers alive-progress
from alive_progress import alive_bar
import boto3
import requests
import regex
import sentencepiece
import sacremoses
import transformers
RANDOM_SEED=0

In [None]:
## Mount google drive
# Exposes the Drive contents under /content/drive/ (uses google.colab.drive,
# so this cell only works inside Colab).
drive.mount('/content/drive/')

In [None]:
!unzip /content/drive/MyDrive/Projects/Kaggle-Disaster-Tweets/data/nlp-getting-started.zip

# Data Preprocessing




### Load data

In [None]:
# Load data
from sklearn.utils import shuffle

# Read both CSV splits from the working directory.
df_train, df_test = (pandas.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Constant placeholder label: load_bert_encodings reads df["target"] for every frame.
df_test["target"] = 0

print(f"Number of samples for training: {len(df_train)}")
print(f"Number of samples for testing: {len(df_test)}")
# One print call, newline-separated, for the header, the column index and the preview.
print("Training data structure:", df_train.keys(), df_train.head(), sep="\n")

In [7]:
# Shuffle once with the notebook-wide seed (RANDOM_SEED, defined in the first
# cell) instead of a duplicated literal, so the row order stays reproducible
# and is controlled from a single place.
df_train = shuffle(df_train, random_state=RANDOM_SEED)

### Data augmentation

In [None]:
!pip install nlpaug
import nlpaug.flow as naf
import nlpaug.augmenter.word as naw

flow = naf.Sequential([
    naw.BackTranslationAug(device="cuda"),
    naw.SynonymAug(aug_p=0.3)
])

print(f"Number of samples for training before aug: {len(df_train)}")
percent_samples_aug = 0.2
print(int(len(df_train) * percent_samples_aug))
df_aug = df_train.iloc[0:int(len(df_train) * percent_samples_aug), :].copy()
for text_idx in tqdm(range(len(df_aug["text"]))):
  text = df_aug.iloc[text_idx]["text"]
  aug_text = flow.augment(text)
  df_aug.iloc[text_idx]["text"] = aug_text

print(f"Number of samples for training after aug: {len(df_train)}")

df_aug.to_csv("train_aug.csv", index=False)


In [69]:
# Reload the augmented rows from the CSV written by the augmentation cell
# (presumably so the slow augmentation loop can be skipped on later runs).
df_aug = pandas.read_csv("train_aug.csv")

In [70]:
# Append the augmented rows to the original ones and reshuffle so they are
# interleaved. Uses the notebook-wide RANDOM_SEED rather than a duplicated literal.
# NOTE: this cell is not idempotent — re-running it appends df_aug again.
df_train = pandas.concat([df_train, df_aug], ignore_index=True)
df_train = shuffle(df_train, random_state=RANDOM_SEED)

In [71]:
# Sanity check: number of training rows at this point in the notebook.
print(len(df_train))

9135


### Self-made preprocessing

In [None]:
## Data preprocessing
def df_text_preprocessing(df):
  """Add a "words" column with the distinct words of each cleaned text.

  Every entry of df["text"] is converted to str, lower-cased and stripped of
  the characters @ # ' . , ! ? - before being split on whitespace.

  Args:
    df: DataFrame with a "text" column. It is modified in place.

  Returns:
    The same DataFrame, with a "words" column holding, per row, the list of
    unique words in order of first appearance.
  """
  sentences = [re.sub("[@#'.,!?-]", '', str(text).lower()) for text in df["text"]]
  # Preview of the cleaned sentences.
  print(sentences[0:5])
  # Split on whitespace to obtain words. The previous `" ".join(sentence)`
  # iterated over the characters of the string, so the column ended up holding
  # unique characters. dict.fromkeys de-duplicates while keeping a stable order.
  words = [list(dict.fromkeys(sentence.split())) for sentence in sentences]
  df["words"] = words
  return df

In [None]:
df_train = df_text_preprocessing(df_train)
# Show the extracted words next to the raw text for the row whose index label
# is 2 (label lookup, not position — the frame has been shuffled).
for column_name in ("words", "text"):
  print(df_train[column_name][2])

### Preprocessing BERT with Hugging Face



In [8]:
from transformers import BertModel, BertTokenizer

In [9]:
def df_tokenize_bert(texts, tokenizer, max_length=None):
  """Tokenize every text to a fixed length and return ids plus attention masks.

  Args:
    texts: iterable of texts; each item is converted with str() before encoding.
    tokenizer: object exposing `encode_plus(...)` whose result has `input_ids`
      and `attention_mask` attributes.
    max_length: padded/truncated sequence length. When None, the length in
      characters of the longest text is used (a character count, not a token
      count).

  Returns:
    Tuple (tokens, attention_mask, max_length): an int32 array of token ids, a
    boolean array of the same shape, and the sequence length actually used.
  """
  if max_length is None:
    max_length_info = max([len(str(text)) for text in texts])
    print(f"Max length info of: {max_length_info}")
    max_length = max_length_info
  # The character-based mask that used to be built here was dead code: it was
  # overwritten before use by the tokenizer's own attention masks below.
  tokens = []
  attention_mask = []
  # TODO: use batch_encode_plus for faster extraction
  for text in texts:
    # truncation=True keeps every row at exactly max_length; without it a text
    # longer than max_length yields a longer row and the arrays below become
    # ragged.
    encoding = tokenizer.encode_plus(
      str(text),
      add_special_tokens=True,
      max_length=max_length,
      padding='max_length',
      truncation=True,
    )
    tokens.append(encoding.input_ids)
    attention_mask.append(encoding.attention_mask)

  return np.int32(tokens), np.array(attention_mask, dtype=bool), max_length

In [10]:
def df_predict_encodings(tokens, attention_mask, targets, model, batch_size=8, output_layer="pooler_output"):
  """Run `model` over the tokenized texts in batches and collect one of its outputs.

  Args:
    tokens: array of token ids, one row per text.
    attention_mask: attention mask with one row per text, passed to the model
      as its second positional argument.
    targets: one value per row; only used to build the dataset, the values are
      ignored during prediction.
    model: model called as `model(tokens, attention_mask)`. It is moved to the
      notebook-level `device` and switched to eval mode.
    batch_size: number of rows per forward pass.
    output_layer: name of the attribute read from the model output.

  Returns:
    float32 NumPy array with one entry per input row.
  """
  # Go through NumPy with explicit dtypes instead of the legacy typed-tensor
  # constructors; np.asarray also keeps positional order when a pandas object
  # with a shuffled index is passed in.
  dataset = data_utils.TensorDataset(
    torch.tensor(tokens),
    torch.as_tensor(np.asarray(attention_mask), dtype=torch.uint8),
    torch.as_tensor(np.asarray(targets), dtype=torch.int32),
  )
  predict_loader = data_utils.DataLoader(dataset = dataset, batch_size = batch_size, shuffle = False)  # For preprocessing
  encodings = []
  model = model.to(device)
  # Inference only: make sure dropout and similar layers are disabled.
  model.eval()
  # len(predict_loader) is the real number of batches (the last one may be
  # partial); the floor division used before under-counted it.
  with alive_bar(len(predict_loader)) as bar:
    with torch.no_grad():
      for batch_tokens, batch_mask, _ in predict_loader:
        batch_tokens = batch_tokens.to(device)
        batch_mask = batch_mask.to(device)
        outputs = model(batch_tokens, batch_mask)
        batch_encodings = getattr(outputs, output_layer).cpu() # Get encodings
        # extend() appends one array per row of the batch.
        encodings.extend(batch_encodings.numpy().astype(np.float32))
        bar()

  return np.float32(encodings)

In [11]:
def load_bert_encodings(df, key="text", max_length=None, model_config="bert-base-uncased", batch_size=256, output_layer="pooler_output"):
  """Encode one column of `df` with a pretrained BERT model.

  Loads the tokenizer and model named by `model_config`, tokenizes df[key]
  with df_tokenize_bert and feeds the result, together with df["target"], to
  df_predict_encodings.

  Args:
    df: DataFrame containing the `key` column and a "target" column.
    key: name of the column to encode.
    max_length: sequence length forwarded to df_tokenize_bert.
    model_config: pretrained checkpoint name for tokenizer and model.
    batch_size: batch size forwarded to df_predict_encodings.
    output_layer: model output attribute forwarded to df_predict_encodings.

  Returns:
    Tuple (encodings, max_length) where max_length is the value reported back
    by df_tokenize_bert.
  """
  bert_tokenizer = BertTokenizer.from_pretrained(model_config)
  bert_model = BertModel.from_pretrained(model_config)

  tokens, mask, max_length = df_tokenize_bert(
    df[key], bert_tokenizer, max_length=max_length
  )
  print(f"Length tokens in use {len(tokens[0])}")

  encodings = df_predict_encodings(
    tokens, mask, df["target"], bert_model, batch_size, output_layer
  )
  print(f"Number of encondings: {len(encodings)}")
  print(f"Encondings shape: {encodings.shape}")

  return encodings, max_length

In [None]:
# Bert encodings
# Encode the tweet text of the training set; the returned max_length is reused
# by the test-set cells below.
X_train, max_length = load_bert_encodings(
  df_train,
  key="text",
  max_length=300,
  model_config="bert-base-cased",
  batch_size=256,
  output_layer="pooler_output",
)
Y_train = df_train["target"]

In [None]:
# Encode the "keyword" column of the training set with the same settings as the text.
X_train_keyword, max_length = load_bert_encodings(
  df_train,
  key="keyword",
  max_length=300,
  model_config="bert-base-cased",
  batch_size=256,
  output_layer="pooler_output",
)

In [None]:
# Encode the test-set text (default key) at the sequence length used for training.
X_test, _ = load_bert_encodings(
  df_test,
  max_length=max_length,
  model_config="bert-base-cased",
  batch_size=32,
  output_layer="pooler_output",
)


In [None]:
# Encode the test-set "keyword" column at the sequence length used for training.
X_test_keyword, _ = load_bert_encodings(
  df_test,
  key="keyword",
  max_length=max_length,
  model_config="bert-base-cased",
  batch_size=32,
  output_layer="pooler_output",
)

#### Join text and keyword features

In [None]:
# Append the keyword encodings to the text encodings along the last axis,
# for the training and test matrices alike.
X_train, X_test = (
  np.concatenate(feature_pair, axis=-1)
  for feature_pair in ((X_train, X_train_keyword), (X_test, X_test_keyword))
)

## Train and val split

In [None]:
from sklearn.model_selection import train_test_split
NUM_SAMPLES_VALIDATION = 500
# Hold out the leading rows for validation; everything after them stays in training.
# NOTE(review): df_train has had augmented copies appended before encoding, so
# validation rows may have near-duplicates left in the training part — TODO confirm.
val_rows = slice(0, NUM_SAMPLES_VALIDATION)
train_rows = slice(NUM_SAMPLES_VALIDATION, None)
X_val, Y_val = X_train[val_rows], Y_train[val_rows]
X_train, Y_train = X_train[train_rows], Y_train[train_rows]
# X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.10, random_state=RANDOM_SEED)

# Finetuning

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
# One of "over", "under" or "both".
sampling_strategy = "over"

# All samplers share the notebook-wide RANDOM_SEED instead of a repeated literal.
if sampling_strategy == "over":
  pipeline = SMOTE(random_state=RANDOM_SEED)
elif sampling_strategy == "under":
  pipeline = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)
elif sampling_strategy == "both":
  # Over-sample with SMOTE first, then under-sample the majority class.
  over = SMOTE(sampling_strategy=0.85, random_state=RANDOM_SEED)
  under = RandomUnderSampler(sampling_strategy="majority", random_state=RANDOM_SEED)

  steps = [('o', over), ('u', under)]
  pipeline = Pipeline(steps=steps)
else:
  # Fail here with a clear message rather than with a NameError (or a stale
  # `pipeline` from an earlier run) on the fit_resample call below.
  raise ValueError(f"Unknown sampling_strategy {sampling_strategy!r}; expected 'over', 'under' or 'both'")

# Only the training split is resampled.
X_train, Y_train = pipeline.fit_resample(X_train, Y_train)

## PCA

In [None]:
from sklearn.decomposition import PCA
# Seeded so the projection is reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=64, random_state=RANDOM_SEED)
# Fit on the training split only, then project every split into the same space.
pca.fit(X_train)
X_train = pca.transform(X_train)
X_val = pca.transform(X_val)
# The classifier cells predict on X_test as well, so it has to be projected too;
# it was previously left in the original feature space while the models were
# fitted on 64 components.
X_test = pca.transform(X_test)

## SVM

In [None]:
from sklearn import svm

# Support-vector classifier with default settings; fit() hands back the
# fitted estimator, so construction and training are chained.
clf = svm.SVC().fit(X_train, Y_train)

# Predictions for the validation split and for the test set.
Y_pred = clf.predict(X_val)
Y_test_pred = clf.predict(X_test)

## KNN


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbour classifier; fit() hands back the fitted estimator.
knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)

# Predictions for the validation split and for the test set.
Y_pred = knn_classifier.predict(X_val)
Y_test_pred = knn_classifier.predict(X_test)

## Random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic; seed with the notebook-wide RANDOM_SEED so
# repeated runs produce the same model and predictions.
clf = RandomForestClassifier(random_state=RANDOM_SEED)
clf.fit(X_train, Y_train)

# Predictions for the validation split and for the test set.
Y_pred = clf.predict(X_val)
Y_test_pred = clf.predict(X_test)

# Networks

In [None]:
# TODO: Test basic MLP

## BERT

In [None]:
# TODO: Training BERT, small transformers or LSTM...etc

# Testing

In [None]:
from sklearn.metrics import precision_recall_fscore_support
def eval_test(y_true, y_pred):
  """Print and return precision, recall and F1 for binary predictions.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, same length as y_true.

  Returns:
    Tuple (precision, recall, fscore) as returned by
    precision_recall_fscore_support with average="binary".
  """
  precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
  # Format through built-in round() on plain floats: the `.round()` method used
  # before only exists on NumPy scalars, so it breaks if the metric comes back
  # as a Python float.
  fscore_pct, precision_pct, recall_pct = (
    round(float(value) * 100, 2) for value in (fscore, precision, recall)
  )
  print(f"Population {len(y_true)}")
  print(f"F1-score\t| Precission\t| Recall")
  print(f"{fscore_pct}%\t\t| {precision_pct}%\t| {recall_pct}%")
  return precision, recall, fscore

In [None]:
# Score the validation predictions currently held in Y_pred (each classifier
# cell above overwrites Y_pred, so this reflects whichever one ran last).
precission, recall, fscore = eval_test(Y_val.values, Y_pred)

## Save model

In [None]:
import pickle
# Persist the fitted PCA projection; the context manager closes the file even
# if pickling fails (the handle was previously never closed).
with open("pca.sav", 'wb') as pca_file:
  pickle.dump(pca, pca_file)
# NOTE: only the PCA is written here. Nothing in this cell creates "model.sav",
# so the classifier must be dumped separately before the snippet below can work.
# Reload clf
# loaded_model = pickle.load(open("model.sav", 'rb'))

## Submit



In [83]:
# Fill the "target" column of the sample submission with the test-set
# predictions and write the result to submission.csv.
# Y_test_pred holds the output of whichever classifier cell ran last.
df_submission = pandas.read_csv("sample_submission.csv")
df_submission.loc[:,"target"] = Y_test_pred
df_submission.to_csv("submission.csv", index=False)