In [1]:
from google.colab import drive
import torch
from torchsummary import summary
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device in use: {device}")
import sklearn
import pandas
import re
import tqdm
import numpy as np
import torch.utils.data as data_utils
from tqdm import tqdm
!pip install boto3 sentencepiece sacremoses transformers alive-progress
from alive_progress import alive_bar
import boto3
import requests
import regex
import sentencepiece
import sacremoses
import transformers
RANDOM_SEED=0

Device in use: cuda:0
Collecting boto3
  Downloading boto3-1.28.73-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alive-progress
  Downloading alive_progress-3.1.4-py3-

In [2]:
## Mount google drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
!unzip /content/drive/MyDrive/Projects/Kaggle-Disaster-Tweets/data/nlp-getting-started.zip

Archive:  /content/drive/MyDrive/Projects/Kaggle-Disaster-Tweets/data/nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Data Preprocessing




### Load data

In [4]:
# Load data
df_train = pandas.read_csv("train.csv")
df_test = pandas.read_csv("test.csv")

print(f"Number of samples for training: {len(df_train)}")
print(f"Number of samples for testing: {len(df_test)}")
print("Training data structure:")
print(df_train.keys())
print(df_train.head())


Number of samples for training: 7613
Number of samples for testing: 3263
Training data structure:
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


### Data augmentation

In [None]:
# In progress...

### Self-made preprocessing

In [5]:
## Data preprocessing
def df_text_preprocessing(df):
  sentences = [re.sub("[@#'.,!?-]", '', text.lower()) for text in df["text"]]
  print(sentences[0:5])
  words = [list(set(" ".join(sentence))) for sentence in sentences]
  df["words"] = words
  return df

In [6]:
df_train = df_text_preprocessing(df_train)
print(df_train["words"][2])
print(df_train["text"][2])

['our deeds are the reason of this earthquake may allah forgive us all', 'forest fire near la ronge sask canada', 'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected', '13000 people receive wildfires evacuation orders in california ', 'just got sent this photo from ruby alaska as smoke from wildfires pours into a school ']
['t', 's', 'y', 'b', 'p', 'f', 'e', 'o', 'l', 'c', 'd', 'g', 'h', 'r', 'a', 'n', 'i', ' ', 'u', 'x', 'k', 'v']
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected


### Preprocessing BERT with Hugginface



In [7]:
from transformers import BertModel, BertTokenizer

In [8]:
def df_tokenize_bert(texts, tokenizer, max_length=None):
  if max_length is None:
    max_length = max([len(text) for text in texts])
  tokens = [tokenizer.encode(re.sub("[@#'.,!?-]", '', text.lower()), add_special_tokens=True, max_length=max_length, padding='max_length', truncation=True) for text in texts]

  return np.int32(tokens)


In [9]:
def df_predict_encodings(tokens, targets, model, batch_size=8, output_layer="pooler_output"):
  tokens_tensor = data_utils.TensorDataset(torch.tensor(tokens), torch.IntTensor(targets))
  predict_loader = data_utils.DataLoader(dataset = tokens_tensor, batch_size = batch_size, shuffle = False)  # For preprocessing
  encodings = []
  model = model.to(device)
  with alive_bar(len(tokens)) as bar:
    with torch.no_grad():
      for texts, _ in predict_loader:
        texts = texts.to(device)
        encodings_batch = model(texts)
        encodings_batch = getattr(encodings_batch, output_layer).cpu() # Get encodings
        encodings.extend(np.float32(encodings_batch))
        bar()


  return np.float32(encodings)

In [10]:
def load_bert_encodings(df, max_length=None, batch_size=256, output_layer="pooler_output"):
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  model = BertModel.from_pretrained('bert-base-uncased')
  tokens = df_tokenize_bert(df_train["text"], tokenizer, max_length=None)
  encodings = df_predict_encodings(tokens, df["target"], model, batch_size, output_layer)
  print(f"Number of encondings: {len(encodings)}")
  print(f"Encondings shape: {encodings.shape}")
  return encodings


In [68]:
# Bert encodings
X_train = load_bert_encodings(df_train, max_length=None, batch_size=256, output_layer="pooler_output")
Y_train = df_train["target"]

        0%|          | 0/30 [00:00<?, ?it/s]
        3%|▎         | 1/30 [00:02<00:58,  2.01s/it]
        7%|▋         | 2/30 [00:04<00:56,  2.00s/it]
       10%|█         | 3/30 [00:06<00:54,  2.01s/it]
       13%|█▎        | 4/30 [00:08<00:52,  2.01s/it]
       17%|█▋        | 5/30 [00:10<00:50,  2.02s/it]
       20%|██        | 6/30 [00:12<00:48,  2.02s/it]
       23%|██▎       | 7/30 [00:14<00:46,  2.03s/it]
       27%|██▋       | 8/30 [00:16<00:44,  2.03s/it]
       30%|███       | 9/30 [00:18<00:42,  2.03s/it]
        33%|███▎      | 10/30 [00:20<00:40,  2.03s/it]
        37%|███▋      | 11/30 [00:22<00:38,  2.03s/it]
        40%|████      | 12/30 [00:24<00:36,  2.03s/it]
        43%|████▎     | 13/30 [00:26<00:34,  2.04s/it]
        47%|████▋     | 14/30 [00:28<00:32,  2.04s/it]
        50%|█████     | 15/30 [00:30<00:30,  2.05s/it]
        53%|█████▎    | 16/30 [00:32<00:28,  2.05s/it]
        57%|█████▋    | 17/30 [00:34<00:26,  2.05s/it]
        60%|██████    | 18/30 [00:36<0

|▏⚠︎                                      | (!) 30/7613 [0%] in 1:00.9 (0.49/s) 
Number of encondings: 7613
Encondings shape: (7613, 768)


## Train and val split

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.10, random_state=RANDOM_SEED)

# Finetuning

## SMOTE

In [None]:
# In progress... SMOTE is an oversampling technique used to address data imbalance by generating synthetic samples from the underrepresented class.

## PCA

In [62]:
from sklearn.decomposition import PCA
pca = PCA(n_components=64)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_val = pca.transform(X_val)

## SVM

In [63]:
from sklearn import svm

clf = svm.SVC()
clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_val)

## Random forests

In [70]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_val)

# Networks

In [None]:
# TODO: Test basic MLP

## BERT

In [None]:
# TODO: Training BERT, small transformers or LSTM...etc

# Testing

In [85]:
from sklearn.metrics import precision_recall_fscore_support
def eval_test(y_true, y_pred):
  precission, recall, fscore, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
  print(f"F1-score\t| Precission\t| Recall")
  print(f"{(fscore*100).round(2)}%\t\t| {(precission*100).round(2)}%\t| {(recall*100).round(2)}%")
  return precission, recall, fscore


In [86]:
precission, recall, fscore = eval_test(Y_val.values, Y_pred)

F1-score	| Precission	| Recall
61.38%		| 69.05%	| 55.24%


## Save model

In [67]:
import pickle
pickle.dump(pca, open("pca.sav", 'wb'))
# Reload clf
# loaded_model = pickle.load(open("model.sav", 'rb'))