In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from tqdm.auto import tqdm
import pandas as pd
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook loads later: the English stop-word
# list and the WordNet data behind WordNetLemmatizer.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)
# Left as the cell's final bare expression so its return value is still shown.
nltk.download('omw-1.4')

In [None]:
# Read the training frame, the test frame, and the submission template (its
# 'target' column is overwritten with predictions at the end of the notebook).
dfTrain, dfTest, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Built once here rather than on every clean_text() call.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_MENTION_RE = re.compile(r'@\S+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one raw text string before tokenisation.

    Steps, in order: strip URLs, strip @mentions, delete ASCII punctuation,
    lower-case, drop English stop words, lemmatise every remaining token
    (first as a noun, then as a verb), and finally drop any token whose
    lemma is itself a stop word.

    Args:
        text: The raw string. Any non-string value (e.g. the float NaN pandas
            produces for an empty CSV cell) is treated as empty text.

    Returns:
        The surviving lemmas joined by single spaces; "" when nothing
        survives.
    """
    if not isinstance(text, str):
        return ''

    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = text.translate(_PUNCT_TABLE).lower()

    kept = []
    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields the empty tokens that split(" ") produces
    # where a URL or mention was removed.
    for token in text.split():
        # Filter on the surface form first: the noun lemmatiser rewrites some
        # stop words (e.g. "was" -> "wa", "has" -> "ha"), which would let
        # them slip past a filter applied only after lemmatisation.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(lemmatizer.lemmatize(token), "v")
        # Second pass keeps the original behaviour of dropping tokens that
        # only become stop words once lemmatised.
        if lemma not in stop_words:
            kept.append(lemma)
    return ' '.join(kept)


In [None]:
# Overwrite the 'text' column of both frames with its cleaned version.
for frame in (dfTrain, dfTest):
    frame['text'] = frame['text'].apply(clean_text)

In [None]:
# Wrap each pandas frame in a `datasets.Dataset` so it can be mapped over.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (dfTrain, dfTest)
)

In [None]:
# Tokenizer for the same checkpoint name the classifier is loaded from.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
def tokenize_function(examples):
    """Tokenise the 'text' field of a batch of examples.

    Args:
        examples: A batch mapping as passed by `Dataset.map(..., batched=True)`;
            only its 'text' entry is read.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, with
        truncation enabled and padding set to 'max_length'.
    """
    # NOTE(review): no max_length is passed, so padding='max_length' presumably
    # pads every example to the tokenizer's model maximum - confirm this is
    # intended for short texts.
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded


tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Columns dropped from both splits before batches are passed to the model
# as keyword arguments.
unused_columns = ["text", "keyword", "location", "id"]

# Training split: drop the unused columns, expose the label column under the
# name "labels", and have the dataset return torch tensors.
tokenized_train = (
    tokenized_train
    .remove_columns(unused_columns)
    .rename_column("target", "labels")
)
tokenized_train.set_format("torch")

# Test split: same pruning and tensor format; no column is renamed here.
tokenized_test = tokenized_test.remove_columns(unused_columns)
tokenized_test.set_format("torch")

In [None]:
# Training batches are reshuffled on every pass; test batches keep dataset
# order so predictions can be written back row by row.
train_dataloader = DataLoader(
    tokenized_train,
    batch_size=8,
    shuffle=True,
)
test_dataloader = DataLoader(
    tokenized_test,
    batch_size=1,
    shuffle=False,
)

In [None]:
# Two-label sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use PyTorch's AdamW rather than the deprecated `transformers.AdamW`.
# eps and weight_decay are pinned to the values `transformers.AdamW` used by
# default, so the optimiser hyper-parameters stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [None]:
num_epochs = 4

# One scheduler step is taken per optimiser step, i.e. per training batch.
steps_per_epoch = len(train_dataloader)
num_training_steps = num_epochs * steps_per_epoch

# "linear" schedule over the whole run, with no warm-up steps.
scheduler_kwargs = dict(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
lr_scheduler = get_scheduler("linear", **scheduler_kwargs)

print(num_training_steps)

In [None]:
# One progress-bar tick per optimiser step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor in the batch onto the model's device.
        batch = dict(
            (name, tensor.to(device)) for name, tensor in batch.items()
        )

        # The batch is passed as keyword arguments; the loss is read from
        # the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the learning-rate schedule, then clear
        # the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

In [None]:
# Flat list of predicted class ids, one per test row, in dataloader order.
all_preds = []

model.eval()
for batch in tqdm(test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    # argmax over the last (class) dimension gives one class id per example.
    predictions = torch.argmax(logits, dim=-1).cpu().tolist()
    # list.append() returns None, so assigning its result back would destroy
    # the accumulator after the first batch. extend() mutates in place and
    # flattens each per-batch list, so the result stays one id per row for
    # any batch size.
    all_preds.extend(predictions)

In [None]:
# Put the predictions in the template's 'target' column and write the file
# without the index column.
submission = submission.assign(target=all_preds)
submission.to_csv('submission.csv', index=False)