In [2]:
!pip install transformers
!pip install datasets



In [16]:
import random
import pandas as pd
import transformers
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import re
import torch

In [17]:
random.seed(42)
np.random.seed(42)
pd.set_option('display.max_colwidth', None)

# Data

In [29]:
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Processing

In [19]:
def add_keyword_location(data):
    data = data.copy()
    keyword_index = ~data['keyword'].isna()
    location_index = ~data['location'].isna()

    data['cleaned_text'] = data['text']
    data.loc[keyword_index, 'cleaned_text'] = data[keyword_index].apply(lambda x:'keyword:%s\n%s' % (x['keyword'],x['cleaned_text']),axis=1)
    data.loc[location_index, 'cleaned_text'] = data[location_index].apply(lambda x:'location:%s\n%s' % (x['location'],x['cleaned_text']),axis=1)
    return data['cleaned_text']

In [20]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[\[\]#@]', '', text, flags=re.MULTILINE)
    text = re.sub(r'(\d),(\d)|,', r'\1\2', text, flags=re.MULTILINE)
    return text

In [21]:
data['cleaned_text'] = add_keyword_location(data)
data['cleaned_text'] = data['cleaned_text'].apply(clean_text)

# Model

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset

In [23]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples['cleaned_text'])

In [24]:
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(0.2)
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.rename_column('target', 'label')

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [25]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"f1": f1}

In [70]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    report_to='none',
    output_dir="current_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_strategy="steps",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,F1
100,No log,0.409118,0.83001
200,No log,0.418615,0.81693
300,No log,0.388183,0.837946
400,No log,0.468477,0.819069
500,0.374300,0.42496,0.833209
600,0.374300,0.502223,0.819098
700,0.374300,0.490878,0.820528
800,0.374300,0.599665,0.811172
900,0.374300,0.555609,0.826729
1000,0.199000,0.603795,0.813539




# Submissions

In [32]:
subms = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [30]:
test['cleaned_text'] = add_keyword_location(test)
test['cleaned_text'] = test['cleaned_text'].apply(clean_text)

In [35]:
test_dataset = Dataset.from_pandas(test)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [54]:
prediction = trainer.predict(test_dataset).predictions
result = np.argmax(prediction, axis=1)
test['target'] = result

In [67]:
submsion = subms[['id']].merge(test[['id', 'target']], on='id')

In [69]:
submsion.to_csv('./result.csv', index=False)