# 套件載入

In [5]:
import numpy as np
import pandas as pd
import re
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer

# 讀取資料集

In [12]:
# Read the labelled training split and the unlabelled test split from disk.
data, test_data = (
    pd.read_csv('nlp-getting-started/train.csv'),
    pd.read_csv('nlp-getting-started/test.csv'),
)

# 預處理

In [13]:
def keyword_preprocess(text):
    """Normalize a keyword value.

    Every '%20' in the keyword is replaced by a single space. A missing
    value (anything for which ``pd.notnull`` is false, e.g. None or NaN)
    is turned into the empty string.

    Args:
        text: The raw keyword, or a null value.

    Returns:
        The cleaned keyword string, or '' when the input is null.
    """
    return text.replace("%20", " ") if pd.notnull(text) else ''

def remove_url(text):
    """Delete every http(s)://t.co/... link from ``text``.

    Only links on the t.co host are matched; each match runs up to the
    next whitespace character. Other URLs are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matched links removed (surrounding whitespace is kept).
    """
    return re.sub(r'https?://t\.co/[^\s]*', '', text)

def remove_at(text):
    """Delete every '@...' token from ``text``.

    A match starts at an '@' character and extends up to the next
    whitespace character, so a bare '@' is removed as well.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matches removed (surrounding whitespace is kept).
    """
    return re.sub(r'@[^\s]*', '', text)

def text_preprocess(text):
    """Remove URLs and @xxx mentions from ``text``.

    URLs are stripped first (via ``remove_url``), then '@...' tokens
    (via ``remove_at``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return remove_at(remove_url(text))

# Strip URLs and @mentions from the text column of both splits.
data['text'] = data['text'].apply(text_preprocess)
test_data['text'] = test_data['text'].apply(text_preprocess)

# Replace '%20' in the keyword column; missing keywords become ''.
data['keyword'] = data['keyword'].apply(keyword_preprocess)
test_data['keyword'] = test_data['keyword'].apply(keyword_preprocess)

# Prepend the keyword to the text, separated by one space. Both columns hold
# strings at this point, so plain Series concatenation gives the same values as
# a row-wise apply, without the per-row Python overhead and without apply's
# special-casing of empty frames.
data['keyword_text'] = data['keyword'] + ' ' + data['text']
test_data['keyword_text'] = test_data['keyword'] + ' ' + test_data['text']

# Column layout fed to the model: "text" is the model input, "label" the target
# (the test split has no labels).
train_data_dict = {
    "text": data["keyword_text"].tolist(),
    "label": data["target"].tolist()
}

test_data_dict = {
    "text": test_data["keyword_text"].tolist()
}

train_dataset = Dataset.from_dict(train_data_dict)
test_dataset = Dataset.from_dict(test_data_dict)

# 載入預訓練模型

In [None]:
# Name of the pretrained checkpoint; reused below for both the tokenizer and the model.
checkpoint = "bert-base-uncased"
# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "text" field of ``example`` with the module-level tokenizer.

    Truncation is enabled; no padding is applied here.

    Args:
        example: A mapping with a "text" entry (a single example or a batch).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)

# Apply tokenize_function to both splits in batched mode.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Use dynamic padding: tokenize_function does not pad, so padding is left to the collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Two-label sequence-classification model loaded from the same checkpoint as the tokenizer.
# The load warning in the cell output reports that some weights are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# 訓練並輸出結果

In [14]:
# Training configuration: output goes to "test-trainer", two epochs,
# a checkpoint is saved per epoch, and experiment reporting is disabled.
training_args = TrainingArguments(
    output_dir="test-trainer",
    report_to='none',
    num_train_epochs=2,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

# Predict on the test split and pick the highest-scoring class along the last axis.
predictions = trainer.predict(tokenized_test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Write the id/target pairs as the submission file.
submission = pd.DataFrame({'id': test_data['id'], 'target': preds})
submission.to_csv('nlp-getting-started/submission1.csv', index=False)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.5003
1000,0.4353
1500,0.3829


In [49]:
# Re-read the test split and repeat the same preprocessing used for training,
# so each prediction can be shown next to its cleaned text.
test_data_location = pd.read_csv('nlp-getting-started/test.csv')
test_data_location['text'] = test_data_location['text'].apply(text_preprocess)
test_data_location['keyword'] = test_data_location['keyword'].apply(keyword_preprocess)
# Both columns hold strings here, so vectorized concatenation replaces the
# slower row-wise apply and yields the same values.
test_data_location['keyword_text'] = (
    test_data_location['keyword'] + ' ' + test_data_location['text']
)

# Build the model input straight from the frame; test_dataset_location is bound
# only once, to the Dataset, instead of first holding a temporary DataFrame slice.
test_data_dict_location = {
    "text": test_data_location["keyword_text"].tolist()
}
test_dataset_location = Dataset.from_dict(test_data_dict_location)
tokenized_test_dataset_location = test_dataset_location.map(tokenize_function, batched=True)
predictions = trainer.predict(tokenized_test_dataset_location)
preds = np.argmax(predictions.predictions, axis=-1)

# Pair each cleaned text with its predicted class.
result = pd.DataFrame({'text': test_data_location['text'], 'target': preds})

print(result)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

                                                   text  target
0                    Just happened a terrible car crash       1
1     Heard about #earthquake is different cities, s...       1
2     there is a forest fire at spot pond, geese are...       1
3              Apocalypse lighting. #Spokane #wildfires       1
4         Typhoon Soudelor kills 28 in China and Taiwan       1
...                                                 ...     ...
3258  EARTHQUAKE SAFETY LOS ANGELES ??? SAFETY FASTE...       0
3259  Storm in RI worse than last hurricane. My city...       1
3260                  Green Line derailment in Chicago        1
3261        MEG issues Hazardous Weather Outlook (HWO)        1
3262  #CityofCalgary has activated its Municipal Eme...       1

[3263 rows x 2 columns]


In [59]:
# Load the en_core_web_sm pipeline; location_detect below reads the entities it produces.
import en_core_web_sm
nlp = en_core_web_sm.load()
def location_detect(text):
    """Return the first entity in ``text`` labelled 'GPE', or None.

    The text is run through the module-level ``nlp`` pipeline and its
    entities are scanned in order.

    Args:
        text: The string to analyse.

    Returns:
        The text of the first entity whose label is 'GPE', or None when
        no such entity is found.
    """
    entities = nlp(text).ents
    return next((ent.text for ent in entities if ent.label_ == 'GPE'), None)
# Tag every test row with the first GPE entity found in its text (None if absent).
test_data_location['location'] = test_data_location['text'].apply(location_detect)

# Keep only rows predicted as class 1 that also have a detected location.
result = pd.DataFrame({
    'text': test_data_location['text'],
    'target': preds,
    'location': test_data_location['location'],
})
result = result[(result['target'] == 1) & result['location'].notnull()]
print(result)

                                                   text  target  \
4         Typhoon Soudelor kills 28 in China and Taiwan       1   
15    Birmingham Wholesale Market is ablaze BBC News...       1   
34      Accident on A27 near Lewes is it Kingston Ro...       1   
36    For Legal and Medical Referral Service  Call u...       1   
52    'We are still living in the aftershock of Hiro...       1   
...                                                 ...     ...   
3238  Wreckage 'Conclusively Confirmed' as From MH37...       1   
3239  Wreckage 'Conclusively Confirmed' as From MH37...       1   
3254  Officials: Alabama home quarantined over possi...       1   
3257  The death toll in a #IS-suicide car bombing on...       1   
3260                  Green Line derailment in Chicago        1   

                   location  
4                     China  
15               Birmingham  
34                    Lewes  
36                    Legal  
52                Hiroshima  
...            