In [0]:
# !pip install fastai -q

# Finetune language model on domain specific documents

In [0]:
from fastai import *
from fastai.text import *
from fastai.vision import load_learner
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
working_dir = 'drive/My Drive/kaggle_disaster'
data_path = Config.data_path()
name = f'disaster_wiki'
path = data_path/name
path.mkdir(exist_ok=True, parents=True)
lm_fns = [f'disaster_wt', f'disaster_wt_vocab']

In [0]:
def preprocessing(text):
    text = text.lower()
    text = html.unescape(text)
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|(\w+:\/\/\S+)|(<.*?>)"," ",text).split())
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|(#)|(\w+:\/\/\S+)|(<.*?>)"," ",text).split())
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    emoji_pattern.sub(r'', text)
    return text

### Load the data

In [4]:
df_train = pd.read_csv(f'{working_dir}/data/train.csv')
df_test = pd.read_csv(f'{working_dir}/data/test.csv')
df = pd.concat([df_train, df_test])
df.loc[df.text.isna()] = ''
# df['text'] = df.text.apply(lambda text: preprocessing(text))
df.dropna()
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Forward finetuned language model

In [5]:
# Load until this cell and skip to 'Train the Text Classifier using the finetuned language model as encoder' section if not doing the finetune lm again
data_lm = (TextList.from_df(df, "./", cols='text')
          .split_by_rand_pct(0.1, seed=42)
          .label_for_lm()
          .databunch(bs=128))

In [0]:
learn_lm = language_model_learner(data_lm, AWD_LSTM, pretrained=True, drop_mult=1.0).to_fp16()

In [0]:
lr = 1e-3
lr *= 128/48

In [12]:
learn_lm.fit_one_cycle(4, lr*10, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.783614,3.47984,0.400279,00:02
1,3.997932,3.197451,0.437444,00:02
2,3.565916,3.112645,0.449386,00:02
3,3.303634,3.102633,0.451953,00:02


In [8]:
learn_lm.unfreeze()
learn_lm.fit_one_cycle(8, lr, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,4.749576,4.006441,0.355301,00:03
1,4.181105,3.376462,0.421875,00:03
2,3.781356,3.132309,0.450167,00:03
3,3.43433,2.996869,0.467913,00:03
4,3.166739,2.944044,0.474275,00:03
5,2.953372,2.913728,0.481083,00:03
6,2.777068,2.902611,0.483371,00:03
7,2.658154,2.903527,0.483817,00:03




In [0]:
learn_lm.save(f'{os.getcwd()}/{working_dir}/model/disaster_fine_tuned')
learn_lm.save_encoder(f'{os.getcwd()}/{working_dir}/model/disaster_fine_tuned_enc')

### Train the Text Classifier using the finetuned language model as encoder

In [0]:
train, test = train_test_split(df_train, test_size=0.2)

#### Define databunch and text classifier learner

In [11]:
bs=128
print(f"Unique class: {train.target.unique()}")
data_clas = (TextList.from_df(df_train, Config.data_path(),vocab=data_lm.vocab, cols='text')
            .split_by_rand_pct(0.05, seed=42)
            .label_from_df(cols='target')
            .databunch(bs=bs))

Unique class: [0 1]


In [0]:
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=1.0, metrics=[accuracy]).to_fp16()
learn_c.load_encoder(f'{os.getcwd()}/{working_dir}/model/disaster_fine_tuned_enc')
learn_c.freeze()

In [0]:
lr = 2e-3
lr *= bs/48

#### Start training

In [14]:
learn_c.fit_one_cycle(4, lr, moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.583299,0.490728,0.760526,00:01
1,0.551499,0.472554,0.784211,00:01
2,0.538909,0.472433,0.784211,00:01
3,0.524632,0.464117,0.797368,00:01


In [15]:
learn_c.freeze_to(-2)
learn_c.fit_one_cycle(4, slice(lr/(2.6**4), lr), moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.519825,0.448001,0.784211,00:01
1,0.526998,0.453196,0.794737,00:01
2,0.514819,0.432424,0.802632,00:01
3,0.500237,0.426611,0.813158,00:01




In [16]:
learn_c.freeze_to(-3)
learn_c.fit_one_cycle(4, slice(lr/(2.6**4), lr), moms=(0.8, 0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.493986,0.418491,0.805263,00:02
1,0.484938,0.42358,0.813158,00:02
2,0.473894,0.414082,0.810526,00:02
3,0.462834,0.408381,0.823684,00:02




In [17]:
learn_c.unfreeze()
learn_c.fit_one_cycle(4, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))

epoch,train_loss,valid_loss,accuracy,time
0,0.454659,0.407112,0.815789,00:03
1,0.460303,0.403994,0.821053,00:03
2,0.458281,0.405509,0.821053,00:03
3,0.453642,0.406618,0.823684,00:03




In [32]:
test['pred'] = test.text.apply(lambda t: learn_c.predict(t)[0].data.tolist())

print(classification_report(test['target'], test['pred']))
print(confusion_matrix(test['target'], test['pred']))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87       884
           1       0.84      0.78      0.81       639

    accuracy                           0.85      1523
   macro avg       0.85      0.84      0.84      1523
weighted avg       0.85      0.85      0.85      1523

[[792  92]
 [139 500]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


#### Save the model

In [0]:
# export model
learn_c.to_fp32().export(f'{os.getcwd()}/{working_dir}/model/cls_model.pkl')

### Create Submission

In [37]:
df_pred = pd.read_csv(f'{working_dir}/data/test.csv')
df_pred.head(2)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."


In [38]:
learn_c = load_learner(f'{os.getcwd()}/{working_dir}/model', file='cls_model.pkl')
df_pred['target'] = df_pred.text.apply(lambda t: learn_c.predict(t)[0].data.tolist())

In [39]:
df_pred.head(5)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [36]:
df_pred = df_pred[['id', 'target']]
df_pred.to_csv(f'{os.getcwd()}/{working_dir}/submission/submission-v1.csv')

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
