In [2]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from spacy.util import minibatch, compounding

# Folder the CSV files are read from (path is relative, prefixed onto each file name).
datafolder = '../data/'

In [22]:
import gc

## Memory Management - Useful for Kaggle Kernel
## Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Only plain NumPy signed-integer and floating-point columns are touched.
    Object, bool, unsigned, datetime, categorical and pandas extension columns
    are left exactly as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object (it is modified
        in place) and the same object is returned.
    allow_float16 : bool, default True
        When True, float columns whose range fits are stored as float16, which
        keeps only about three significant decimal digits. Pass False to stop
        at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidates are ordered narrowest first so the first fit is the smallest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)
    if not allow_float16:
        float_candidates = float_candidates[1:]

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; kind 'i'/'f' excludes
        # bool ('b'), unsigned ('u'), datetime ('M'), object ('O'), etc.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            # Never widen: stop once the candidate is no narrower than the column.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # NaN min/max (empty or all-NaN column) compares False, so no cast.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem:
        # Guarded so a zero-byte frame cannot raise ZeroDivisionError.
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [27]:
## Loading Language Model
import spacy
# Load the en_core_web_sm model; `nlp` is the module-level pipeline that
# spacy_tokenizer() calls.
# Alternative model, left disabled:
# nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en_core_web_sm')

## Data Ingestion

In [82]:
# Read the three input tables from `datafolder`: the submission template,
# the test set and the training set.
sample_submission = pd.read_csv(datafolder + "sample_submission.csv")
test = pd.read_csv(datafolder + "test.csv")
train = pd.read_csv(datafolder + "train.csv")

In [83]:
## Filling NAs with values
# Missing 'keyword' and 'location' entries become the literal string 'unknown';
# columns not named in the dict are not filled. Both frames are modified in place.
fill_values = {'keyword': 'unknown', 'location': 'unknown'}
train.fillna(value=fill_values, inplace = True)
test.fillna(value=fill_values, inplace = True)

In [84]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
# NOTE: string.punctuation is a single str, so `x in punctuations` is a
# substring test rather than list membership.
punctuations = string.punctuation

# Create our list of stopwords
# Same object as the STOP_WORDS name imported above, reached via the module path.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the module-level ``nlp`` pipeline and clean the tokens.

    Each token is reduced to its lowercase text with surrounding whitespace
    stripped (no lemmatization is applied). A token is discarded when it is in
    ``stop_words`` or when ``token in punctuations`` is true.

    Returns
    -------
    list of str
        The surviving tokens, in document order.
    """
    kept = []
    for token in nlp(sentence):
        text = token.lower_.strip()
        # `punctuations` is a str, so this is a substring check; it also
        # rejects the empty string left behind by whitespace-only tokens.
        if text in stop_words or text in punctuations:
            continue
        kept.append(text)
    return kept

In [98]:
# Function to remove URLs
def remove_http(string):
    """Split ``string`` on whitespace and drop every word that is a web URL.

    A word counts as a URL when it starts with ``http://`` or ``https://``,
    compared case-insensitively. The remaining words are returned as a list so
    the caller can re-join them.

    Parameters
    ----------
    string : str
        Text to clean. The parameter name shadows the stdlib ``string`` module
        inside this function only; it is kept for compatibility with callers.

    Returns
    -------
    list of str
        The non-URL words in their original order; empty for empty input.
    """
    url_prefixes = ('http://', 'https://')
    return [
        word for word in string.split()
        if not word.lower().startswith(url_prefixes)
    ]

In [96]:
def text_feature_engineering(df_train, df_test):
    """Strip URLs from the ``text`` column of the train and test frames.

    The two frames are stacked so the cleaning runs once, then split back at
    the original train length. Columns that exist only in ``df_train`` are
    removed from the returned test frame.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that each contain a ``text`` column of strings.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Cleaned train and test frames. Original index labels are preserved.
        Train-only columns may change dtype because the stacked frame holds
        missing values for the test rows.
    """
    train_only_cols = [col for col in df_train.columns if col not in df_test.columns]

    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # result (index labels kept, column order not sorted).
    full = pd.concat([df_train, df_test], sort=False)

    full['text'] = full['text'].apply(lambda text: ' '.join(remove_http(text)))

    n_train = len(df_train)
    # .copy() so later column assignments on the result do not act on a slice view.
    train_out = full.iloc[:n_train].copy()
    test_out = full.iloc[n_train:].drop(columns=train_only_cols)
    return train_out, test_out

In [97]:
# Run the URL clean-up on both frames; this rebinds `train` and `test`.
train, test = text_feature_engineering(train, test)

AttributeError: 'str' object has no attribute 'lower_'

In [88]:
# Display the training frame.
train

Unnamed: 0,id,keyword,location,text,target
0,1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1.0
2,5,unknown,unknown,All residents asked to 'shelter in place' are ...,1.0
3,6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1.0
4,7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1.0
...,...,...,...,...,...
7608,10869,unknown,unknown,Two giant cranes holding a bridge collapse int...,1.0
7609,10870,unknown,unknown,@aria_ahrary @TheTawniest The out of control w...,1.0
7610,10871,unknown,unknown,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1.0
7611,10872,unknown,unknown,Police investigating after an e-bike collided ...,1.0


## Building the model

In [89]:
def load_data(limit=0, split=0.9):
    """Build shuffled training and evaluation splits from the module-level ``train`` frame.

    Parameters
    ----------
    limit : int
        Keep only the last ``limit`` shuffled examples. The default 0 keeps
        everything, because ``[-0:]`` is the whole list.
    split : float
        Fraction of the kept examples that goes to the first (training) part.

    Returns
    -------
    ((texts, cats), (texts, cats))
        Training part followed by evaluation part. ``texts`` is a tuple taken
        from ``train.text``; ``cats`` is a list of
        ``{"POSITIVE": bool, "NEGATIVE": bool}`` dicts derived from ``train.target``.
    """
    examples = list(zip(train.text, train.target))
    random.shuffle(examples)
    examples = examples[-limit:]

    texts, labels = zip(*examples)
    cats = []
    for label in labels:
        is_positive = bool(label)
        cats.append({"POSITIVE": is_positive, "NEGATIVE": not is_positive})

    cut = int(len(examples) * split)
    train_part = (texts[:cut], cats[:cut])
    eval_part = (texts[cut:], cats[cut:])
    return train_part, eval_part


In [90]:
def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score for the "POSITIVE" label.

    Parameters
    ----------
    tokenizer : callable
        Applied to every element of ``texts`` to build the docs fed to ``textcat.pipe``.
    textcat : object
        Must expose ``pipe(docs)`` yielding objects with a ``cats`` mapping of
        label -> score.
    texts : iterable
        Raw inputs, aligned by position with ``cats``.
    cats : sequence of dict
        Gold annotations; ``cats[i][label]`` is compared against 0.5.

    Returns
    -------
    dict
        Keys ``textcat_p``, ``textcat_r`` and ``textcat_f``.
    """
    # fp / fn start at a tiny epsilon so the divisions below cannot hit zero.
    true_pos, false_pos, false_neg, true_neg = 0.0, 1e-8, 1e-8, 0.0

    parsed = (tokenizer(text) for text in texts)
    for idx, doc in enumerate(textcat.pipe(parsed)):
        gold = cats[idx]
        for label, score in doc.cats.items():
            # Only labels present in the gold dict count, and only the
            # non-"NEGATIVE" ones are scored.
            if label not in gold or label == "NEGATIVE":
                continue
            if score >= 0.5:
                if gold[label] >= 0.5:
                    true_pos += 1.0
                elif gold[label] < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if gold[label] < 0.5:
                    true_neg += 1
                elif gold[label] >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    total = precision + recall
    f_score = 0.0 if total == 0 else 2 * (precision * recall) / total
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}


In [None]:
# en_trf_xlnetbasecased_lg
# en_trf_bertbaseuncased_lg

def train_bert_model(model_choice):
    """Load a transformer pipeline and attach a two-label ``trf_textcat`` component.

    No training loop is run here: the function builds the pipeline, reports the
    size of the splits produced by ``load_data()`` and returns the pipeline.

    Parameters
    ----------
    model_choice : str
        ``"en_trf_xlnetbasecased_lg"`` or ``"en_trf_bertbaseuncased_lg"``.

    Returns
    -------
    The loaded pipeline with ``trf_textcat`` added last.

    Raises
    ------
    ValueError
        If ``model_choice`` is not one of the supported names. The original
        code only printed a hint and then crashed on an unbound ``textcat``.
    """
    supported_models = ("en_trf_xlnetbasecased_lg", "en_trf_bertbaseuncased_lg")
    if model_choice not in supported_models:
        raise ValueError(
            "Choose a supported transformer model: {}".format(", ".join(supported_models))
        )

    nlp = spacy.load(model_choice)
    print(nlp.pipe_names)
    print(f"Loaded model '{model_choice}'")

    # Both supported models use the same classifier configuration.
    textcat = nlp.create_pipe(
        "trf_textcat", config={"architecture": "softmax_class_vector"}
    )

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    print("Labels:", textcat.labels)
    nlp.add_pipe(textcat, last=True)

    # The split sizes were reported from names that were never defined; obtain
    # them from load_data(), the same way train_model() does.
    (train_texts, _train_cats), (eval_texts, _eval_cats) = load_data()
    print(f"Using {len(train_texts)} training docs, {len(eval_texts)} evaluation")
    return nlp

In [91]:
def train_model(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    """Train a ``textcat`` component on the splits returned by ``load_data()``.

    Parameters
    ----------
    model : str or None
        Passed to ``spacy.load``; when None a blank ``"en"`` pipeline is created.
    output_dir : str, pathlib.Path or None
        Directory the trained pipeline is written to (created if missing,
        including parents). Nothing is saved when None.
    n_iter : int
        Number of passes over the training examples.
    n_texts : int
        Upper bound on the number of training examples used.
    init_tok2vec : str, pathlib.Path or None
        Optional file whose bytes are loaded into ``textcat.model.tok2vec``
        before training.

    Prints a LOSS / P / R / F row per iteration; returns None.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True supports nested targets; exist_ok=True makes re-runs safe.
        output_dir.mkdir(parents=True, exist_ok=True)

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the training dataset
    print("Loading training data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            # Path(...) lets callers pass a plain string as well as a Path.
            with Path(init_tok2vec).open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        # One generator shared across iterations, so batch size keeps growing.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        print('Training Completed!  Model Saved!')
    else:
        # Do not claim a save that did not happen.
        print('Training Completed!')


In [92]:
# Train with the defaults (no base model, so a blank 'en' pipeline) and save to ../model.
train_model(output_dir='../model')

Created blank 'en' model
Loading training data...
Using 2000 examples (2000 training, 762 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
9.810	0.806	0.587	0.679
1.850	0.724	0.681	0.702
0.446	0.727	0.675	0.700
0.150	0.704	0.651	0.676
0.048	0.701	0.651	0.675
0.018	0.700	0.639	0.668
0.011	0.693	0.645	0.668
0.005	0.700	0.648	0.673
0.003	0.696	0.654	0.674
0.002	0.701	0.651	0.675
0.002	0.695	0.639	0.666
0.001	0.689	0.633	0.659
0.001	0.687	0.636	0.660
0.001	0.694	0.642	0.667
0.000	0.693	0.639	0.665
0.000	0.688	0.639	0.662
0.000	0.695	0.645	0.669
0.001	0.695	0.645	0.669
0.001	0.693	0.645	0.668
0.001	0.699	0.645	0.671
Saved model to ../model
Training Completed!  Model Saved!


In [93]:
# Load the pipeline stored under ../model/ as `nlp2`, which score() reads.
model_dir = '../model/'
print("Loading from", model_dir)
nlp2 = spacy.load(model_dir)

Loading from ../model/


In [94]:
# Display the training frame again.
train

Unnamed: 0,id,keyword,location,text,target
0,1,unknown,unknown,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,unknown,unknown,Forest fire near La Ronge Sask. Canada,1.0
2,5,unknown,unknown,All residents asked to 'shelter in place' are ...,1.0
3,6,unknown,unknown,"13,000 people receive #wildfires evacuation or...",1.0
4,7,unknown,unknown,Just got sent this photo from Ruby #Alaska as ...,1.0
...,...,...,...,...,...
7608,10869,unknown,unknown,Two giant cranes holding a bridge collapse int...,1.0
7609,10870,unknown,unknown,@aria_ahrary @TheTawniest The out of control w...,1.0
7610,10871,unknown,unknown,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1.0
7611,10872,unknown,unknown,Police investigating after an e-bike collided ...,1.0


In [110]:
def score(text):
    """Classify ``text`` with the module-level ``nlp2`` pipeline.

    Returns 1 when the "POSITIVE" category score is strictly greater than the
    "NEGATIVE" one, otherwise 0 (ties included).
    """
    cats = nlp2(text).cats
    return 1 if cats['POSITIVE'] > cats['NEGATIVE'] else 0

In [111]:
# Label every test text with score(); this runs the nlp2 pipeline once per row.
test['score'] = test['text'].apply(score)

In [112]:
# Display the test frame, now carrying the `score` column.
test

Unnamed: 0,id,keyword,location,text,score
0,0,unknown,unknown,Just happened a terrible car crash,0
1,2,unknown,unknown,"Heard about #earthquake is different cities, s...",1
2,3,unknown,unknown,"there is a forest fire at spot pond, geese are...",0
3,9,unknown,unknown,Apocalypse lighting. #Spokane #wildfires,1
4,11,unknown,unknown,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...,...,...
3258,10861,unknown,unknown,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0
3259,10865,unknown,unknown,Storm in RI worse than last hurricane. My city...,0
3260,10868,unknown,unknown,Green Line derailment in Chicago,1
3261,10874,unknown,unknown,MEG issues Hazardous Weather Outlook (HWO),1


In [113]:
# Series assignment aligns on index labels, not position.
# NOTE(review): assumes `test` and `sample_submission` share the same index and row order - confirm.
sample_submission["target"] = test.score

In [114]:
# Preview the first rows of the submission.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,0
3,9,1
4,11,1


In [115]:
# Count how many rows were predicted for each target value.
sample_submission.groupby('target').count()

Unnamed: 0_level_0,id
target,Unnamed: 1_level_1
0,1967
1,1296


In [116]:
# Check column dtypes and non-null counts before writing the file.
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
id        3263 non-null int64
target    3263 non-null int64
dtypes: int64(2)
memory usage: 51.1 KB


In [117]:
# Write the submission without the index column; to_csv does not create a missing ../output directory.
sample_submission.to_csv("../output/submission.csv", index=False)