## NLP Tutorial

NLP - or *Natural Language Processing* - is shorthand for a wide array of techniques designed to help machines learn from text. Natural Language Processing powers everything from chatbots to search engines, and is used in diverse tasks like sentiment analysis and machine translation.

In this tutorial we'll look at this competition's dataset, use a simple technique to process it, build a machine learning model, and submit predictions for a score!

In [1]:
import os
import numpy as np
import pandas as pd
import nltk
import string
import re
from collections import Counter
from sklearn import feature_extraction, linear_model, model_selection, preprocessing, decomposition
import xgboost as xgb 
!pip install contractions
import IPython
import contractions


Collecting contractions
  Downloading https://files.pythonhosted.org/packages/e1/49/fdb0d85fdcb22cdb9993986a350f7ea6e80e495dd6b874cf76b942ddbb68/contractions-0.0.58-py2.py3-none-any.whl
Collecting textsearch>=0.0.21
  Downloading https://files.pythonhosted.org/packages/d3/fe/021d7d76961b5ceb9f8d022c4138461d83beff36c3938dc424586085e559/textsearch-0.0.21-py2.py3-none-any.whl
Collecting anyascii
[?25l  Downloading https://files.pythonhosted.org/packages/6d/7b/19437c9a5bd16e1bb3a5bf43f7655e341882befceae0122e43c8e2c21e1e/anyascii-0.3.0-py3-none-any.whl (284kB)
[K     |████████████████████████████████| 286kB 808kB/s 
[?25hInstalling collected packages: anyascii, textsearch, contractions
Successfully installed anyascii-0.3.0 contractions-0.0.58 textsearch-0.0.21


In [2]:
# Load the labelled training tweets and the unlabelled test tweets from the
# competition's input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
    )
)

In [3]:
# Peek at the distinct values of the two metadata columns.
for column in ("keyword", "location"):
    print(train_df[column].unique())

[nan 'ablaze' 'accident' 'aftershock' 'airplane%20accident' 'ambulance'
 'annihilated' 'annihilation' 'apocalypse' 'armageddon' 'army' 'arson'
 'arsonist' 'attack' 'attacked' 'avalanche' 'battle' 'bioterror'
 'bioterrorism' 'blaze' 'blazing' 'bleeding' 'blew%20up' 'blight'
 'blizzard' 'blood' 'bloody' 'blown%20up' 'body%20bag' 'body%20bagging'
 'body%20bags' 'bomb' 'bombed' 'bombing' 'bridge%20collapse'
 'buildings%20burning' 'buildings%20on%20fire' 'burned' 'burning'
 'burning%20buildings' 'bush%20fires' 'casualties' 'casualty'
 'catastrophe' 'catastrophic' 'chemical%20emergency' 'cliff%20fall'
 'collapse' 'collapsed' 'collide' 'collided' 'collision' 'crash' 'crashed'
 'crush' 'crushed' 'curfew' 'cyclone' 'damage' 'danger' 'dead' 'death'
 'deaths' 'debris' 'deluge' 'deluged' 'demolish' 'demolished' 'demolition'
 'derail' 'derailed' 'derailment' 'desolate' 'desolation' 'destroy'
 'destroyed' 'destruction' 'detonate' 'detonation' 'devastated'
 'devastation' 'disaster' 'displaced' 'droug

In [4]:
# Shared text-processing helpers used by clean_text().

# Stop list: NLTK's English stopwords followed by every character in string.punctuation.
stop = [*nltk.corpus.stopwords.words("english"), *string.punctuation]

# Tweet tokenizer configured with preserve_case=False, strip_handles=True, reduce_len=True.
twt = nltk.tokenize.TweetTokenizer(reduce_len=True, strip_handles=True, preserve_case=False)

# The two normalisers clean_text() can choose between via its `normalize` argument.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()

def clean_text(df, col='text', normalize='lemmatize', stopwords=True, add_keyword=False, fill_empty='NULL', shuffle=False, random_state=None):
    """Normalise the tweets in ``df[col]`` and store the result in ``df["clean_text"]``.

    Per tweet: lower-case, drop whitespace-separated words containing "http",
    expand contractions, strip digits and '#', tokenize with the module-level
    ``twt`` tokenizer, remove stopwords / noise tokens, then lemmatize or stem.

    Note: ``df`` is modified in place (``clean_text`` is added; with
    ``add_keyword`` the ``keyword`` column is rewritten and ``text_w_key`` is added).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. A ``target`` column is optional.
    col : str
        Column to clean. Ignored when ``add_keyword`` is True (``text_w_key`` is used).
    normalize : str
        ``'lemmatize'`` or ``'stem'``; any other value skips normalisation.
    stopwords : bool
        When True, tokens found in the module-level ``stop`` list are removed.
    add_keyword : bool
        When True, append the keyword (with ``%20`` replaced by a space) to ``df.text``
        and clean the combined string.
    fill_empty : object
        Value written to rows whose cleaned text is empty. Pass ``False`` to leave
        empty strings untouched.
    shuffle : bool
        When True, the returned frame is a row-shuffled copy (``df.sample(frac=1)``).
    random_state : int, RandomState or None
        Forwarded to ``DataFrame.sample`` so the shuffle can be made reproducible.

    Returns
    -------
    (pos, neg, df) : tuple
        ``pos`` / ``neg`` are lists of token lists for rows with target 1 / 0;
        ``df`` is the (optionally shuffled) frame with the new column(s).
    """
    # Tokens that survive tokenisation but are dropped unconditionally (retweet
    # markers, mis-encoded characters, ellipses). Built once, not once per tweet.
    noise_tokens = frozenset(["rt", "û_", "amp", "ûª", "ûªs", "ûò", "ûï", "ûó", "åè", "ìñ1", "\x89", "...", ".."] + ["via"])
    # Set membership is O(1); the module-level `stop` is a plain list.
    stop_tokens = frozenset(stop) if stopwords else frozenset()

    cleaned_text, pos, neg = [], [], []

    # Frames without labels (e.g. the test set) get a -1 placeholder so that
    # neither the pos nor the neg branch below fires.
    if "target" in df.columns:
        targets = df["target"]
    else:
        targets = -np.ones(len(df))

    if add_keyword:
        df["keyword"] = df["keyword"].str.replace("%20", " ", regex=False).fillna("")
        df['text_w_key'] = df["text"] + " " + df["keyword"]
        col = 'text_w_key'

    for target, raw in zip(targets, df[col]):
        # Missing text arrives as NaN (a float); treat it as an empty tweet
        # instead of crashing on .lower().
        if not isinstance(raw, str):
            raw = ""

        # Split on single spaces so URL-bearing words can be dropped and
        # contractions expanded word by word.
        words = [word for word in raw.lower().split(" ") if "http" not in word]
        # Lower-case again after joining, as the original pipeline did.
        text = " ".join(contractions.fix(word) for word in words).lower()
        text = re.sub(r'\d+|#', '', text)

        tokens = [
            token for token in twt.tokenize(text)
            if token not in stop_tokens and token not in noise_tokens
        ]

        if normalize == 'lemmatize':
            tokens = [lemmatizer.lemmatize(token) for token in tokens]
        elif normalize == 'stem':
            tokens = [stemmer.stem(token) for token in tokens]

        if target == 1:
            pos.append(tokens)
        elif target == 0:
            neg.append(tokens)
        cleaned_text.append(" ".join(tokens))

    df["clean_text"] = cleaned_text
    # `False` is the sentinel that disables filling; any other value is written
    # into rows whose cleaned text ended up empty.
    if fill_empty != False:
        df.loc[df["clean_text"].str.len() == 0, 'clean_text'] = fill_empty
    if shuffle:
        df = df.sample(frac=1, random_state=random_state)

    return pos, neg, df
        
# Seed NumPy's global RNG so the shuffle of the training rows below (and anything
# downstream that draws from the global RNG) is the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Clean both splits; only the training rows are shuffled.
pos_text, neg_text, train_df = clean_text(train_df, add_keyword=True, shuffle=True)
_, _, test_df = clean_text(test_df, add_keyword=True)

# Flatten the per-tweet token lists into one flat token list per class.
pos_text = [token for tweet_tokens in pos_text for token in tweet_tokens]
neg_text = [token for tweet_tokens in neg_text for token in tweet_tokens]

In [5]:
# 60 most frequent tokens per class: disaster tweets in the left column pair,
# non-disaster tweets in the right column pair.
pos_common, neg_common = (
    pd.DataFrame(Counter(class_tokens).most_common(60))
    for class_tokens in (pos_text, neg_text)
)
pd.concat([pos_common, neg_common], axis=1)

Unnamed: 0,0,1,0.1,1.1
0,fire,414,like,255
1,suicide,204,body,216
2,disaster,185,get,185
3,building,167,new,170
4,storm,158,emergency,163
5,emergency,143,bag,155
6,bomb,137,fire,151
7,news,135,would,144
8,bombing,134,one,133
9,police,128,want,111


In [6]:
# Rows whose cleaned text was empty and therefore replaced by the "NULL" placeholder.
for frame in (train_df, test_df):
    display(frame.loc[frame.clean_text == "NULL", :])

Unnamed: 0,id,keyword,location,text,target,text_w_key,clean_text


Unnamed: 0,id,keyword,location,text,text_w_key,clean_text
13,43,,,What if?!,What if?!,


In [7]:
# Compare raw and cleaned text side by side for the first 30 rows of each split.
display(train_df.head(30), test_df.head(30))

Unnamed: 0,id,keyword,location,text,target,text_w_key,clean_text
3479,4974,explosion,,Exploring New Worlds: Three Moments of an Expl...,1,Exploring New Worlds: Three Moments of an Expl...,exploring new world three moment explosion chi...
1984,2855,damage,Cheshire. London. #allover,Unions say they are supportive of 'London' yet...,0,Unions say they are supportive of 'London' yet...,union say supportive london yet prepared damag...
3001,4312,dust storm,Lizzy's Knee,I keep sneezing either someone placed a southe...,0,I keep sneezing either someone placed a southe...,keep sneezing either someone placed southern d...
3802,5403,fire truck,,#Philippines Former Township fire truck being ...,1,#Philippines Former Township fire truck being ...,philippine former township fire truck used phi...
3046,4373,earthquake,"Hawaii, USA",USGS reports a M1.94 #earthquake 5km S of Volc...,1,USGS reports a M1.94 #earthquake 5km S of Volc...,usgs report earthquake km volcano hawaii :: ut...
3730,5301,fear,"Bremerton, WA",The Walking Dead spin off Fear the Walking Dea...,1,The Walking Dead spin off Fear the Walking Dea...,walking dead spin fear walking dead rd fear
6903,9896,traumatised,,@VickyBrush LOL! I was a traumatised child. On...,0,@VickyBrush LOL! I was a traumatised child. On...,lol traumatised child wednesday release topic ...
4435,6311,hostage,"Cape Neddick, ME",@EvaHanderek @MarleyKnysh great times until th...,1,@EvaHanderek @MarleyKnysh great times until th...,great time bus driver held u hostage mall park...
5441,7761,police,,Police expand search for missing pregnant woma...,0,Police expand search for missing pregnant woma...,police expand search missing pregnant woman be...
1731,2494,collided,,It's Even Worse Than It Looks: How the America...,0,It's Even Worse Than It Looks: How the America...,even worse look american constitutional system...


Unnamed: 0,id,keyword,location,text,text_w_key,clean_text
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...","Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...
2,3,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan
5,12,,,We're shaking...It's an earthquake,We're shaking...It's an earthquake,shaking earthquake
6,21,,,They'd probably still show more life than Arse...,They'd probably still show more life than Arse...,would probably still show life arsenal yesterd...
7,22,,,Hey! How are you?,Hey! How are you?,hey
8,27,,,What a nice hat?,What a nice hat?,nice hat
9,29,,,Fuck off!,Fuck off!,fuck


### Building vectors

The theory behind the model we'll build in this notebook is pretty simple: the words contained in each tweet are a good indicator of whether it is about a real disaster or not (this is not entirely correct, but it's a great place to start).

We'll use scikit-learn's `CountVectorizer` to count the words in each tweet and turn them into data our machine learning model can process.

Note: a `vector` is, in this context, a set of numbers that a machine learning model can work with. We'll look at one in just a second.

In [8]:
feature_col = "clean_text"

# One estimator per representation we want to compare.
count_vectorizer = feature_extraction.text.CountVectorizer()
count_vectorizer_sw = feature_extraction.text.CountVectorizer()
tfidf = feature_extraction.text.TfidfVectorizer()
LSA = decomposition.TruncatedSVD(n_components=100)

## let's build example vectors from the first 50 tweets in the data:
## raw text for the plain counts, cleaned text for everything else
raw_sample = train_df["text"].iloc[:50]
clean_sample = train_df[feature_col].iloc[:50]

example_train_vectors = count_vectorizer.fit_transform(raw_sample)
example_train_vectors_sw = count_vectorizer_sw.fit_transform(clean_sample)
example_tfidf = tfidf.fit_transform(clean_sample)
example_tfidf_lsa = LSA.fit_transform(example_tfidf)

In [9]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
for label, sparse_row in (
    ('No cleaning', example_train_vectors[0]),
    ('Cleaned', example_train_vectors_sw[0]),
    ('TF-IDF cleaned', example_tfidf[0]),
):
    dense_row = sparse_row.todense()
    print(label)
    print(dense_row.shape)
    print(dense_row)

## the LSA output is printed directly, without .todense()
lsa_row = example_tfidf_lsa[0]
print('TF-IDF + LSA cleaned')
print(lsa_row.shape)
print(lsa_row)

No cleaning
(1, 548)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0

In [10]:
train_clean = train_df[feature_col]
test_clean = test_df[feature_col]

## For every vectorizer: .fit_transform() on the training tweets, then plain
# .transform() on the test tweets. Using just .transform() makes sure that the
# tokens in the train vectors are the only ones mapped to the test vectors -
# i.e. that the train and test vectors use the same set of tokens.
train_vectors = count_vectorizer.fit_transform(train_clean)
test_vectors = count_vectorizer.transform(test_clean)

train_vectors_sw = count_vectorizer_sw.fit_transform(train_clean)
test_vectors_sw = count_vectorizer_sw.transform(test_clean)

train_tfidf = tfidf.fit_transform(train_clean)
test_tfidf = tfidf.transform(test_clean)

In [11]:
# Fit the LSA projection (the TruncatedSVD instance) on the training TF-IDF matrix,
# then apply that same fitted projection to the test TF-IDF matrix.
train_tfidf_lsa = LSA.fit_transform(train_tfidf)
test_tfidf_lsa = LSA.transform(test_tfidf)

### Our model

As we mentioned above, we think the words contained in each tweet are a good indicator of whether it is about a real disaster or not. The presence of a particular word (or set of words) in a tweet might link directly to whether or not that tweet is about a real disaster.

What we're assuming here is a _linear_ connection. So let's build a linear model and see!

Let's test our model and see how well it does on the training data. For this we'll use `cross-validation` - where we train on a portion of the known data, then validate it with the rest. If we do this several times (with different portions) we can get a good idea for how a particular model or method performs.

The metric for this competition is F1, so let's use that here.

# **Ridge Classifier**

In [12]:
# Baseline classifier with balanced class weights.
clf = linear_model.RidgeClassifier(class_weight='balanced')

# Candidate values sampled by the randomized search.
ridge_params = dict(
    alpha=np.linspace(0, 2, 100),
    tol=np.linspace(1e-5, 1e-1, 2000),
)

# Two identically configured searches: one for the TF-IDF features, one for TF-IDF + LSA.
# All three metrics are tracked; the best-F1 candidate is refit.
ridge_rscv, ridge_rscv_lsa = (
    model_selection.RandomizedSearchCV(
        clf,
        ridge_params,
        scoring=["f1", "precision", "recall"],
        refit="f1",
        cv=5,
        verbose=2,
    )
    for _ in range(2)
)

In [13]:
labels = train_df["target"]
search = ridge_rscv.fit(train_tfidf, labels)
search_lsa = ridge_rscv_lsa.fit(train_tfidf_lsa, labels)

# Wipe the verbose per-fit log before printing the summary.
IPython.display.clear_output()
for title, fitted_search in (
    ("Best RidgeClassifier TF-IDF", search),
    ("Best RidgeClassifier TF-IDF LSA", search_lsa),
):
    print(title, fitted_search.best_score_, fitted_search.best_params_, sep="\n")

Best RidgeClassifier TF-IDF
0.7569165622434358
{'tol': 0.011614642321160582, 'alpha': 1.5555555555555556}
Best RidgeClassifier TF-IDF LSA
0.7040480628211567
{'tol': 0.07303921460730366, 'alpha': 1.191919191919192}


In [14]:
# 5-fold cross-validation of the untuned classifier on both feature sets.
cv_options = dict(cv=5, scoring=["f1", "precision", "recall"])
scores_tfidf = model_selection.cross_validate(clf, train_tfidf, train_df["target"], **cv_options)
scores_tfidf_lsa = model_selection.cross_validate(clf, train_tfidf_lsa, train_df["target"], **cv_options)

# Per-fold scores, one line per (metric, feature set).
for label, fold_scores in (
    ("RidgeClassifier TF-IDF F1:              ", scores_tfidf['test_f1']),
    ('RidgeClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1']),
    ("RidgeClassifier TF-IDF Precision:       ", scores_tfidf['test_precision']),
    ('RidgeClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision']),
    ('RidgeClassifier TF-IDF Recall:          ', scores_tfidf['test_recall']),
    ('RidgeClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall']),
):
    print(label, fold_scores)

RidgeClassifier TF-IDF F1:               [0.75057915 0.75098193 0.7595928  0.74546887 0.75752773]
RidgeClassifier TF-IDF & LSA F1:         [0.72012336 0.69817073 0.70649739 0.69387755 0.69585987]
RidgeClassifier TF-IDF Precision:        [0.759375   0.77221325 0.77849117 0.76910569 0.78618421]
RidgeClassifier TF-IDF & LSA Precision:  [0.72741433 0.69604863 0.69051095 0.71290323 0.72591362]
RidgeClassifier TF-IDF Recall:           [0.74198473 0.73088685 0.74159021 0.72324159 0.73088685]
RidgeClassifier TF-IDF & LSA Recall:     [0.7129771  0.70030581 0.72324159 0.67584098 0.66819572]


In [15]:
# Same 5-fold cross-validation, now with the best estimator found by each search.
cv_options = dict(cv=5, scoring=["f1", "precision", "recall"])
scores_tfidf = model_selection.cross_validate(ridge_rscv.best_estimator_, train_tfidf, train_df["target"], **cv_options)
scores_tfidf_lsa = model_selection.cross_validate(ridge_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], **cv_options)

# Per-fold scores, one line per (metric, feature set).
for label, fold_scores in (
    ("Best RidgeClassifier TF-IDF F1:              ", scores_tfidf['test_f1']),
    ('Best RidgeClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1']),
    ("Best RidgeClassifier TF-IDF Precision:       ", scores_tfidf['test_precision']),
    ('Best RidgeClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision']),
    ('Best RidgeClassifier TF-IDF Recall:          ', scores_tfidf['test_recall']),
    ('Best RidgeClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall']),
):
    print(label, fold_scores)

Best RidgeClassifier TF-IDF F1:               [0.75729647 0.75253708 0.76502732 0.74822415 0.76031746]
Best RidgeClassifier TF-IDF & LSA F1:         [0.72012336 0.69923664 0.70597015 0.69592476 0.69896743]
Best RidgeClassifier TF-IDF Precision:        [0.76197836 0.76874003 0.7814992  0.77324633 0.79042904]
Best RidgeClassifier TF-IDF & LSA Precision:  [0.72741433 0.69817073 0.68950437 0.71382637 0.72727273]
Best RidgeClassifier TF-IDF Recall:           [0.75267176 0.73700306 0.74923547 0.72477064 0.7324159 ]
Best RidgeClassifier TF-IDF & LSA Recall:     [0.7129771  0.70030581 0.72324159 0.67889908 0.67278287]


# **XGB Classifier**

In [16]:
# xgb_clf = xgb.XGBClassifier(random_state=765, tree_method='gpu_hist', predictor='gpu_predictor')
# xgb_params = {
#     "max_depth": [i for i in range(4, 14)],
#     "min_child_weight": np.linspace(0.25, 0.45, 100),
#     "gamma": np.linspace(0, 0.015, 1000),
#     "learning_rate": np.linspace(0.2, 0.5, 100),
# }
# xgb_rscv = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)
# xgb_rscv_lsa = model_selection.RandomizedSearchCV(xgb_clf, xgb_params, scoring=["f1", "precision", "recall"], refit="f1", cv=5, verbose=2)

In [17]:
# search = xgb_rscv.fit(train_tfidf, train_df["target"])
# search_lsa = xgb_rscv_lsa.fit(train_tfidf_lsa, train_df["target"])
# IPython.display.clear_output()
# print("Best XGBClassifier TF-IDF")
# print(search.best_score_)
# print(search.best_params_)
# print("Best XGBClassifier TF-IDF LSA")
# print(search_lsa.best_score_)
# print(search_lsa.best_params_)

In [18]:
# scores_tfidf = model_selection.cross_validate(xgb_clf, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
# scores_tfidf_lsa = model_selection.cross_validate(xgb_clf, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("XGBClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('XGBClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("XGBClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('XGBClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('XGBClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('XGBClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

In [19]:
# scores_tfidf = model_selection.cross_validate(xgb_rscv.best_estimator_, train_tfidf, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])
# scores_tfidf_lsa = model_selection.cross_validate(xgb_rscv_lsa.best_estimator_, train_tfidf_lsa, train_df["target"], cv=5, scoring=["f1", "precision", "recall"])

# print("Best XGBClassifier TF-IDF F1:              ", scores_tfidf['test_f1'])
# print('Best XGBClassifier TF-IDF & LSA F1:        ', scores_tfidf_lsa['test_f1'])
# print("Best XGBClassifier TF-IDF Precision:       ", scores_tfidf['test_precision'])
# print('Best XGBClassifier TF-IDF & LSA Precision: ', scores_tfidf_lsa['test_precision'])
# print('Best XGBClassifier TF-IDF Recall:          ',  scores_tfidf['test_recall'])
# print('Best XGBClassifier TF-IDF & LSA Recall:    ', scores_tfidf_lsa['test_recall'])

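The above scores aren't terrible! The cross-validated F1 is roughly 0.75 with TF-IDF features and roughly 0.70 with TF-IDF + LSA, which gives a rough idea of what to expect on the leaderboard. There are lots of ways to potentially improve on this (better-tuned TF-IDF or LSA, gradient-boosted trees, LSTM / RNNs, the list is long!) - give any of them a shot!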

In the meantime, let's make predictions on the training set (to inspect the model's mistakes) and on the test set, and build a submission for the competition.

In [20]:
# clf.fit(train_tfidf_lsa, train_df["target"])

In [21]:
# xgb_clf.fit(train_tfidf_lsa, train_df["target"])

In [22]:
# Load the competition's sample submission; its "target" column is overwritten
# with the model's test-set predictions further down.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [23]:
# Predict with the best TF-IDF ridge model from the randomized search.
# (Alternatives tried earlier: `clf` or `xgb_clf` on the LSA features.)
best_ridge = ridge_rscv.best_estimator_

# Training-set predictions are in-sample; they are only used for error inspection.
train_prediction = best_ridge.predict(train_tfidf)
sample_submission["target"] = best_ridge.predict(test_tfidf)

In [24]:
# Keep the training-set predictions next to the true labels so that
# misclassified tweets can be selected by comparing the two columns.
train_df['pred_target'] = train_prediction

In [25]:
# Word count of the cleaned text, approximated as (number of spaces + 1).
clean_text_wc = train_df.clean_text.str.count(' ').add(1)

is_short = clean_text_wc < 5
is_misclassified = train_df.target != train_df.pred_target
short_text_incorrect = train_df.loc[is_short & is_misclassified, :]

# (misclassified short disaster tweets, misclassified short non-disaster tweets)
tuple((short_text_incorrect.target == label).sum() for label in (1, 0))

(36, 38)

In [26]:
# Show each test tweet next to its predicted target (joined on the tweet id).
sample_submission.merge(test_df, on=['id']).head(60)

Unnamed: 0,id,target,keyword,location,text,text_w_key,clean_text
0,0,1,,,Just happened a terrible car crash,Just happened a terrible car crash,happened terrible car crash
1,2,1,,,"Heard about #earthquake is different cities, s...","Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...
2,3,1,,,"there is a forest fire at spot pond, geese are...","there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...
3,9,1,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,11,1,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan
5,12,1,,,We're shaking...It's an earthquake,We're shaking...It's an earthquake,shaking earthquake
6,21,0,,,They'd probably still show more life than Arse...,They'd probably still show more life than Arse...,would probably still show life arsenal yesterd...
7,22,0,,,Hey! How are you?,Hey! How are you?,hey
8,27,0,,,What a nice hat?,What a nice hat?,nice hat
9,29,0,,,Fuck off!,Fuck off!,fuck


In [27]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample_submission.to_csv("submission.csv", index=False)