In [69]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.naive_bayes import MultinomialNB
import string
import csv
from nltk.corpus import stopwords
import re

Read in test and training data

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Let's create a function to remove all stop words and punctuation from our data and tokenize it

In [106]:
my_stopwords = set(stopwords.words('english') + ['ATUSER','URL'])
def text_process(text):
    text = text.lower() # convert text to lower-case
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text) # remove URLs
    text = re.sub('@[^\s]+', 'ATUSER', text) # remove usernames
    text = re.sub('\b(0*(?:[1-9][0-9]?|100))\b', 'smallnumber', text) # group all 'small' numbers
    text = re.sub('/^(?!100$)[1-9][0-9]{2,}$/', 'bignumber', text) # group all 'big' numbers
    text = [char for char in text if char not in string.punctuation] # remove punctuation
    text = ''.join(text)
    text = re.sub('\b(0*(?:[1-9][0-9]?|100))\b', 'smallnumber', text) # group all 'small' numbers
    text = re.sub('\b(?!(?:\d{1,2}|100)$)[0-9]\d+\b', 'bignumber', text) # group all 'big' numbers
    return [word for word in text.split() if word not in my_stopwords]

Here's what that looks like for the first 5 tweets

In [107]:
train_df['text'].head(5).apply(text_process)

0    [deeds, reason, earthquake, may, allah, forgiv...
1        [forest, fire, near, la, ronge, sask, canada]
2    [residents, asked, shelter, place, notified, o...
3    [13000, people, receive, wildfires, evacuation...
4    [got, sent, photo, ruby, alaska, smoke, wildfi...
Name: text, dtype: object

Now lets apply that to the whole DataFrame and vectorize the tweets with a count vectorizer

In [23]:
vectorizer = feature_extraction.text.CountVectorizer(analyzer=text_process)
bow_transformer_train = vectorizer.fit(train_df['text'])
text_bow_train = bow_transformer_train.transform(train_df['text'])

# bow_transformer_test = vectorizer.transform(test_df['text'])
# text_bow_test = bow_transformer_test.transform(test_df['text'])

Use TF-IDF to weight the importance of words found in tweets.

In [24]:
tfidf_transformer_train = feature_extraction.text.TfidfTransformer().fit(text_bow_train)
#tfidf_transformer_test = feature_extraction.text.TfidfTransformer().fit(text_bow_test)

Let's see what the inverse document frequency of the words 'fire' and 'photo' are

In [25]:
print(tfidf_transformer_train.idf_[bow_transformer_train.vocabulary_['fire']])
print(tfidf_transformer_train.idf_[bow_transformer_train.vocabulary_['photo']])

4.850147601710058
7.372794579480906


In [26]:
tweets_tfidf_train = tfidf_transformer_train.transform(text_bow_train)
#tweets_tfidf_test = tfidf_transformer_test.transform(text_bow_test)

We are now representing the tweets as vectors and can begin to create and train a model. We will use a multinominal naive bayes model.

In [27]:
disaster_tweet_model = MultinomialNB().fit(tweets_tfidf_train, train_df['target'])

We will use a pipeline to make predictions for the test set

In [95]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
   ( 'bow',feature_extraction.text.CountVectorizer(analyzer=text_process)),
    ('tfidf',feature_extraction.text.TfidfTransformer()),
    ('classifier',MultinomialNB(alpha=0.01)),
])

pipeline.fit(train_df["text"],train_df["target"])
test_predictions = pipeline.predict(test_df["text"])
train_predictions = pipeline.predict(train_df["text"])

Lets predict our score

In [96]:
from sklearn.metrics import classification_report
print(classification_report(train_predictions,train_df["target"]))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      4415
           1       0.96      0.99      0.98      3198

    accuracy                           0.98      7613
   macro avg       0.98      0.98      0.98      7613
weighted avg       0.98      0.98      0.98      7613



Finally, we will generate our predictions file

In [55]:
with open('submission.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["id", "target"])
    for i in range(len(test_df.index)):
        writer.writerow([test_df["id"][i], test_predictions[i]])