# NLP with Disaster Tweets

Tutorial: https://www.kaggle.com/philculliton/nlp-getting-started-tutorial  
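Get the data here: https://www.kaggle.com/c/nlp-getting-started/data?select=train.csv (download `train.csv`, `test.csv`, and `sample_submission.csv` into a local `data/` folder — the notebook reads all three from there)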

In [2]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [3]:
# Read the competition's train and test splits from the local data/ folder
# (train first, then test).
train_df, test_df = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")

In [7]:
# (rows, columns) of the training frame; the output below reports 7613 tweets and 5 columns.
train_df.shape

(7613, 5)

In [6]:
# Peek at five randomly chosen training tweets. The fixed random_state makes the
# same five rows come back on every Restart & Run All instead of a new draw each time.
train_df.sample(5, random_state=42)

Unnamed: 0,id,keyword,location,text,target
6112,8726,sinking,HOMRA.,In your eyes I see the hope\nI once knew.\nI'm...,0
4706,6690,landslide,,@RonWyden Democrats restricted Blacks from Vo...,0
6952,9974,tsunami,IG : Sincerely_TSUNAMI,It's my senior year I just wanna go all out,0
2699,3873,detonation,New York,2015 new fashion ladies gold watch waterproof ...,0
830,1207,blizzard,,the best thing at DQ is the cotton candy blizz...,0


In [28]:
# Example of a real-disaster tweet: the fourth one (position 3) among rows with target == 1.
disaster_mask = train_df["target"] == 1
train_df.loc[disaster_mask, "text"].values[3]

'13,000 people receive #wildfires evacuation orders in California '

In [27]:
# Example of a non-disaster tweet: the fourth one (position 3) among rows with target == 0.
non_disaster_mask = train_df["target"] == 0
train_df.loc[non_disaster_mask, "text"].values[3]

'My car is so fast'

## Prep the data

In [29]:
# Bag-of-words encoder with default settings: turns each text into a sparse row of token counts.
count_vectorizer = feature_extraction.text.CountVectorizer()

In [40]:
# Small demo: fit the vectorizer on just the first 5 tweets and encode them.
example_train_vectors = count_vectorizer.fit_transform(train_df["text"].iloc[:5])

# Densify the sparse row of the fifth tweet (index 4) once, then show its shape and its counts.
fifth_tweet_counts = example_train_vectors[4].todense()
print(fifth_tweet_counts.shape)  # (1, 54) below: the 5 tweets produced a 54-token vocabulary
print(fifth_tweet_counts)

(1, 54)
[[0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 1 0 0 0 0 1 0 1 1 0 1 0 1 0 0 1]]


In [43]:
# Encode both splits as token-count matrices.
train_text, test_text = train_df["text"], test_df["text"]

# The vocabulary is learned from the training tweets here (this refits the vectorizer).
train_vectors = count_vectorizer.fit_transform(train_text)

# transform() rather than fit_transform(): the test set must be encoded with the
# token set learned from train, so both matrices share the same columns.
test_vectors = count_vectorizer.transform(test_text)

## Model

In [44]:
# Linear classifier based on ridge regression, created with its default hyperparameters.

clf = linear_model.RidgeClassifier()

In [45]:
# Estimate generalisation with 3-fold cross-validation, scored by F1.
# cross_val_score fits clones of clf on each fold, so clf itself is not fitted by this cell.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)
scores  # one F1 value per fold

array([0.59421842, 0.56498283, 0.64113893])

In [46]:
# Fit the classifier on the full training matrix; fit() returns the estimator,
# which is why the cell displays its repr.
clf.fit(X=train_vectors, y=train_df["target"])

RidgeClassifier()

In [47]:
# Read the submission template that ships with the competition data; its "target"
# column is overwritten with model predictions further down.
submission_template_path = "data/sample_submission.csv"
sample_submission = pd.read_csv(submission_template_path)

In [49]:
# Predictions are written into the template by row position, so first confirm the
# template lists exactly the same ids, in the same order, as the test set. Without
# this check a mismatch would silently pair predictions with the wrong tweets.
assert np.array_equal(
    sample_submission["id"].to_numpy(), test_df["id"].to_numpy()
), "sample_submission ids do not match test_df ids row-for-row"

# Predicted label for each test tweet, in test_df row order.
sample_submission["target"] = clf.predict(test_vectors)

In [50]:
# Sanity-check the first five rows of the filled-in submission table.
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [51]:
# Write the submission table to disk for upload; index=False keeps the DataFrame
# row index out of the CSV.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)