In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [3]:
import spacy
import nltk

In [4]:
# Read the labelled training tweets and the unlabelled test tweets.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [5]:
# Preview the first five labelled rows.
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# Preview the first five unlabelled rows.
test.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Problem
Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies are interested in programmatically monitoring Twitter (e.g. disaster relief organizations and news agencies).

But, it’s not always clear whether a person’s words are actually announcing a disaster. Take this example:

The author explicitly uses the word “ABLAZE” but means it metaphorically. This is clear to a human right away, especially with the visual aid. But it’s less clear to a machine.

## In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which ones aren’t.

In [7]:
# Class balance: number of tweets per target label.
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
# Let's see some example texts for tweets that are about real disasters.
train.loc[train["target"] == 1, "text"].values[:5]

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       '13,000 people receive #wildfires evacuation orders in California ',
       'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '],
      dtype=object)

In [9]:
# Example texts for tweets that are not about real disasters.
train.loc[train["target"] == 0, "text"].values[:5]

array(["What's up man?", 'I love fruits', 'Summer is lovely',
       'My car is so fast', 'What a goooooooaaaaaal!!!!!!'], dtype=object)

In [10]:
# Let's try making word embeddings for the words in the tweets, so that similar words map to vectors that are mathematically close together

Word embeddings (also called word vectors) represent each word numerically in such a way that the vector corresponds to how that word is used or what it means. Vector encodings are learned by considering the context in which the words appear. Words that appear in similar contexts will have similar vectors. For example, vectors for "leopard", "lion", and "tiger" will be close together, while they'll be far away from "planet" and "castle".

In [11]:
# Load spaCy's small English pipeline; it is used below to turn text into vectors.
# NOTE(review): the vectors produced below are 96-wide. The small model
# presumably ships without static word vectors, in which case `.vector` is a
# context-sensitive tensor rather than a pretrained word embedding. Confirm
# against the spaCy model documentation before relying on embedding semantics.
import spacy
nlp = spacy.load("en_core_web_sm")



In [12]:
# Demo sentence: collect one vector per token into a 2-D array.
text = "These vectors can be used as features for machine learning models."
with nlp.disable_pipes():
    vectors = np.array([tok.vector for tok in nlp(text)])

In [13]:
# (number of tokens, vector width)
np.shape(vectors)

(12, 96)

Machine learning on text requires that you first represent the text numerically. So far, you've done this with bag-of-words representations. But you can usually do better with word-embedding vectorization, where similar words get similar vectors. There are many ways to combine all the word vectors into a single document vector we can use for model training. A simple and surprisingly effective approach is simply averaging the vectors for each word in the document. Then, you can use these document vectors for modeling.

spaCy calculates the average document vector, which you can get with `doc.vector`. Here is an example that converts the tweet texts in the training data to document vectors.

In [14]:
# One document vector per training tweet, stacked row-wise.
with nlp.disable_pipes():
    doc_vectors = np.array([nlp(tweet).vector for tweet in train["text"]])

In [15]:
# Display the matrix of document vectors (one row per training tweet).
doc_vectors

array([[-1.5600548 , -0.22666351, -1.8038342 , ..., -0.9291725 ,
         0.07779168,  0.49509495],
       [-1.090805  ,  0.92656183, -1.2712501 , ..., -1.8536562 ,
        -0.09444264,  0.17522642],
       [-1.2229573 , -0.5687632 , -2.1264563 , ..., -0.64594275,
         0.6485378 , -0.19080436],
       ...,
       [-1.4578587 ,  0.6063586 , -1.211155  , ..., -1.9665267 ,
         0.13041063,  0.7344017 ],
       [-1.095413  ,  0.44249234, -2.1407855 , ..., -0.5645163 ,
        -0.9296475 , -0.6333501 ],
       [-0.31156594,  1.9149263 , -1.747036  , ..., -0.261499  ,
        -1.082955  ,  0.88620836]], dtype=float32)

In [16]:
# (number of training tweets, vector width)
np.shape(doc_vectors)

(7613, 96)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled tweets for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    doc_vectors,
    train["target"],
    test_size=0.1,
    random_state=1,
)

In [18]:
from sklearn.svm import LinearSVC

# Linear SVM baseline; the large iteration cap gives the solver room to converge.
svc = LinearSVC(max_iter=1000000, random_state=1)
svc.fit(X_train, y_train)
# Accuracy on the held-out split.
svc.score(X_test, y_test)

0.699475065616798

In [19]:
# Report the linear SVM's held-out accuracy as a percentage with three decimals.
print(f"Accuracy :{svc.score(X_test, y_test)*100:.3f}%")

Accuracy :69.948%


In [20]:
from sklearn import svm

# Same data, but with an RBF-kernel SVM left at its default hyperparameters.
rbf_k = svm.SVC(kernel="rbf")
rbf_k.fit(X_train, y_train)
# Accuracy on the held-out split.
rbf_k.score(X_test, y_test)



0.7309711286089239

The model gave better accuracy when we used the RBF kernel in the SVM instead of a linear kernel: accuracy improved from 69.9% to 73.1%.

In [21]:
# Hyperparameter tuning for the RBF SVM: search over gamma and C.
from sklearn.model_selection import GridSearchCV

tuned_parameters = {
    'gamma': [0.001, 0.01, 0.1, 1],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}
# refit=False: only the 3-fold cross-validation results are kept, no final
# estimator is fitted by the search itself.
clf = GridSearchCV(
    svm.SVC(kernel='rbf'),
    tuned_parameters,
    cv=3,
    refit=False,
    return_train_score=True,
)
clf.fit(X_train, y_train)
print(clf.best_params_)

{'C': 1, 'gamma': 0.01}


In [22]:
# Refit an RBF SVM on the training split with the parameters the grid search
# selected. The search ran with refit=False, so it holds no fitted estimator;
# reading clf.best_params_ (instead of retyping C and gamma by hand) keeps this
# cell in step with the search if the grid or the data changes.
rbf_clf = svm.SVC(kernel='rbf', **clf.best_params_)
rbf_clf.fit(X_train, y_train)
# Accuracy on the held-out split.
rbf_clf.score(X_test, y_test)

0.7309711286089239

In [23]:
# Predicted labels of the refitted RBF SVM on the held-out split.
grid_predictions = rbf_clf.predict(X_test)

In [24]:
# One document vector per test tweet.
# NOTE: this rebinds `doc_vectors`, which previously held the training-set vectors.
with nlp.disable_pipes():
    doc_vectors = np.array([nlp(tweet).vector for tweet in test["text"]])

In [None]:
# Predict on the test-set document vectors (`doc_vectors` was rebuilt from
# test.text in the preceding cell). The classifier was fitted on numeric
# vectors, so it cannot be given the raw `test` DataFrame, which still holds
# the id/keyword/location/text columns.
test_predictions = rbf_clf.predict(doc_vectors)

In [78]:
# Per-class precision, recall and F1 for the RBF SVM on the held-out split.
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_true=y_test, y_pred=grid_predictions))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79       438
           1       0.75      0.55      0.63       324

   micro avg       0.73      0.73      0.73       762
   macro avg       0.74      0.71      0.71       762
weighted avg       0.73      0.73      0.72       762



In [82]:
# Let's try logistic regression with hyperparameter tuning.
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 15 log-spaced values of C between 1e-5 and 1e8,
# L2 penalty, and every solver that supports it.
c_space = np.logspace(-5, 8, 15)
param_grid = {
    'C': c_space,
    'penalty': ["l2"],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

# The sag/saga/liblinear solvers shuffle the data, so fix the seed to make the
# selected parameters and scores reproducible (same seed as the other cells).
logreg = LogisticRegression(random_state=1)

# 5-fold cross-validated search over the grid.
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Some grid points do not converge within the default iteration budget.
# Silence only those convergence warnings, and only while fitting, instead of
# disabling every warning for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    logreg_cv.fit(X_train, y_train)

# Print the tuned parameters and the best mean cross-validation score.
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameters: {'C': 0.05179474679231213, 'penalty': 'l2', 'solver': 'saga'}
Best score is 0.6985841482995183


In [83]:
# Score of the grid search's refitted best estimator on the held-out split.
logreg_cv.score(X_test, y_test)

0.7007874015748031

In [86]:
# Held-out predictions from the tuned logistic regression.
grid_predictions_logreg = logreg_cv.predict(X_test)
# Per-class precision, recall and F1 for those predictions.
print(classification_report(y_true=y_test, y_pred=grid_predictions_logreg))

              precision    recall  f1-score   support

           0       0.72      0.79      0.75       438
           1       0.67      0.59      0.62       324

   micro avg       0.70      0.70      0.70       762
   macro avg       0.69      0.69      0.69       762
weighted avg       0.70      0.70      0.70       762



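Of the three approaches — SVM, SVM with hyperparameter tuning, and logistic regression with hyperparameter tuning — the RBF SVM with hyperparameter tuning gives the best results, with an accuracy of about 73% (the untuned RBF SVM reaches the same score on this hold-out split).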

In [26]:
# Embed every test tweet as a document vector ...
with nlp.disable_pipes():
    doc_vectors = np.array([nlp(tweet).vector for tweet in test["text"]])
# ... and classify the test tweets with the refitted RBF SVM.
grid_predictions_test = rbf_clf.predict(doc_vectors)

In [28]:
# Display the predicted labels for the test tweets.
grid_predictions_test

array([0, 0, 0, ..., 1, 0, 0])

In [34]:
# Store the predicted labels next to the test tweets (adds a column to `test`).
test["target"] = grid_predictions_test

In [35]:
# First five test rows, now including the predicted target.
test.head(5)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,0
1,2,,,"Heard about #earthquake is different cities, s...",0
2,3,,,"there is a forest fire at spot pond, geese are...",0
3,9,,,Apocalypse lighting. #Spokane #wildfires,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [37]:
# Keep only the two columns needed for the submission: tweet id and predicted label.
submission = test.loc[:, ["id", "target"]]
submission.head(5)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,1


In [40]:
# Write the id/target pairs as a CSV file. The previous name ended in ".svm"
# even though the content is CSV. index=False omits the row index, so no
# index_label is needed.
submission.to_csv('NLPDisaster_submission.csv', index=False)

# Let's try the traditional approach, where we use a bag of words to convert the text into numerical values

In [98]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

# Bag-of-words counter with scikit-learn's default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

# Small worked example: learn the vocabulary of, and count tokens in,
# only the first 5 tweets of the training data.
example_train_vectors = count_vectorizer.fit_transform(train["text"].iloc[:5])

In [99]:
# The count vectors are sparse (only non-zero elements are stored to save space),
# so convert the first tweet's row with .todense() to see every position.
# First print its shape, then the counts themselves.
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


The above tells us that:

- There are 54 unique words (or "tokens") in the first five tweets.
- The first tweet contains only some of those unique tokens - all of the non-zero counts above are the tokens that DO exist in the first tweet.
- Now let's create vectors for all of our tweets.

In [100]:
# Learn the vocabulary from all training tweets and count tokens per tweet.
train_vectors = count_vectorizer.fit_transform(train["text"])

# Note that we are NOT calling .fit_transform() on the test tweets. Using only
# .transform() maps them onto the vocabulary learned from the training tweets,
# so the train and test vectors share the same set of tokens (columns).
test_vectors = count_vectorizer.transform(test["text"])

# We will use LSTM to implement this.

- The first layer is the embedded layer that uses 100 length vectors to represent each word.
- SpatialDropout1D performs variational dropout in NLP models.
- The next layer is the LSTM layer with 100 memory units.
- The output layer must create 2 output values, one for each class.
- Activation function is softmax for multi-class classification.
- Because it is a multi-class classification problem, categorical_crossentropy is used as the loss function.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def create_baseline(input_dim=None):
    """Build and compile a small fully connected binary classifier.

    The network has one hidden ReLU layer of 60 units followed by a single
    sigmoid output unit, compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Args:
        input_dim: Number of input features. When omitted, it is read from
            the notebook-level ``X_train`` so the first layer always matches
            the data it is trained on. The previous hard-coded value of 60
            did not match the 96-feature document vectors.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        # Derive the feature count from the training matrix instead of guessing.
        input_dim = X_train.shape[1]
    # create model
    model = Sequential()
    model.add(Dense(60, input_dim=input_dim, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Evaluate the baseline model on standardized features with 10-fold
# stratified cross-validation.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
# Seed the shuffle so the folds are the same on every run (same seed as the
# other splits in this notebook).
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
results = cross_val_score(pipeline, X_train, y_train, cv=kfold)
# Mean and standard deviation of the per-fold scores, as percentages.
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))