In [1]:
# import standard libraries
import pandas as pd
import numpy as np

# import spacy for NLP and re for regular expressions
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

# import sklearn transformers, models and pipelines
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

# import distributions for randomized grid search
from scipy.stats import uniform, randint

# Load the small language model from spacy
nlp = spacy.load('en_core_web_sm')

# set pandas text output to 400
pd.options.display.max_colwidth = 400

# Bag of Words & Naive Bayes
## Load and Prepare Data
There are duplicate rows. Some have the same text and target, while others only have the same text but different target. 

For those with the same target, only one of the duplicate rows should be kept in order to only have unique observations.

For those rows with the same text and different target, it is better to drop all rows, as it would be hard to manually relabel the rows or to check for the correct label.

We can see that this way 128 rows are removed.

In [2]:
# load data
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample_submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# print shape of datasets
print('Train shape: {}'.format(train.shape))
print('Test shape: {}'.format(test.shape))
print('Sample submission shape: {}'.format(sample_submission.shape))

# inspect train set
train.head()

Train shape: (7613, 5)
Test shape: (3263, 4)
Sample submission shape: (3263, 2)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [3]:
# find duplicate rows with same text and target, keep only the first
train.drop_duplicates(subset = ['text', 'target'], inplace = True)

# some rows have the same text, but different targets
# drops all of these rows
train.drop_duplicates(subset = 'text', keep = False, inplace = True)

# print new shape of train set
print('Train shape: {}'.format(train.shape))

Train shape: (7485, 5)


## Create Machine Learning Pipeline
The first step is creating a machine learning pipeline using the `make_pipeline`function from scikit-learn. Creating a pipeline is important to have a robust workflow. For example, it ensures that all preprocessing steps that are learned on data are done within the cross-validation, to ensure that no data is leaked to the model. 

In this case, I'm using a `CountVectorizer` to turn the text into a high-dimensional sparse matrix. It uses a bag of words approach, where the bag of words contains each word in the entire train set. This will be the columns of the matrix. Then, for each row corresponding to a tweet, if the word is within the tweet it will have the entry 1, else 0.

In [4]:
# create machine learning pipeline
nb_pipe = make_pipeline(
    CountVectorizer(),
    MultinomialNB())

## Baseline Model
The next step is to create a baseline model. This is just doing a cross-validation on the raw train set using the pipeline created before, without any other data preparation steps. It serves to verify how well the data preparation steps improve the model performance, if at all. 

One important thing to note here is that there is a large discrepancy between the scores I achieved in cross-validation and the scores achieved on the public leaderboard. In this case, the baseline model scores around 0.795 on the public leaderboard.

In [5]:
# create train set, test set and target
X_train = train.text
X_test = test.text
y_train = train.target

In [6]:
# cross validate
print('F1 score: {:.3f}'.format(np.mean(cross_val_score(nb_pipe, X_train, y_train, scoring = 'f1'))))

# fit pipeline
nb_pipe.fit(X_train, y_train)

# predict on test set
pred = nb_pipe.predict(X_test)

# submit prediction
sample_submission.target = pred
sample_submission.to_csv('naive_bayes_baseline.csv', index = False)

F1 score: 0.672


## Lemmatization
Lemmatization is a text proprocessing technique that gets the lemma for each word, which is basically like a root of the word. The advantage of this technique is that different variations of the same word will have the same lemma and will therefore be considered the same in the bag of words. This should improve the learning and generalization ability of the model.

Lemmatization doesn't require tokenization before. The reason that I have a seperate function for tokenization is that I use it to remove stop words, which are words that appear so commonly that they don't carry any meaning or predictive power in the model.

In [7]:
def tokenize(string, stop_words):
    """
    Tokenize a document passed as a string, remove stop words and 
    return all tokens as a single document in the same order.
    """
    
    # Create a document object
    doc = nlp(string)

    # Generate tokens
    tokens_with_stopwords = [token.text for token in doc]
    
    # remove stop words
    tokens = [token for token in tokens_with_stopwords if token not in stop_words]

    # Convert tokens into a string and return it
    return ' '.join(tokens)

def lemmatize(string):
    """
    Lemmatize a document passed as a string and return all lemmas as a document in the same order.
    """
    # Create a document object
    doc = nlp(string)

    # Generate tokens
    lemmas = [token.lemma_ for token in doc]

    # Convert tokens into a string and return it
    return ' '.join(lemmas)

# tokenize the train and test set
X_train = X_train.apply(tokenize, stop_words = STOP_WORDS)
X_test = X_test.apply(tokenize, stop_words = STOP_WORDS)

# lemmatize the train and test set
X_train = X_train.apply(lemmatize)
X_test = X_test.apply(lemmatize)

# create target
y_train = train.target.copy()

In [8]:
# cross validate
print('F1 score: {:.3f}'.format(np.mean(cross_val_score(nb_pipe, X_train, y_train, scoring = 'f1'))))

# fit pipeline
nb_pipe.fit(X_train, y_train)

# predict on test set
pred = nb_pipe.predict(X_test)

# submit prediction
sample_submission.target = pred
sample_submission.to_csv('naive_bayes_spacy_pipeline.csv', index = False)

F1 score: 0.665


## Customizing the NLP Pipeline
After applying tokenization, we can see that there are still many elements in the texts which don't generalize well. This includes for example hyperlinks, mentions or numbers. I create a custom proprocessing function using regular expression to replace these by placeholder words which will be the same across all tweets. For example, instead of a hyperlink, the tweets will now contain the word HYPERLINK. 

The logic behind this is that it might not matter where the link goes to and that there won't be any generalization because it's unlikely that two tweets will have the same hyperlinks. Instead, it might be just interesting to see that a tweet has a hyperlink. 

In [9]:
def tokenize(string, stop_words):
    """
    Tokenize a document passed as a string, remove stop words and 
    return all tokens as a single document in the same order.
    """
    
    # Create a document object
    doc = nlp(string)

    # Generate tokens
    tokens_with_stopwords = [token.text for token in doc]
    
    # remove stop words
    tokens = [token for token in tokens_with_stopwords if token not in stop_words]

    # Convert tokens into a string and return it
    return ' '.join(tokens)

def preprocess(series):
    """
    Function to clean the tweets by replacing words or characters with little meaning.
    
    Replaces all hyperlinks, numbers, mentions and hashtags with a single identifier 
    (e.g. 'https://google.com' becomes 'HYPERLINK')
    
    Replaces special characters such as exclamation marks, question marks, quotation marks and brackets.
    
    Replaces double or more white spaces with a single white space.
    """
    # replace all hyperlinks
    series = series.map(lambda string: re.sub(r'http.*', 'HYPERLINK', string))

    # replace all numbers
    series = series.map(lambda string: re.sub(r'[0-9,.:]+', 'NUMBER', string))

    # replace all mentions
    series = series.map(lambda string: re.sub(r'@\w+', 'MENTION', string))

    # replace all hashtags
    series = series.map(lambda string: re.sub(r'#', 'HASHTAG', string))

    # replace all symbols
    series = series.map(lambda string: re.sub(r"[\!\?\'\"\{\[\(\)\]\}]", '', string))

    # replace all double space or more with a single space
    series = series.map(lambda string: re.sub(r'[ ][ ]+', ' ', string))
    
    # return series
    return series

# tokenize the text
X_train = train.text.apply(tokenize, stop_words = STOP_WORDS)
X_test = test.text.apply(tokenize, stop_words = STOP_WORDS)

print('Tokenized tweets: --------------------\n')
print(X_train)

# preprocess the train and test set
X_train = preprocess(X_train)
X_test = preprocess(X_test)

print('\nPreprocessed tweets: --------------------\n')
print(X_train)

# lemmatize the train and test set
X_train = X_train.apply(lemmatize)
X_test = X_test.apply(lemmatize)

print('\nLemmatized preprocessed tweets: --------------------\n')
print(X_train)

# create target
y_train = train.target.copy()

Tokenized tweets: --------------------

0                                                                               Our Deeds Reason # earthquake May ALLAH Forgive
1                                                                                       Forest fire near La Ronge Sask . Canada
2                         All residents asked ' shelter place ' notified officers . No evacuation shelter place orders expected
3                                                                13,000 people receive # wildfires evacuation orders California
4                                                              Just got sent photo Ruby # Alaska smoke # wildfires pours school
                                                                 ...                                                           
7604    # WorldNews Fallen powerlines G : link tram : UPDATE : FIRE crews evacuated 30 passengers tr ... http://t.co/EYSVvzA7Qm
7605                                                            

In [10]:
# cross validate
print('F1 score: {:.3f}'.format(np.mean(cross_val_score(nb_pipe, X_train, y_train, scoring = 'f1'))))

# fit pipeline
nb_pipe.fit(X_train, y_train)

# predict on test set
pred = nb_pipe.predict(X_test)

# submit prediction
sample_submission.target = pred
sample_submission.to_csv('naive_bayes_custom_pipeline.csv', index = False)

F1 score: 0.662


## Hyperparameter Tuning
The last step is hyperparameter tuning to get the most out of the model with the existing data preparation steps. I tune the following hyperparameters:

CountVectorizer: I see if replacing the `CountVectorizer` by the `TfidfVectorizer` improves the performance. The `TfidfVectorizer` works similar to the `CountVectorizer`, with the only difference that it ways words based on how frequent they appear in the dataset. The more frequent they appear, the less informative they are considered to be.

N-grams: N-grams consider combination of words that follow each other. This provides more context but also creates a much larger bag of words, reducing the generalization power of the model. 

Minimum document frequency: Words that appear only once in the document also don't have as much generalization power, as we would need a word to appear at least twice to learn something meaningful.

Naive Bayes alpha: The alpha is a smoothing parameter for the probabilities. Understanding how this works requires more in-depth knowledge about the math behind the Naive Bayes algorithm. For this purpose it's enough to know that the optimal range will usually be between 0.7 and 1.0 but the effect on model performance is usually low. 

In [11]:
# create a parameter grid
param_distributions = {
    'countvectorizer' : [CountVectorizer(), TfidfVectorizer(max_df = 0.8)],
    'countvectorizer__ngram_range' : [(1,1), (1,2), (1,3)],
    'countvectorizer__min_df' : [1, 2, 3],
    'multinomialnb__alpha' : uniform(loc = 0.7, scale = 0.3)
}

# create a RandomizedSearchCV object
nb_random_search = RandomizedSearchCV(
    estimator = nb_pipe,
    param_distributions = param_distributions,
    n_iter = 200,
    scoring = 'f1',
    n_jobs = -1,
    refit = True,
    verbose = 1,
    random_state = 164,
    return_train_score = True
)

# fit RandomizedSearchCV object
nb_random_search.fit(X_train, y_train)

# print grid search results
cols = ['param_countvectorizer', 
        'param_countvectorizer__min_df', 
        'param_countvectorizer__ngram_range', 
        'param_multinomialnb__alpha', 
        'mean_test_score', 
        'mean_train_score']

pd.options.display.max_colwidth = 50

nb_random_search_results = pd.DataFrame(nb_random_search.cv_results_).sort_values(by = 'mean_test_score', 
                                                                                  ascending = False)
nb_random_search_results[cols].head(10)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  4.5min finished


Unnamed: 0,param_countvectorizer,param_countvectorizer__min_df,param_countvectorizer__ngram_range,param_multinomialnb__alpha,mean_test_score,mean_train_score
88,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.814801,0.662417,0.879288
66,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.895266,0.662371,0.876992
115,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.893602,0.662371,0.877074
113,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.955471,0.662164,0.875355
19,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.947308,0.662128,0.875633
176,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.712271,0.662099,0.883438
106,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.890044,0.662024,0.877167
82,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.86904,0.661703,0.877818
142,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.842561,0.661677,0.878586
65,"CountVectorizer(analyzer='word', binary=False,...",1,"(1, 1)",0.728939,0.661635,0.882713


In [12]:
# predict on test set with the best model from the randomized search
pred = nb_random_search.predict(X_test)

# submit prediction
sample_submission.target = pred
sample_submission.to_csv('naive_bayes_tuned.csv', index = False)

# Word2Vec & Logistic Regression
## Load and Prepare Data
Here, I do not remove them because after removing them model performance gets worse. This behavior is unintuitive, especially since there are tweets with the same text but different labels.

After that, I create a word embedding for each document using Word2Vec. Word2Vec creates a dense representation for each word, such that words appearing in similar contexts have similar vectors. To get an embedding for the entire tweet, the mean of all vectors for the words in the tweet are taken. The assumption now is that similar tweets have similar vectors.

In [13]:
# load data
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample_submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# print shape of datasets
print('Train shape: {}'.format(train.shape))
print('Test shape: {}'.format(test.shape))
print('Sample submission shape: {}'.format(sample_submission.shape))

# inspect train set
train.head()

Train shape: (7613, 5)
Test shape: (3263, 4)
Sample submission shape: (3263, 2)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
# Load the en_core_web_lg model
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser", "ner"])

# create train set by getting the document vector
docs_train = [nlp(doc).vector for doc in train.text]
X_train = np.vstack(docs_train)
print('Shape of train set: {}'.format(X_train.shape))

# create test set likewise
docs_test = [nlp(doc).vector for doc in test.text]
X_test = np.vstack(docs_test)
print('Shape of test set: {}'.format(X_test.shape))

# create target
y_train = train.target.copy()

Shape of train set: (7613, 300)
Shape of test set: (3263, 300)


## Create Machine Learning Pipeline
In this case, it doesn't add much value to use a pipeline since the only step in the pipeline is an estimator (here a logistic regression). However, since it's useful for pipelines with data preprocessing steps that are learned on data, such standard scaling, I even do it when it's not required.

However, one advantage even when just using an estimator is that I can treat the estimator like a hyperparameter in the grid search.

In [15]:
# create machine learning pipeline
word2vec_pipe = Pipeline([('estimator', LogisticRegression())])

# cross validate
print('F1 score: {:.3f}'.format(np.mean(cross_val_score(word2vec_pipe, X_train, y_train, scoring = 'f1'))))

# fit pipeline
word2vec_pipe.fit(X_train, y_train)

# predict on test set
pred = word2vec_pipe.predict(X_test)

# submit prediction
sample_submission.target = pred
sample_submission.to_csv('word2vec_baseline.csv', index = False)

F1 score: 0.729


## Hyperparameter Tuning
After creating the baseline, I now want to test if a more complex model works better than the logistic regression. I chose a kernel SVM in this case, as SVM models are one of the classical machine learning models commonly used for text classification.

I tune the regularization parameter C for both the logistic regression and SVM and the gamma parameter for the SVM. The hyperparameters influence the model complexity, with more complex models having a higher chance of overfitting. In case of the SVM, a more complex model can even find decision boundaries which are considered non-linear in the original feature space.

In [16]:
# create a parameter grid
param_grid = [{'estimator' : [LogisticRegression()], 
               'estimator__C' : np.logspace(-3, 3, 7)},
              {'estimator' : [SVC()], 
               'estimator__C' : np.logspace(-1, 1, 3), 
               'estimator__gamma' : np.logspace(-2, 2, 5) / X_train.shape[0]}]

# create a RandomizedSearchCV object
word2vec_grid_search = GridSearchCV(
    estimator = word2vec_pipe,
    param_grid = param_grid,
    scoring = 'f1',
    n_jobs = -1,
    refit = True,
    verbose = 1,
    return_train_score = True
)

# fit RandomizedSearchCV object
word2vec_grid_search.fit(X_train, y_train)

# print grid search results
cols = ['param_estimator',
        'param_estimator__C',
        'param_estimator__gamma',
        'mean_test_score',
        'mean_train_score']

pd.options.display.max_colwidth = 50

word2vec_grid_search_results = pd.DataFrame(word2vec_grid_search.cv_results_).sort_values(by = 'mean_test_score', 
                                                                                          ascending = False)
word2vec_grid_search_results[cols].head(10)

Fitting 5 folds for each of 22 candidates, totalling 110 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 110 out of 110 | elapsed: 13.0min finished


Unnamed: 0,param_estimator,param_estimator__C,param_estimator__gamma,mean_test_score,mean_train_score
21,"SVC(C=10.0, break_ties=False, cache_size=200, ...",10.0,0.0131354,0.734215,0.793362
2,"LogisticRegression(C=1.0, class_weight=None, d...",0.1,,0.731396,0.759984
3,"LogisticRegression(C=1.0, class_weight=None, d...",1.0,,0.728846,0.787516
16,"SVC(C=10.0, break_ties=False, cache_size=200, ...",1.0,0.0131354,0.724366,0.753945
4,"LogisticRegression(C=1.0, class_weight=None, d...",10.0,,0.719089,0.793811
20,"SVC(C=10.0, break_ties=False, cache_size=200, ...",10.0,0.00131354,0.717796,0.747382
5,"LogisticRegression(C=1.0, class_weight=None, d...",100.0,,0.716332,0.794272
6,"LogisticRegression(C=1.0, class_weight=None, d...",1000.0,,0.714957,0.793758
1,"LogisticRegression(C=1.0, class_weight=None, d...",0.01,,0.705592,0.713926
19,"SVC(C=10.0, break_ties=False, cache_size=200, ...",10.0,0.000131354,0.670485,0.680679


In [17]:
# predict on test set with the best model from the randomized search
pred = word2vec_grid_search.predict(X_test)

# submit prediction
sample_submission.target = pred
sample_submission.to_csv('word2vec_tuned.csv', index = False)