In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train = pd.read_csv('nlp-getting-started/train.csv')
test = pd.read_csv('nlp-getting-started/test.csv')

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
print('number of training data points = ', len(train))
print('number of test data points = ', len(test))

number of training data points =  7613
number of test data points =  3263


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
id          3263 non-null int64
keyword     3237 non-null object
location    2158 non-null object
text        3263 non-null object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [8]:
percent_real = train[train['target'] == 1]['target'].sum()/len(train)
percent_fake = 1 - percent_real
print('percent of tweets about real disasters = ', percent_real)
print('percent of tweets not about real disasters = ', percent_fake)

percent of tweets about real disasters =  0.4296597924602653
percent of tweets not about real disasters =  0.5703402075397347


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_dev, y_train, y_dev = train_test_split(train.drop(['target'], axis=1), train['target'], test_size=0.30, 
                                                  random_state=101)

In [11]:
# Convert text to lowercase
X_train['text'] = X_train['text'].apply(lambda x: x.lower())
X_dev['text'] = X_dev['text'].apply(lambda x: x.lower())

In [12]:
# Remove all URLs
import re
X_train['text'] = X_train['text'].apply(lambda x: re.sub(r'http\S+', '', x))
X_dev['text'] = X_dev['text'].apply(lambda x: re.sub(r'http\S+', '', x))

In [13]:
# Remove all punctuation (including @)
import string
X_train['text'] = X_train['text'].str.replace('[{}]'.format(string.punctuation), '')
X_dev['text'] = X_dev['text'].str.replace('[{}]'.format(string.punctuation), '')

In [14]:
X_train['text'] = X_train['text'].str.split()
X_dev['text'] = X_dev['text'].str.split()

In [15]:
# Remove all stopwords using the NLTK stopwords package
from nltk.corpus import stopwords
stop = stopwords.words('english')
X_train['text'] = X_train['text'].apply(lambda x: [item for item in x if item not in stop])
X_dev['text'] = X_dev['text'].apply(lambda x: [item for item in x if item not in stop])

In [16]:
#print(stop)

In [17]:
# OLD: Lemmatize/Stem text using the Snowball Stemmer

# from nltk.stem.snowball import SnowballStemmer
# stemmer = SnowballStemmer("english")
# X_train['text_stemmed'] = X_train['text'].apply(lambda x: [stemmer.stem(y) for y in x])
# X_dev['text_stemmed'] = X_dev['text'].apply(lambda x: [stemmer.stem(y) for y in x])

In [18]:
# Lemmatize text using the WordNetLemmatizer
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
X_train['text_stemmed'] = X_train['text'].apply(lambda x: [wnl.lemmatize(y) for y in x])
X_dev['text_stemmed'] = X_dev['text'].apply(lambda x: [wnl.lemmatize(y) for y in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eeshakhanna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
X_train['text_stemmed'] = X_train['text'].apply(lambda x: [wnl.lemmatize(y, pos='v') for y in x])
X_dev['text_stemmed'] = X_dev['text'].apply(lambda x: [wnl.lemmatize(y, pos='v') for y in x])

In [20]:
X_train['text_stemmed'] = X_train['text'].apply(lambda x: [wnl.lemmatize(y, pos='a') for y in x])
X_dev['text_stemmed'] = X_dev['text'].apply(lambda x: [wnl.lemmatize(y, pos='a') for y in x])

In [21]:
#X_train['text_stemmed']

In [22]:
X_train['text_stemmed_str'] = X_train['text_stemmed'].apply(lambda x: " ".join(x))
X_dev['text_stemmed_str'] = X_dev['text_stemmed'].apply(lambda x: " ".join(x))

In [23]:
X_train.head()

Unnamed: 0,id,keyword,location,text,text_stemmed,text_stemmed_str
2707,3889,detonation,New York,"[detonation, fashionable, mountaineering, elec...","[detonation, fashionable, mountaineering, elec...",detonation fashionable mountaineering electron...
6479,9266,sunk,"Cardiff, Wales","[benaffleck, respected, liked, talent, guess, ...","[benaffleck, respected, liked, talent, guess, ...",benaffleck respected liked talent guess stil h...
4499,6396,hurricane,Berlin - Germany,"[lavapixcom, see, hurricane, guillermo, meteoe...","[lavapixcom, see, hurricane, guillermo, meteoe...",lavapixcom see hurricane guillermo meteoearth
7181,10290,weapon,,"[fur, leather, coats, sprite, amp, weapon, cho...","[fur, leather, coats, sprite, amp, weapon, cho...",fur leather coats sprite amp weapon choice lif...
4250,6038,heat%20wave,USA,"[heat, advisory, effect, 1, pm, 7, pm, thursda...","[heat, advisory, effect, 1, pm, 7, pm, thursda...",heat advisory effect 1 pm 7 pm thursday buildi...


# Bag of Words Model

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
min_df_bow = 11
count_vect = CountVectorizer(binary=True, min_df=min_df_bow, analyzer='word')
X_train_vocab = count_vect.fit_transform(X_train['text_stemmed_str']).toarray()
X_dev_vocab = count_vect.transform(X_dev['text_stemmed_str']).toarray()
print(X_train_vocab.shape)
print(X_dev_vocab.shape)

(5329, 948)
(2284, 948)


In [25]:
print('Index for the word "smoke": ', count_vect.vocabulary_.get(u'smoke'))

Index for the word "smoke":  752


# Logistic Regression

### No Regularization

In [26]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(penalty='none')
logmodel.fit(X_train_vocab,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(penalty='none')

In [27]:
predictions = logmodel.predict(X_dev_vocab)

In [28]:
# Error metrics - F1 Score
# Double check with sklearn metric and to print false positives
def err_metric(CM):      
    TN = CM.iloc[0,0]
    FN = CM.iloc[1,0]
    TP = CM.iloc[1,1]
    FP = CM.iloc[0,1]
    precision =(TP)/(TP+FP)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    recall = (TP)/(TP+FN)
    f1_score = 2*(( precision * recall)/( precision + recall))
    print("f1 score of the model: ",f1_score)
    print("false positives ", FP)
    print("false negatives ", FN)

confusion_matrix = pd.crosstab(predictions,y_dev)
#confusion_matrix_nb = pd.crosstab(predictions_nb, y_dev)
err_metric(confusion_matrix)
#err_metric(confusion_matrix_nb)

f1 score of the model:  0.7123727905731119
false positives  289
false negatives  248


In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_dev,predictions))
from sklearn.metrics import f1_score
print(f1_score(y_dev, predictions))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1330
           1       0.73      0.70      0.71       954

    accuracy                           0.76      2284
   macro avg       0.76      0.76      0.76      2284
weighted avg       0.76      0.76      0.76      2284

0.7123727905731119


In [30]:
predictions_train = logmodel.predict(X_train_vocab)
print(classification_report(y_train,predictions_train))
print(f1_score(y_train, predictions_train))

              precision    recall  f1-score   support

           0       0.86      0.91      0.89      3012
           1       0.88      0.81      0.84      2317

    accuracy                           0.87      5329
   macro avg       0.87      0.86      0.86      5329
weighted avg       0.87      0.87      0.87      5329

0.8417040358744395


### L1 Regularization

In [31]:
logmodel_l1 = LogisticRegression(penalty='l1', solver='liblinear')
logmodel_l1.fit(X_train_vocab,y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [32]:
predictions_l1 = logmodel_l1.predict(X_dev_vocab)

In [33]:
print(classification_report(y_dev,predictions_l1))
print(f1_score(y_dev, predictions_l1))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83      1330
           1       0.79      0.68      0.73       954

    accuracy                           0.79      2284
   macro avg       0.79      0.77      0.78      2284
weighted avg       0.79      0.79      0.79      2284

0.727580372250423


In [34]:
predictions_train_l1 = logmodel_l1.predict(X_train_vocab)
print(classification_report(y_train,predictions_train_l1))
print(f1_score(y_train, predictions_train_l1))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      3012
           1       0.88      0.75      0.81      2317

    accuracy                           0.85      5329
   macro avg       0.86      0.84      0.84      5329
weighted avg       0.85      0.85      0.85      5329

0.8122372723026623


### L2 Regularization

In [35]:
logmodel_l2 = LogisticRegression(penalty='l2')
logmodel_l2.fit(X_train_vocab,y_train)

LogisticRegression()

In [36]:
predictions_l2 = logmodel_l2.predict(X_dev_vocab)

In [37]:
print(classification_report(y_dev,predictions_l2))
print(f1_score(y_dev, predictions_l2))

              precision    recall  f1-score   support

           0       0.79      0.86      0.83      1330
           1       0.78      0.69      0.73       954

    accuracy                           0.79      2284
   macro avg       0.79      0.77      0.78      2284
weighted avg       0.79      0.79      0.79      2284

0.7295036252091466


In [38]:
predictions_train_l2 = logmodel_l2.predict(X_train_vocab)
print(classification_report(y_train,predictions_train_l2))
print(f1_score(y_train, predictions_train_l2))

              precision    recall  f1-score   support

           0       0.84      0.92      0.88      3012
           1       0.88      0.77      0.82      2317

    accuracy                           0.85      5329
   macro avg       0.86      0.84      0.85      5329
weighted avg       0.86      0.85      0.85      5329

0.8198614318706697


### Print Coefficients

In [39]:
coef = np.transpose(logmodel_l1.coef_).flatten()

In [40]:
vocab = count_vect.vocabulary_
df = pd.DataFrame.from_dict(data=vocab, orient='index')
df.reset_index(inplace=True)
df.rename(columns={0: "count", "index": "word"}, inplace=True)

In [41]:
df['coef'] = coef

In [42]:
type(df.shape[0])

int

In [43]:
df['coef'].max()

3.8971429665875177

In [44]:
df['coef'].min()

-2.3859198026904607

In [45]:
# Get top 5% words with highest coefficients 
num_coef = int(5*(df.shape[0])/100)
imp_words = df.nlargest(num_coef, 'coef')['word']

In [46]:
imp_words

402        picking
224         caught
859         twelve
475         taking
784          lives
525        history
33            must
263       collapse
765    obliterated
528       pakistan
213            new
194         hijack
420           kids
372        failure
193         entire
288           past
599      explosion
442           east
264          words
877          green
909         middle
132            hes
453       upheaval
112           lets
704        crushed
836        tonight
254             st
238            via
516          thats
168            hey
553          obama
145          radio
699        drought
410           riot
837         chance
315     evacuation
548          place
136        weather
212        reddits
777        outside
200     responders
335        islamic
803      wednesday
551           hell
351          whole
746           ball
779         madhya
Name: word, dtype: object

# Bernoulli Naive Bayes

In [47]:
# BNB implementation 
n = X_train_vocab.shape[0] # size of the dataset
d = X_train_vocab.shape[1] # number of features in our dataset
K = 2 # number of clases
alpha = 1

# these are the shapes of the parameters
psis = np.zeros([K,d])
phis = np.zeros([K])

# we now compute the parameters
for k in range(K):
    X_k = X_train_vocab[y_train == k]
    #psis[k] = np.mean(X_k, axis=0)
    psis[k] = (np.sum(X_k, axis=0) + alpha) / (X_k.shape[0] + (2*alpha)) #laplace smoothing
    #psis[k] = ((np.mean(X_k, axis=0) * X_k.shape[0]) + alpha)/(X_k.shape[0] + (2*alpha)) 
    phis[k] = X_k.shape[0] / float(n)

# print out the class proportions
print(phis)
print(psis.shape)

[0.56520923 0.43479077]
(2, 948)


In [48]:
def nb_predictions(x, psis, phis):
    """This returns class assignments and scores under the NB model.
    We compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y)."""
    
    # adjust shapes
    n, d = x.shape
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    
    # clip probabilities to avoid log(0)
    psis = psis.clip(1e-14, 1-1e-14)
    
    # compute log-probabilities
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)
    logpyx = logpxy.sum(axis=2) + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

In [49]:
predictions_nb, logpyx = nb_predictions(X_dev_vocab, psis, phis)

In [50]:
predictions_nb_train, logpyx_train = nb_predictions(X_train_vocab, psis, phis)

In [51]:
print(classification_report(y_dev, predictions_nb))
print(f1_score(y_dev, predictions_nb))

              precision    recall  f1-score   support

           0       0.78      0.88      0.83      1330
           1       0.80      0.66      0.72       954

    accuracy                           0.79      2284
   macro avg       0.79      0.77      0.78      2284
weighted avg       0.79      0.79      0.78      2284

0.7223502304147464


In [52]:
print(classification_report(y_train, predictions_nb_train))
print(f1_score(y_train, predictions_nb_train))

              precision    recall  f1-score   support

           0       0.79      0.92      0.85      3012
           1       0.87      0.69      0.77      2317

    accuracy                           0.82      5329
   macro avg       0.83      0.80      0.81      5329
weighted avg       0.83      0.82      0.82      5329

0.7679730704496273


# N-gram Model

In [53]:
from sklearn.feature_extraction.text import CountVectorizer
min_df_ngram = 12
count_vect_ngram = CountVectorizer(binary=True, min_df=min_df_ngram, ngram_range=(1,2))

X_train_vocab_ngram = count_vect_ngram.fit_transform(X_train['text_stemmed_str']).toarray()
X_dev_vocab_ngram = count_vect_ngram.transform(X_dev['text_stemmed_str']).toarray()
print(X_train_vocab_ngram.shape)
print(X_dev_vocab_ngram.shape)

(5329, 1006)
(2284, 1006)


In [58]:
# Print number of 1-grams and 2-grams
no_one_grams = 0
no_two_grams = 0
vocab = count_vect_ngram.vocabulary_
gram = vocab.keys()
value_iterator = iter(gram)
for i in range(len(vocab)):
    val = next(value_iterator)
    if ' ' in val:
        no_two_grams = no_two_grams + 1
    else:
        no_one_grams = no_one_grams + 1
print('number of 1-grams = ', no_one_grams)
print('number of 2-grams = ', no_two_grams)

number of 1-grams =  870
number of 2-grams =  136


In [60]:
# Print 10 2-grams
two_grams = []
count = 0
gram2 = count_vect_ngram.vocabulary_.keys()
value_iterator_2 = iter(gram2)
while count < 10:
    val = next(value_iterator_2)
    if ' ' in val:
        two_grams.append(val)
        count = count + 1
print(two_grams)

['heat wave', 'refugio oil', 'oil spill', 'spill may', 'may costly', 'costly big', 'big projected', 'burning buildings', 'severe thunderstorm', '70 years']


### Logistic Regression L1 (N-gram)

In [61]:
logmodel_ngram = LogisticRegression(penalty='l1', solver='liblinear')
logmodel_ngram.fit(X_train_vocab_ngram, y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [62]:
predictions_ngram = logmodel_ngram.predict(X_dev_vocab_ngram)

In [63]:
print(classification_report(y_dev, predictions_ngram))
print(f1_score(y_dev, predictions_ngram))

              precision    recall  f1-score   support

           0       0.79      0.87      0.82      1330
           1       0.78      0.67      0.72       954

    accuracy                           0.79      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.79      0.79      0.78      2284

0.7240990990990991


In [64]:
predictions_ngram_train = logmodel_ngram.predict(X_train_vocab_ngram)

In [65]:
print(classification_report(y_train, predictions_ngram_train))
print(f1_score(y_train, predictions_ngram_train))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      3012
           1       0.88      0.75      0.81      2317

    accuracy                           0.84      5329
   macro avg       0.85      0.83      0.84      5329
weighted avg       0.85      0.84      0.84      5329

0.8069110436609853


### Naive Bayes (N-gram)

In [66]:
# BNB Implementation 
n_ngram = X_train_vocab_ngram.shape[0] # size of the dataset
d_ngram = X_train_vocab_ngram.shape[1] # number of features in our dataset
K_ngram = 2 # number of clases
alpha_ngram = 1

# these are the shapes of the parameters
psis_ngram = np.zeros([K_ngram, d_ngram])
phis_ngram = np.zeros([K_ngram])

# we now compute the parameters
for k in range(K_ngram):
    X_k = X_train_vocab_ngram[y_train == k]
    #psis_ngram[k] = np.mean(X_k, axis=0)
    psis_ngram[k] = (np.sum(X_k, axis=0) + alpha_ngram) / (X_k.shape[0] + (2*alpha_ngram)) #laplace smoothing
    #psis_ngram[k] = ((np.mean(X_k, axis=0) * X_k.shape[0]) + alpha_ngram) / (X_k.shape[0] + (2*alpha_ngram))
    phis_ngram[k] = X_k.shape[0] / float(n_ngram)

# print out the class proportions
print(phis_ngram)
print(psis_ngram.shape)

[0.56520923 0.43479077]
(2, 1006)


In [67]:
predictions_nb_ngram, logpyx_ngram = nb_predictions(X_dev_vocab_ngram, psis_ngram, phis_ngram)

In [68]:
predictions_nb_train_ngram, logpyx_train_ngram = nb_predictions(X_train_vocab_ngram, psis_ngram, phis_ngram)

In [69]:
print(classification_report(y_dev, predictions_nb_ngram))
print(f1_score(y_dev, predictions_nb_ngram))

              precision    recall  f1-score   support

           0       0.76      0.91      0.83      1330
           1       0.82      0.61      0.70       954

    accuracy                           0.78      2284
   macro avg       0.79      0.76      0.76      2284
weighted avg       0.79      0.78      0.77      2284

0.6995785671282361


In [70]:
print(classification_report(y_train, predictions_nb_train_ngram))
print(f1_score(y_train, predictions_nb_train_ngram))

              precision    recall  f1-score   support

           0       0.77      0.94      0.85      3012
           1       0.89      0.64      0.74      2317

    accuracy                           0.81      5329
   macro avg       0.83      0.79      0.80      5329
weighted avg       0.82      0.81      0.80      5329

0.7428859229413247


In [71]:
# Print results for quick comparison and plotting

print(min_df_bow)

print(f1_score(y_dev, predictions))
print(f1_score(y_train, predictions_train))

print(f1_score(y_dev, predictions_l1))
print(f1_score(y_train, predictions_train_l1))

print(f1_score(y_dev, predictions_l2))
print(f1_score(y_train, predictions_train_l2))

print(f1_score(y_dev, predictions_nb))
print(f1_score(y_train, predictions_nb_train))


11
0.7123727905731119
0.8417040358744395
0.727580372250423
0.8122372723026623
0.7295036252091466
0.8198614318706697
0.7223502304147464
0.7679730704496273


In [72]:
# Print results for quick comparison and plotting

print(min_df_ngram)

print(f1_score(y_dev, predictions_ngram))
print(f1_score(y_train, predictions_ngram_train))

print(f1_score(y_dev, predictions_nb_ngram))
print(f1_score(y_train, predictions_nb_train_ngram))

12
0.7240990990990991
0.8069110436609853
0.6995785671282361
0.7428859229413247


# Final for Kaggle

### Preprocessing

In [73]:
X_train_final = train.drop(['target'], axis=1)
y_train_final = train['target']
X_test_final = test

In [74]:
# Re-do all pre-processing steps on final train set
X_train_final['text'] = X_train_final['text'].apply(lambda x: x.lower())
X_train_final['text'] = X_train_final['text'].apply(lambda x: re.sub(r'http\S+', '', x))
X_train_final['text'] = X_train_final['text'].str.replace('[{}]'.format(string.punctuation), '')
X_train_final['text'] = X_train_final['text'].str.split()
X_train_final['text'] = X_train_final['text'].apply(lambda x: [item for item in x if item not in stop])
X_train_final['text_stemmed'] = X_train_final['text'].apply(lambda x: [wnl.lemmatize(y) for y in x])
X_train_final['text_stemmed'] = X_train_final['text'].apply(lambda x: [wnl.lemmatize(y, pos='v') for y in x])
X_train_final['text_stemmed'] = X_train_final['text'].apply(lambda x: [wnl.lemmatize(y, pos='a') for y in x])
X_train_final['text_stemmed_str'] = X_train_final['text_stemmed'].apply(lambda x: " ".join(x))

In [75]:
# Re-do all pre-processing steps on final test set
X_test_final['text'] = X_test_final['text'].apply(lambda x: x.lower())
X_test_final['text'] = X_test_final['text'].apply(lambda x: re.sub(r'http\S+', '', x))
X_test_final['text'] = X_test_final['text'].str.replace('[{}]'.format(string.punctuation), '')
X_test_final['text'] = X_test_final['text'].str.split()
X_test_final['text'] = X_test_final['text'].apply(lambda x: [item for item in x if item not in stop])
X_test_final['text_stemmed'] = X_test_final['text'].apply(lambda x: [wnl.lemmatize(y) for y in x])
X_test_final['text_stemmed'] = X_test_final['text'].apply(lambda x: [wnl.lemmatize(y, pos='v') for y in x])
X_test_final['text_stemmed'] = X_test_final['text'].apply(lambda x: [wnl.lemmatize(y, pos='a') for y in x])
X_test_final['text_stemmed_str'] = X_test_final['text_stemmed'].apply(lambda x: " ".join(x))

### Bag of Words

In [76]:
min_df_bow_final = 11
count_vect_final = CountVectorizer(binary=True, min_df=min_df_bow_final, analyzer='word')
X_train_final_vocab = count_vect_final.fit_transform(X_train_final['text_stemmed_str']).toarray()
X_test_final_vocab = count_vect_final.transform(X_test_final['text_stemmed_str']).toarray()
print(X_train_final_vocab.shape)
print(X_test_final_vocab.shape)

(7613, 1294)
(3263, 1294)


### Logistic Regression (L1 Reg)

In [77]:
logmodel_final = LogisticRegression(penalty='l1', solver='liblinear')
logmodel_final.fit(X_train_final_vocab, y_train_final)

LogisticRegression(penalty='l1', solver='liblinear')

In [78]:
predictions_final = logmodel_final.predict(X_test_final_vocab)

In [79]:
predictions_final_train = logmodel_final.predict(X_train_final_vocab)

In [80]:
print(classification_report(y_train_final, predictions_final_train))
print(f1_score(y_train_final, predictions_final_train))

              precision    recall  f1-score   support

           0       0.84      0.92      0.87      4342
           1       0.87      0.76      0.81      3271

    accuracy                           0.85      7613
   macro avg       0.85      0.84      0.84      7613
weighted avg       0.85      0.85      0.85      7613

0.8128674069235793


In [81]:
test_ids = test['id']

In [82]:
kaggle_submit_lr = pd.concat([test_ids, pd.Series(predictions_final)], axis=1)
kaggle_submit_lr = kaggle_submit_lr.rename(columns={0: "target"})
kaggle_submit_lr

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [83]:
kaggle_submit_lr.to_csv('nlp_disaster_logreg_l1.csv', index=False)

### Naive Bayes

In [84]:
n_final = X_train_final_vocab.shape[0] # size of the dataset
d_final = X_train_final_vocab.shape[1] # number of features in our dataset
K_final = 2 # number of clases
alpha_final = 1

# these are the shapes of the parameters
psis_final = np.zeros([K_final, d_final])
phis_final = np.zeros([K_final])

# we now compute the parameters
for k in range(K_final):
    X_k = X_train_final_vocab[y_train_final == k]
    #psis_ngram[k] = np.mean(X_k, axis=0)
    psis_final[k] = (np.sum(X_k, axis=0) + alpha_final) / (X_k.shape[0] + (2*alpha_final)) #laplace smoothing
    #psis_ngram[k] = ((np.mean(X_k, axis=0) * X_k.shape[0]) + alpha_ngram) / (X_k.shape[0] + (2*alpha_ngram))
    phis_final[k] = X_k.shape[0] / float(n_final)

# print out the class proportions
print(phis_final)
print(psis_final.shape)

[0.57034021 0.42965979]
(2, 1294)


In [85]:
predictions_nb_final, logpyx_final = nb_predictions(X_test_final_vocab, psis_final, phis_final)

In [86]:
predictions_nb_train_final, logpyx_train_final = nb_predictions(X_train_final_vocab, psis_final, phis_final)

In [87]:
print(classification_report(y_train_final, predictions_nb_train_final))
print(f1_score(y_train_final, predictions_nb_train_final))

              precision    recall  f1-score   support

           0       0.80      0.92      0.85      4342
           1       0.87      0.69      0.77      3271

    accuracy                           0.82      7613
   macro avg       0.83      0.80      0.81      7613
weighted avg       0.83      0.82      0.82      7613

0.7663934426229508


In [88]:
kaggle_submit_nb = pd.concat([test_ids, pd.Series(predictions_nb_final)], axis=1)
kaggle_submit_nb = kaggle_submit_nb.rename(columns={0: "target"})
kaggle_submit_nb

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,0
3260,10868,1
3261,10874,1


In [89]:
kaggle_submit_nb.to_csv('nlp_disaster_naivebayes.csv', index=False)

### Logistic Regression (L1) using N-grams 

In [90]:
min_df_ngram_final = 12
count_vect_final_ngram = CountVectorizer(binary=True, min_df=min_df_ngram_final, ngram_range=(1,2))

In [91]:
X_train_vocab_ngram_final = count_vect_final_ngram.fit_transform(X_train_final['text_stemmed_str']).toarray()
X_test_vocab_ngram_final = count_vect_final_ngram.transform(X_test_final['text_stemmed_str']).toarray()
print(X_train_vocab_ngram_final.shape)
print(X_test_vocab_ngram_final.shape)

(7613, 1464)
(3263, 1464)


In [92]:
logmodel_ngram_final = LogisticRegression(penalty='l1', solver='liblinear')
logmodel_ngram_final.fit(X_train_vocab_ngram_final, y_train_final)

LogisticRegression(penalty='l1', solver='liblinear')

In [93]:
predictions_ngram_final = logmodel_ngram_final.predict(X_test_vocab_ngram_final)

In [94]:
predictions_ngram_train_final = logmodel_ngram_final.predict(X_train_vocab_ngram_final)

In [95]:
print(classification_report(y_train_final, predictions_ngram_train_final))
print(f1_score(y_train_final, predictions_ngram_train_final))

              precision    recall  f1-score   support

           0       0.84      0.92      0.87      4342
           1       0.87      0.76      0.81      3271

    accuracy                           0.85      7613
   macro avg       0.85      0.84      0.84      7613
weighted avg       0.85      0.85      0.85      7613

0.8130718954248367


In [96]:
kaggle_submit_ngram = pd.concat([test_ids, pd.Series(predictions_ngram_final)], axis=1)
kaggle_submit_ngram = kaggle_submit_ngram.rename(columns={0: "target"})
kaggle_submit_ngram

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [97]:
kaggle_submit_ngram.to_csv('nlp_disaster_logreg_l1_ngram.csv', index=False)