In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/NLP


# 1. Binary Classification on Text Data

a. Download and read data

In [3]:
train_df = pd.read_csv("train.csv", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [101]:
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [4]:
# Percentage of real disastes

train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [5]:
3271/(3271+4342)

0.4296597924602653

## b. Split the training *data*

In [6]:
from sklearn.model_selection import train_test_split

X = train_df.drop(columns=['target']).copy()
y = train_df['target']

X_train, X_dev, y_train, y_dev = train_test_split(X, y, train_size=0.7)

In [7]:
X_train

Unnamed: 0,id,keyword,location,text
4531,6443,injured,Florida,Experienced urogyn trying to help mesh injured...
7019,10060,typhoon,"Calgary, AB, Canada",Find out how your fund was used for Typhoon Ha...
7502,10731,wreck,Canada BC,@raineishida lol...Im just a nervous wreck :P
2402,3459,derailed,"Washington, DC",Happy no one was hurt when #wmata train derail...
413,599,arsonist,Atlanta,#NOWPLAYING Arsonist MC - So Impressed - @AR...
...,...,...,...,...
4136,5883,hailstorm,"Calgary, Alberta",Calgary Transit reviewing policy after leaving...
5392,7693,panic,,Panic attacks are the worst ????
1946,2797,curfew,IM LOST,Da Judge Gave Dis Girl 5pm Curfew ??????
115,165,aftershock,US,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/vA...


In [8]:
y_train

4531    1
7019    1
7502    0
2402    1
413     0
       ..
4136    1
5392    0
1946    0
115     0
6042    0
Name: target, Length: 5329, dtype: int64

## c. Preprocessing

1. Make lowercase

In [9]:
def make_lowercase(df, colname):
  df[colname] = df[colname].str.lower()

  return df

X_train = make_lowercase(X_train, 'text')
X_dev = make_lowercase(X_dev, 'text')
X_test = make_lowercase(test_df, 'text')
train_whole = make_lowercase(train_df, 'text')

X_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,keyword,location,text
4531,6443,injured,Florida,experienced urogyn trying to help mesh injured...
7019,10060,typhoon,"Calgary, AB, Canada",find out how your fund was used for typhoon ha...
7502,10731,wreck,Canada BC,@raineishida lol...im just a nervous wreck :p
2402,3459,derailed,"Washington, DC",happy no one was hurt when #wmata train derail...
413,599,arsonist,Atlanta,#nowplaying arsonist mc - so impressed - @ar...
...,...,...,...,...
4136,5883,hailstorm,"Calgary, Alberta",calgary transit reviewing policy after leaving...
5392,7693,panic,,panic attacks are the worst ????
1946,2797,curfew,IM LOST,da judge gave dis girl 5pm curfew ??????
115,165,aftershock,US,320 [ir] icemoon [aftershock] | http://t.co/va...


2. Remove punctuation

In [10]:
def remove_punc(df, column):
  df[column] = df[column].str.replace('[^\w\s]','')
  return df


X_train = remove_punc(X_train, 'text')
X_dev = remove_punc(X_dev, 'text')
X_test = remove_punc(X_test, 'text')
train_whole = remove_punc(train_whole, 'text')

X_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,keyword,location,text
4531,6443,injured,Florida,experienced urogyn trying to help mesh injured...
7019,10060,typhoon,"Calgary, AB, Canada",find out how your fund was used for typhoon ha...
7502,10731,wreck,Canada BC,raineishida lolim just a nervous wreck p
2402,3459,derailed,"Washington, DC",happy no one was hurt when wmata train deraile...
413,599,arsonist,Atlanta,nowplaying arsonist mc so impressed arsoni...
...,...,...,...,...
4136,5883,hailstorm,"Calgary, Alberta",calgary transit reviewing policy after leaving...
5392,7693,panic,,panic attacks are the worst
1946,2797,curfew,IM LOST,da judge gave dis girl 5pm curfew
115,165,aftershock,US,320 ir icemoon aftershock httptcovam5podgyw ...


3. Strip stop words (and, or, the, etc)

In [11]:
def strip_stop(df, column):
  stop_words = ["and", "or", "the", "just", "my", "a", "an", "mine", "also", "any", "are", "is", "be", "but", "each", "else", "if", "in", "it", "your", "yours", "their", "theirs"]
  df[column] = [' '.join([item for item in x.split() 
                  if item not in stop_words]) 
                  for x in df[column]]

  return df

X_train = strip_stop(X_train, 'text')
X_dev = strip_stop(X_dev, 'text')
X_test = strip_stop(X_test, 'text')
train_whole = strip_stop(train_whole, 'text')

X_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,id,keyword,location,text
4531,6443,injured,Florida,experienced urogyn trying to help mesh injured...
7019,10060,typhoon,"Calgary, AB, Canada",find out how fund was used for typhoon haiyan ...
7502,10731,wreck,Canada BC,raineishida lolim nervous wreck p
2402,3459,derailed,"Washington, DC",happy no one was hurt when wmata train deraile...
413,599,arsonist,Atlanta,nowplaying arsonist mc so impressed arsonistmu...
...,...,...,...,...
4136,5883,hailstorm,"Calgary, Alberta",calgary transit reviewing policy after leaving...
5392,7693,panic,,panic attacks worst
1946,2797,curfew,IM LOST,da judge gave dis girl 5pm curfew
115,165,aftershock,US,320 ir icemoon aftershock httptcovam5podgyw dj...


4. Lemmatise the tweets

In [12]:
!pip install -q wordcloud
import wordcloud

import nltk
nltk.download('wordnet')

w_tokeniser = nltk.tokenize.WhitespaceTokenizer()
lemmatiser = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatiser.lemmatize(w) for w in w_tokeniser.tokenize(text)]

X_train_lemmatised = X_train.copy()

X_train_lemmatised['text'] = X_train_lemmatised.text.apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [13]:
X_train_lemmatised

Unnamed: 0,id,keyword,location,text
4531,6443,injured,Florida,"[experienced, urogyn, trying, to, help, mesh, ..."
7019,10060,typhoon,"Calgary, AB, Canada","[find, out, how, fund, wa, used, for, typhoon,..."
7502,10731,wreck,Canada BC,"[raineishida, lolim, nervous, wreck, p]"
2402,3459,derailed,"Washington, DC","[happy, no, one, wa, hurt, when, wmata, train,..."
413,599,arsonist,Atlanta,"[nowplaying, arsonist, mc, so, impressed, arso..."
...,...,...,...,...
4136,5883,hailstorm,"Calgary, Alberta","[calgary, transit, reviewing, policy, after, l..."
5392,7693,panic,,"[panic, attack, worst]"
1946,2797,curfew,IM LOST,"[da, judge, gave, dis, girl, 5pm, curfew]"
115,165,aftershock,US,"[320, ir, icemoon, aftershock, httptcovam5podg..."


In [14]:
X_train

Unnamed: 0,id,keyword,location,text
4531,6443,injured,Florida,experienced urogyn trying to help mesh injured...
7019,10060,typhoon,"Calgary, AB, Canada",find out how fund was used for typhoon haiyan ...
7502,10731,wreck,Canada BC,raineishida lolim nervous wreck p
2402,3459,derailed,"Washington, DC",happy no one was hurt when wmata train deraile...
413,599,arsonist,Atlanta,nowplaying arsonist mc so impressed arsonistmu...
...,...,...,...,...
4136,5883,hailstorm,"Calgary, Alberta",calgary transit reviewing policy after leaving...
5392,7693,panic,,panic attacks worst
1946,2797,curfew,IM LOST,da judge gave dis girl 5pm curfew
115,165,aftershock,US,320 ir icemoon aftershock httptcovam5podgyw dj...


In [15]:
X_train.text

4531    experienced urogyn trying to help mesh injured...
7019    find out how fund was used for typhoon haiyan ...
7502                    raineishida lolim nervous wreck p
2402    happy no one was hurt when wmata train deraile...
413     nowplaying arsonist mc so impressed arsonistmu...
                              ...                        
4136    calgary transit reviewing policy after leaving...
5392                                  panic attacks worst
1946                    da judge gave dis girl 5pm curfew
115     320 ir icemoon aftershock httptcovam5podgyw dj...
6042    england east coast dogger bank westward 1 seis...
Name: text, Length: 5329, dtype: object

In [16]:
import collections

# Lemmatising helps us build a giant list of all the stripped words after preprocessing, and we can use this to count the occurences of 
# each word to make a sensible decision for M. (bag of words)
word_list = []

for element in X_train_lemmatised['text'].tolist():
  word_list = word_list + element


occurrences = collections.Counter(word_list)
occurrences


Counter({'experienced': 2,
         'urogyn': 1,
         'trying': 18,
         'to': 1349,
         'help': 59,
         'mesh': 1,
         'injured': 33,
         'woman': 66,
         'talk': 14,
         'worst': 15,
         'offender': 1,
         'httptconpoqlkqup9': 1,
         'meshnewsdesk': 1,
         'find': 22,
         'out': 197,
         'how': 123,
         'fund': 6,
         'wa': 261,
         'used': 20,
         'for': 605,
         'typhoon': 18,
         'haiyan': 2,
         'philippine': 8,
         'see': 84,
         'devpeace': 1,
         'relief': 6,
         'report': 40,
         'httptcojwxrx1lsqo': 1,
         'raineishida': 1,
         'lolim': 1,
         'nervous': 1,
         'wreck': 47,
         'p': 15,
         'happy': 16,
         'no': 183,
         'one': 136,
         'hurt': 8,
         'when': 163,
         'wmata': 5,
         'train': 67,
         'derailed': 19,
         'express': 10,
         'bus': 40,
         'so': 227,
     

In [19]:
from collections import Counter 

count = Counter(occurrences.values())
count


Counter({1: 11567,
         2: 1681,
         3: 776,
         4: 460,
         5: 305,
         6: 200,
         7: 155,
         8: 130,
         9: 104,
         10: 87,
         11: 74,
         12: 73,
         13: 65,
         14: 64,
         15: 54,
         16: 41,
         17: 37,
         18: 38,
         19: 41,
         20: 38,
         21: 32,
         22: 28,
         23: 27,
         24: 33,
         25: 23,
         26: 28,
         27: 27,
         28: 21,
         29: 21,
         30: 17,
         31: 13,
         32: 14,
         33: 8,
         34: 18,
         35: 10,
         36: 8,
         37: 12,
         38: 8,
         39: 8,
         40: 9,
         41: 7,
         42: 11,
         43: 4,
         44: 5,
         45: 6,
         46: 2,
         47: 6,
         48: 5,
         49: 3,
         50: 3,
         51: 4,
         52: 2,
         53: 1,
         54: 3,
         55: 5,
         57: 2,
         58: 3,
         59: 4,
         60: 3,
         61: 2,
 

In [20]:
occurrences_updated = {k:v for k,v in occurrences.items() if v != 1}
print(len(occurrences_updated.keys()))
print(len(occurrences.keys()))


4993
16560


## d. Bag of Words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(binary=True, min_df=3)
X_train_counts = count_vect.fit_transform(X_train.text)
X_test_counts = count_vect.transform(X_test.text)

X_train_counts

<5329x3376 sparse matrix of type '<class 'numpy.int64'>'
	with 48283 stored elements in Compressed Sparse Row format>

In [22]:
X_train_counts.shape

(5329, 3376)

In [23]:
X_test_counts.shape

(3263, 3376)

In [24]:
count_vect.vocabulary_.get("this")

2941

In [25]:
y_train.shape

(5329,)

## e. Logistic Regression

### TRAINING SET

i. Log Reg without regularisation.

In [53]:
from sklearn.linear_model import LogisticRegression

# Create an instance of Softmax and fit the data.

logreg = LogisticRegression(penalty='none', C=1e5, multi_class='multinomial', verbose=True)
logreg.fit(X_train_counts, y_train)


#predict on the training set
X_train_predicted = logreg.predict(X_train_counts)


  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.3s finished


In [54]:
X_train_predicted

array([1, 1, 0, ..., 0, 0, 0])

In [55]:
# Calculating the F1 score

from sklearn.metrics import f1_score

f1_score(y_train, X_train_predicted, average='weighted')

0.9853538634598548

In [57]:
count_vect.vocabulary_

{'trying': 3059,
 'to': 2978,
 'help': 1367,
 'injured': 1507,
 'women': 3288,
 'worst': 3307,
 'find': 1123,
 'out': 2106,
 'how': 1433,
 'fund': 1212,
 'was': 3200,
 'used': 3122,
 'for': 1165,
 'typhoon': 3084,
 'philippines': 2177,
 'see': 2562,
 'relief': 2396,
 'funds': 1213,
 'report': 2409,
 'wreck': 3317,
 'happy': 1326,
 'no': 2013,
 'one': 2080,
 'when': 3248,
 'wmata': 3283,
 'train': 3021,
 'derailed': 821,
 'express': 1044,
 'bus': 477,
 'so': 2689,
 'much': 1949,
 'better': 355,
 'than': 2916,
 'metro': 1860,
 'rail': 2332,
 'nowplaying': 2027,
 'arsonist': 242,
 'suicide': 2827,
 'bomber': 414,
 'kills': 1609,
 '15': 18,
 'saudi': 2527,
 'security': 2561,
 'site': 2656,
 'mosque': 1928,
 'itûªs': 1558,
 'time': 2971,
 'do': 874,
 'away': 285,
 'with': 3279,
 'cloud': 626,
 'helping': 1368,
 'water': 3208,
 'wind': 3267,
 'came': 503,
 'through': 2957,
 'probably': 2278,
 'some': 2700,
 'outflow': 2109,
 'havent': 1342,
 'heard': 1355,
 'thunder': 2961,
 'yet': 3344,
 'f

In [63]:
weights = logreg.coef_[0]
max_coef = max(weights)
max_index = np.where(weights == max_coef)
max_index


(array([1391]),)

In [69]:
vocab = count_vect.vocabulary_

keys = [k for k, v in vocab.items() if v == 1391]
print(keys)


['hiroshima']


ii. Log reg with L1 regularisation.

In [70]:
logreg = LogisticRegression(penalty='l1', solver='liblinear', 
                            max_iter=int(1e6),
                            warm_start=True,
                            intercept_scaling=10000.)
logreg.fit(X_train_counts, y_train)

#predict on test set
X_train_predicted_l1 = logreg.predict(X_train_counts)
X_train_predicted_l1

array([1, 1, 0, ..., 0, 0, 0])

In [71]:
f1_score(y_train, X_train_predicted_l1, average='weighted')

0.8809911595373776

In [72]:
weights = logreg.coef_[0]
max_coef = max(weights)
max_index = np.where(weights == max_coef)
max_index


(array([1391]),)

iii. Log reg with L2 Regularisation

In [41]:
logreg = LogisticRegression(penalty='l2', C=1e5, multi_class='multinomial', verbose=True)
logreg.fit(X_train_counts, y_train)

#predict on the training set
X_train_predicted_l2 = logreg.predict(X_train_counts)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


In [42]:
f1_score(y_train, X_train_predicted_l2, average='weighted')

0.9853440846977901

### DEVELOPMENT SET

Performing and testing all the models on the development set.

i. Log Reg without L2 regularisation


In [75]:
# First, creating bag of words on the development set.
count_vect = CountVectorizer(binary=True, min_df=3)
X_dev_counts = count_vect.fit_transform(X_dev.text)
X_test_counts = count_vect.transform(X_test.text)

# Create an instance of Softmax and fit the data.
logreg = LogisticRegression(penalty='none', C=1e5, multi_class='multinomial', verbose=True)
logreg.fit(X_dev_counts, y_dev)


#predict on the training set
X_dev_predicted = logreg.predict(X_dev_counts)

  "Setting penalty='none' will ignore the C and l1_ratio "
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


In [76]:
f1_score(y_dev, X_dev_predicted, average='weighted')

0.9855491360487559

ii. L1 regularisation

In [77]:
logreg = LogisticRegression(penalty='l1', solver='liblinear', 
                            max_iter=int(1e6),
                            warm_start=True,
                            intercept_scaling=10000.)
logreg.fit(X_dev_counts, y_dev)

#predict on test set
X_dev_predicted_l1 = logreg.predict(X_dev_counts)
X_dev_predicted_l1

array([0, 1, 0, ..., 0, 1, 1])

In [78]:
f1_score(y_dev, X_dev_predicted_l1, average='weighted')

0.8887577216499724

In [79]:
weights = logreg.coef_[0]
max_coef = max(weights)
max_index = np.where(weights == max_coef)
max_index


(array([1739]),)

In [80]:
vocab = count_vect.vocabulary_

keys = [k for k, v in vocab.items() if v == 1391]
print(keys)

['set']


iii. Log reg with L2 Regularisation

In [46]:
logreg = LogisticRegression(penalty='l2', C=1e5, multi_class='multinomial', verbose=True)
logreg.fit(X_dev_counts, y_dev)

#predict on the training set
X_dev_predicted_l2 = logreg.predict(X_dev_counts)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


In [47]:
f1_score(y_dev, X_dev_predicted_l2, average='weighted')

0.9855421385321214

iv. Among all three classifiers, 

v. For L1 regularisation, the weight vector is 

In [40]:
X_train

Unnamed: 0,id,keyword,location,text
5397,7699,panicking,?^åá??åá?^?? ??,okay i cant find so im kinda panicking
7382,10565,windstorm,,you find patio table umbrella chairs flipped o...
4979,7104,military,canada,senator alarmed by reports us military familie...
5207,7437,obliterated,Upstairs.,theevilolives its closest structure to hypo ce...
1394,2012,casualties,,another movie theater attackclose to home this...
...,...,...,...,...
6829,9780,trapped,876 Jamrock.,literally trapped room cuz bathroom being remo...
3887,5526,flattened,"new york, ny",who said this yosemite sam drumpf ûïnobody ûil...
2998,4306,dust%20storm,,kids disappear dust storm atmospheric aussie t...
7133,10217,volcano,,eruption of indonesian volcano sparks transpor...


f. Bernoulli Naive Bayes

In [81]:
count_vect = CountVectorizer(binary=True, min_df=3)
X_train_counts = count_vect.fit_transform(X_train.text).toarray()

X_train_counts.shape

(5329, 3376)

In [82]:
X_dev_counts = count_vect.transform(X_dev.text).toarray()
X_dev_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [83]:
n = X_train_counts.shape[0] # size of the dataset
d = X_train_counts.shape[1] # number of features in our dataset
K = 2 # number of clases

# these are the shapes of the parameters
psis = np.zeros([K,d])
phis = np.zeros([K])

# we now compute the parameters
for k in range(K):
    X_k = X_train_counts[y_train.to_numpy() == k]
    psis[k] = np.mean(X_k, axis=0)
    phis[k] = (X_k.shape[0] + 1) / (float(n) +2)

# print out the class proportions
print(phis)


[0.57268805 0.42731195]


In [85]:
def nb_predictions(x, psis, phis):
    """This returns class assignments and scores under the NB model.
    
    We compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y)
    """
    # adjust shapes
    n, d = x.shape
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    
    # clip probabilities to avoid log(0)
    psis = psis.clip(1e-14, 1-1e-14)
    
    # compute log-probabilities
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)
    logpyx = logpxy.sum(axis=2) + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

idx, logpyx = nb_predictions(X_dev_counts, psis, phis)
print(idx[:10])

[0 1 0 1 1 1 1 0 1 0]


In [88]:
f1_score(y_dev, idx, average='weighted')

0.7865501258877472

In [86]:
psis.shape

(2, 3376)

In [87]:
X_dev_counts.shape

(2284, 3376)

h. N-gram Model


In [99]:
# N = 2

count_vect = CountVectorizer(binary=True, min_df=3, ngram_range=(1,2))
X_train_counts = count_vect.fit_transform(X_train.text).toarray()
X_dev_counts = count_vect.fit_transform(X_dev.text).toarray()

X_train_counts.shape

(5329, 5932)

Log Reg with L2 regularisation on 1 and 2 Grams


In [100]:
logreg = LogisticRegression(penalty='l2', C=1e5, multi_class='multinomial', verbose=True)
logreg.fit(X_train_counts, y_train)

#predict on the training set
X_train_predicted_l2 = logreg.predict(X_train_counts)

#predict on the development set
logreg.fit(X_dev_counts, y_dev)
X_dev_predicted_l2 = logreg.predict(X_dev_counts)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.8s finished


In [91]:
X_train_predicted_l2

array([1, 1, 0, ..., 0, 0, 0])

In [101]:
X_dev_predicted_l2

array([0, 1, 0, ..., 0, 1, 1])

In [102]:
print(f1_score(y_train, X_train_predicted_l2, average='weighted'))
print(f1_score(y_dev, X_dev_predicted_l2, average='weighted'))

0.9859133921702686
0.9855456840724779


Bernoulli Naive Bayes

In [105]:
# Training Set

n = X_train_counts.shape[0] # size of the dataset
d = X_train_counts.shape[1] # number of features in our dataset
K = 2 # number of clases

# these are the shapes of the parameters
psis = np.zeros([K,d])
phis = np.zeros([K])

# we now compute the parameters
for k in range(K):
    X_k = X_train_counts[y_train.to_numpy() == k]
    psis[k] = np.mean(X_k, axis=0)
    phis[k] = (X_k.shape[0] + 1) / (float(n) +2)


idx, logpyx = nb_predictions(X_train_counts, psis, phis)
print(idx[:10])

[0 1 0 1 0 1 0 0 1 0]


In [106]:
f1_score(y_train, idx, average='weighted')

0.8770717213823563

In [107]:
# Development Set


n = X_dev_counts.shape[0] # size of the dataset
d = X_dev_counts.shape[1] # number of features in our dataset
K = 2 # number of clases

# these are the shapes of the parameters
psis = np.zeros([K,d])
phis = np.zeros([K])

# we now compute the parameters
for k in range(K):
    X_k = X_dev_counts[y_dev.to_numpy() == k]
    psis[k] = np.mean(X_k, axis=0)
    phis[k] = (X_k.shape[0] + 1) / (float(n) +2)


idx, logpyx = nb_predictions(X_dev_counts, psis, phis)
print(idx[:10])

[0 1 0 1 1 1 0 1 1 0]


In [108]:
f1_score(y_dev, idx, average='weighted')

0.8756319030079861

In [95]:
features = count_vect.get_feature_names()
features

['05',
 '06',
 '10',
 '100',
 '1000',
 '11',
 '12',
 '1200',
 '12000',
 '12000 nigerian',
 '15',
 '15 saudi',
 '16',
 '16yr',
 '16yr old',
 '17',
 '18',
 '19',
 '1945',
 '1980',
 '1st',
 '20',
 '2013',
 '2015',
 '24',
 '25',
 '26',
 '2nd',
 '2us',
 '2us cable',
 '30',
 '30 fires',
 '31',
 '31 md',
 '32',
 '33',
 '370',
 '3d',
 '3g',
 '3g this',
 '40',
 '40 families',
 '4x4',
 '50',
 '500',
 '5km',
 '5km of',
 '60',
 '600',
 '70',
 '70 years',
 '70th',
 '70th anniversary',
 '731',
 '731 of',
 '911',
 '97georgia',
 '97georgia ave',
 'aba',
 'aba as',
 'abandoned',
 'abc',
 'abc news',
 'ablaze',
 'about',
 'about how',
 'about to',
 'about trapped',
 'absolutely',
 'abstorm',
 'abuse',
 'access',
 'access to',
 'accident',
 'account',
 'across',
 'act',
 'action',
 'actions',
 'actual',
 'actually',
 'added',
 'added video',
 'address',
 'advance',
 'advisory',
 'affected',
 'affected by',
 'afghan',
 'afghanistan',
 'after',
 'after atomic',
 'after boat',
 'after copilot',
 'after disn

In [96]:
two_grams = [x for x in features if len(x.split()) == 2]
two_grams[:10]


['12000 nigerian',
 '15 saudi',
 '16yr old',
 '2us cable',
 '30 fires',
 '31 md',
 '3g this',
 '40 families',
 '5km of',
 '70 years']

i. Report Results

In [109]:
count_vect = CountVectorizer(binary=True, min_df=3)
train_whole_counts = count_vect.fit_transform(train_whole.text)
X_test_counts = count_vect.transform(X_test.text)

In [110]:
logreg = LogisticRegression(penalty='l2', C=1e5, multi_class='multinomial', verbose=True)
logreg.fit(train_whole_counts, y)

#predict on the test set
test_whole_predicted_l2 = logreg.predict(X_test_counts)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished


In [111]:
test_whole_predicted_l2

array([1, 1, 1, ..., 1, 1, 1])

In [204]:
id = test_df['id']
results = pd.DataFrame({"id":id, "target": test_whole_predicted_l2})
results.to_csv("NLP_results.csv", index=False)