# Logistic Regression and Naive Bayes over Count Vectorizer

In [47]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

def flatten(t):
    return [item for sublist in t for item in sublist]

def load_dfs(test_size=0.1, shuffle=False, verbose=True):
    df = pd.read_csv("data/train.csv")
    df = df.drop(['keyword', 'location'], axis=1)
    
    df_test = pd.read_csv("data/test.csv")
    df_test = df_test.drop(['keyword', 'location'], axis=1)
    
    df_sub = pd.read_csv("data/sample_submission.csv")
    
    df_train, df_val = train_test_split(df, test_size=test_size, shuffle=shuffle)

    if verbose:
        print(f"train shape: {df_train.shape}")
        print(f"val shape  : {df_val.shape}")
        print(f"test shape : {df_test.shape}")
    return df_train, df_val, df_test, df_sub


def count_vect_pipeline(df_train, df_val, df_test, 
                        vectorizer, model_fn,
                        create_submission=False):
    X_train = vectorizer.fit_transform(df_train['text'])
    df_feature_train = pd.DataFrame(X_train.todense(), columns=vectorizer.get_feature_names())
    
    X_val = vectorizer.transform(df_val['text'])
    df_feature_val = pd.DataFrame(X_val.todense(), columns=vectorizer.get_feature_names())
    
    X_test = vectorizer.transform(df_test['text'])
    df_feature_test = pd.DataFrame(X_test.todense(), columns=vectorizer.get_feature_names())
    
    
    y_pred_train = model_fn(df_feature_train)
    y_pred_val = model_fn(df_feature_val)
    y_pred_test = model_fn(df_feature_test)
    
    train_acc = accuracy_score(df_train['target'], y_pred_train)
    val_acc = accuracy_score(df_val['target'], y_pred_val)
    print(f"Train accuracy: {train_acc:.3f}")
    print(f"Val accuracy  : {val_acc:.3f}")
    
    df_test['target'] = y_pred_test
    if create_submission is not False:
        df_test[['id', 'target']].to_csv(f"{create_submission}.csv", index=False)
        
    return df_test[['id', 'target']]


In [20]:
df_train, df_val, df_test, df_sub = load_dfs()

train shape: (6851, 3)
val shape  : (762, 3)
test shape : (3263, 2)


In [32]:
#!pip install nltk
from nltk.corpus import stopwords

In [41]:
def get_vocab(df, drop_stopwords=True, keep_only_alpha=True):
    l = flatten(df['text'].str.lower().str.split().tolist())
    if drop_stopwords:
        st = stopwords.words('english')
        l = [w for w in l if w not in st and (not keep_only_alpha or w.isalpha())]
    return Counter(l)
    

In [54]:
V_disaster = get_vocab(df_train[df_train['target'] == 1])
V_disaster.most_common(50)

[('fire', 138),
 ('via', 110),
 ('suicide', 100),
 ('people', 87),
 ('like', 80),
 ('killed', 75),
 ('police', 70),
 ('california', 69),
 ('two', 67),
 ('disaster', 67),
 ('train', 66),
 ('emergency', 65),
 ('crash', 65),
 ('bombing', 63),
 ('bomb', 63),
 ('get', 62),
 ('families', 61),
 ('buildings', 61),
 ('burning', 60),
 ('news', 58),
 ('bomber', 57),
 ('atomic', 56),
 ('hiroshima', 56),
 ('still', 53),
 ('fatal', 53),
 ('one', 52),
 ('nuclear', 52),
 ('accident', 49),
 ('new', 49),
 ('years', 48),
 ('debris', 48),
 ('storm', 48),
 ('homes', 48),
 ('may', 47),
 ('watch', 47),
 ('attack', 46),
 ('northern', 46),
 ('collapse', 46),
 ('mass', 46),
 ('first', 45),
 ('forest', 44),
 ('near', 44),
 ('car', 44),
 ('dead', 44),
 ('war', 44),
 ('severe', 43),
 ('oil', 43),
 ('fires', 42),
 ('man', 40),
 ('army', 40)]

# LB: 65%

In [57]:
# LB score: 0.65124
disaster_words = ['pain', 'trauma', 'tornado', 'crash', 
                  'hurricane', 'flood', 'dead', 'death', 'fire', 'forest',
                  'suicide', 'killed', 'police', 'disaster', 'train', 'emergency', 'bombing', 'bomb',
                  'families', 'burning', 'news', 'bomber', 'atomic', 'hiroshima', 'fatal', 'nuclear',
                  'accident', 'storm', 'homes', 'attack', 'car', 'war', 'severe', 'oil', 'fires', 'army']

def model(sample):
    for feature in disaster_words:
        if sample[feature] != 0:
            return 1
    return 0


def handcrafted_model(df):
    y_pred = df.apply(model, axis=1)
    return y_pred



vect = CountVectorizer(vocabulary=disaster_words)
df_sub1 = count_vect_pipeline(df_train, df_val, df_test, vect, handcrafted_model, 
                              create_submission='handcrafted-3')

Train accuracy: 0.684
Val accuracy  : 0.707


In [50]:
V_normal = get_vocab(df_train[df_train['target'] == 0])
V_normal.most_common()

[('like', 211),
 ('get', 149),
 ('new', 143),
 ('body', 105),
 ('one', 103),
 ('via', 88),
 ('would', 86),
 ('full', 81),
 ('emergency', 75),
 ('got', 74),
 ('love', 72),
 ('people', 72),
 ('know', 70),
 ('see', 70),
 ('video', 69),
 ('going', 67),
 ('time', 64),
 ('back', 64),
 ('want', 61),
 ('go', 56),
 ('think', 56),
 ('fire', 56),
 ('first', 54),
 ('still', 54),
 ('day', 54),
 ('u', 54),
 ('last', 53),
 ('make', 51),
 ('burning', 51),
 ('us', 50),
 ('need', 50),
 ('really', 49),
 ('let', 49),
 ('man', 48),
 ('good', 48),
 ('many', 47),
 ('even', 45),
 ('take', 45),
 ('world', 44),
 ('lol', 44),
 ('way', 43),
 ('feel', 43),
 ('fear', 43),
 ('say', 42),
 ('work', 42),
 ('cross', 42),
 ('every', 40),
 ('never', 40),
 ('life', 39),
 ('read', 39),
 ('help', 39),
 ('im', 38),
 ('content', 37),
 ('much', 36),
 ('check', 36),
 ('top', 36),
 ('could', 36),
 ('may', 36),
 ('ruin', 36),
 ('screaming', 35),
 ('bloody', 35),
 ('great', 35),
 ('right', 35),
 ('bag', 35),
 ('look', 34),
 ('anoth

In [53]:
V = get_vocab(df_train)
V.most_common(100)

[('like', 291),
 ('get', 211),
 ('via', 198),
 ('fire', 194),
 ('new', 192),
 ('people', 159),
 ('one', 155),
 ('emergency', 140),
 ('body', 117),
 ('would', 116),
 ('burning', 111),
 ('still', 107),
 ('suicide', 106),
 ('police', 101),
 ('got', 100),
 ('first', 99),
 ('video', 95),
 ('know', 94),
 ('going', 93),
 ('back', 92),
 ('disaster', 92),
 ('two', 91),
 ('see', 90),
 ('full', 89),
 ('buildings', 89),
 ('man', 88),
 ('bomb', 88),
 ('time', 87),
 ('us', 84),
 ('crash', 84),
 ('may', 83),
 ('love', 81),
 ('go', 80),
 ('last', 78),
 ('many', 78),
 ('nuclear', 78),
 ('killed', 77),
 ('day', 76),
 ('say', 75),
 ('think', 75),
 ('want', 74),
 ('news', 74),
 ('car', 73),
 ('train', 73),
 ('california', 72),
 ('could', 72),
 ('watch', 71),
 ('u', 71),
 ('attack', 70),
 ('years', 69),
 ('army', 69),
 ('world', 69),
 ('storm', 69),
 ('rt', 68),
 ('mass', 68),
 ('collapse', 67),
 ('dead', 66),
 ('make', 65),
 ('good', 65),
 ('bombing', 65),
 ('families', 65),
 ('really', 64),
 ('need', 64)

In [62]:
from sklearn.linear_model import LogisticRegression, LinearRegression

In [61]:
n_words = 200
top_n_words = [w for w, c in V.most_common(n_words)]
top_n_words

['like',
 'get',
 'via',
 'fire',
 'new',
 'people',
 'one',
 'emergency',
 'body',
 'would',
 'burning',
 'still',
 'suicide',
 'police',
 'got',
 'first',
 'video',
 'know',
 'going',
 'back',
 'disaster',
 'two',
 'see',
 'full',
 'buildings',
 'man',
 'bomb',
 'time',
 'us',
 'crash',
 'may',
 'love',
 'go',
 'last',
 'many',
 'nuclear',
 'killed',
 'day',
 'say',
 'think',
 'want',
 'news',
 'car',
 'train',
 'california',
 'could',
 'watch',
 'u',
 'attack',
 'years',
 'army',
 'world',
 'storm',
 'rt',
 'mass',
 'collapse',
 'dead',
 'make',
 'good',
 'bombing',
 'families',
 'really',
 'need',
 'accident',
 'another',
 'take',
 'fatal',
 'even',
 'way',
 'school',
 'look',
 'home',
 'work',
 'war',
 'atomic',
 'let',
 'bomber',
 'hiroshima',
 'life',
 'help',
 'never',
 'today',
 'every',
 'death',
 'city',
 'right',
 'fear',
 'forest',
 'fires',
 'read',
 'said',
 'old',
 'homes',
 'near',
 'please',
 'feel',
 'much',
 'oil',
 'getting',
 'im',
 'debris',
 'lol',
 'top',
 'tru

In [None]:
vect_top_n = CountVectorizer(vocabulary=top_n_words)

In [63]:
m = LinearRegression()

In [None]:
(1, 2)
(2, 4)
(3, 6)
(4, 8)

In [64]:
m.fit(X=[[1], [2], [3], [4]], y=[2, 4, 6, 8])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [65]:
m.predict([[5]])

array([10.])

In [66]:
m.coef_
y = coef_ * x + intercept_

array([2.])

In [68]:
m.intercept_

-1.7763568394002505e-15

In [69]:
m2 = LinearRegression()

m2.fit(X=[[2], [3], [4], [5]], y=[2, 4, 6, 8])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [71]:
m2.coef_

array([2.])

In [70]:
m2.intercept_

-2.0000000000000036

In [74]:
m2.coef_* 2  + m2.intercept_

array([2.])

In [73]:
m2.coef_* 6  + m2.intercept_

array([10.])

In [72]:
m2.predict([[6]])

array([10.])

In [76]:
def count_vect_pipeline_v2(df_train, df_val, df_test, 
                        vectorizer, model,
                        create_submission=False):
    X_train = vectorizer.fit_transform(df_train['text'])
    df_feature_train = pd.DataFrame(X_train.todense(), columns=vectorizer.get_feature_names())
    
    X_val = vectorizer.transform(df_val['text'])
    df_feature_val = pd.DataFrame(X_val.todense(), columns=vectorizer.get_feature_names())
    
    X_test = vectorizer.transform(df_test['text'])
    df_feature_test = pd.DataFrame(X_test.todense(), columns=vectorizer.get_feature_names())
    
    model.fit(df_feature_train, df_train['target'])
    y_pred_train = model.predict(df_feature_train)
    y_pred_val = model.predict(df_feature_val)
    y_pred_test = model.predict(df_feature_test)
    
    
    train_acc = accuracy_score(df_train['target'], y_pred_train)
    val_acc = accuracy_score(df_val['target'], y_pred_val)
    print(f"Train accuracy: {train_acc:.3f}")
    print(f"Val accuracy  : {val_acc:.3f}")
    
    df_test['target'] = y_pred_test
    if create_submission is not False:
        df_test[['id', 'target']].to_csv(f"{create_submission}.csv", index=False)
        
    return model, df_feature_train

In [78]:
vect_top_n = CountVectorizer(vocabulary=top_n_words)
m3 = LogisticRegression()
m3, df_feature_train = count_vect_pipeline_v2(df_train, df_val, df_test, vect_top_n, m3, 'logistic_over_200_top_words')

Train accuracy: 0.754
Val accuracy  : 0.718


In [81]:
df_feature_train.head()

Unnamed: 0,like,get,via,fire,new,people,one,emergency,body,would,...,dust,times,blood,whole,survivors,destruction,destroyed,real,keep,put
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
cw = pd.DataFrame(zip(df_feature_train.columns, m3.coef_[0]), columns=['feature', 'weight'])

In [87]:
cw.sort_values("weight").head(10)

Unnamed: 0,feature,weight
23,full,-1.827851
179,ruin,-1.645006
31,love,-1.276482
199,put,-1.135401
151,fucking,-1.118411
8,body,-1.033189
145,god,-1.014095
75,let,-0.983474
101,lol,-0.964643
136,check,-0.956705


In [88]:
cw.sort_values("weight").tail(10)

Unnamed: 0,feature,weight
93,near,1.803395
128,severe,1.850278
36,killed,1.961608
44,california,1.964473
178,outbreak,2.074993
141,spill,2.174942
43,train,2.395358
165,migrants,2.715518
77,hiroshima,2.866165
100,debris,2.890948


In [89]:
n_words = 1000
top_n_words = [w for w, c in V.most_common(n_words)]
top_n_words
vect_top_n = CountVectorizer(vocabulary=top_n_words)
m1000 = LogisticRegression()
m1000, df_feature_train = count_vect_pipeline_v2(df_train, df_val, df_test, vect_top_n, m1000, f'logistic_over_1000_top_words')

Train accuracy: 0.846
Val accuracy  : 0.769


In [92]:
n_words = 5000
top_n_words = [w for w, c in V.most_common(n_words)]
top_n_words
vect_top_n = CountVectorizer(vocabulary=top_n_words)
m = LogisticRegression()
m, df_feature_train = count_vect_pipeline_v2(df_train, df_val, df_test, vect_top_n, m, f'logistic_over_{n_words}_top_words')

Train accuracy: 0.921
Val accuracy  : 0.776


In [93]:
cw = pd.DataFrame(zip(df_feature_train.columns, m.coef_[0]), columns=['feature', 'weight'])
cw.sort_values("weight").tail(10)

Unnamed: 0,feature,weight
454,drought,1.807997
52,storm,1.901454
257,massacre,1.936732
129,floods,1.961371
100,debris,1.987077
835,hailstorm,2.010973
602,derailment,2.057129
59,bombing,2.099211
560,earthquake,2.431265
77,hiroshima,2.832892


In [94]:
cw.sort_values("weight").head(10)

Unnamed: 0,feature,weight
23,full,-1.371386
307,ebay,-1.358745
318,ass,-1.354927
4295,poll,-1.313449
475,song,-1.271447
291,bags,-1.244992
614,eyes,-1.226552
179,ruin,-1.221095
228,better,-1.203329
795,join,-1.162025


In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
n_words = 5000
top_n_words = [w for w, c in V.most_common(n_words)]
top_n_words
vect_top_n = TfidfVectorizer(vocabulary=top_n_words)
m = LogisticRegression()
m, df_feature_train = count_vect_pipeline_v2(df_train, df_val, df_test, vect_top_n, m, f'logistic_over_{n_words}_top_words_tfidf')

Train accuracy: 0.870
Val accuracy  : 0.795


In [100]:
from sklearn.naive_bayes import MultinomialNB

$P( disaster \mid \text{this tweet})$

https://en.wikipedia.org/wiki/Bayes%27_theorem

$P(A\mid B)=\frac {P(B\mid A) \cdot P(A)}{P(B)}$


$P( disaster \mid \text{this tweet}) =\frac {P(\text{this tweet}\mid disaster) \cdot P(disaster)}{P(\text{this tweet})}$



0    0.574369
1    0.425631
Name: target, dtype: float64

$P( disaster \mid \text{this tweet})  \varpropto P(\text{this tweet}\mid disaster) \cdot P(disaster)$

$P( non disaster \mid \text{this tweet})  \varpropto P(\text{this tweet}\mid non disaster) \cdot P(non disaster)$

In [101]:
n_words = 5000
top_n_words = [w for w, c in V.most_common(n_words)]
top_n_words
vect_top_n = CountVectorizer(vocabulary=top_n_words)
m = MultinomialNB()
m, df_feature_train = count_vect_pipeline_v2(df_train, df_val, df_test, vect_top_n, m, f'nb_{n_words}_top_words')

Train accuracy: 0.861
Val accuracy  : 0.770


In [109]:
import numpy as np

display(df_train['target'].value_counts(normalize=True))

np.exp(m.class_log_prior_)

0    0.574369
1    0.425631
Name: target, dtype: float64

array([0.57436871, 0.42563129])

$P( disaster \mid \text{this tweet})  \varpropto P(\text{this tweet}\mid disaster) \cdot P(disaster)$

$P( non disaster \mid \text{this tweet})  \varpropto P(\text{this tweet}\mid non disaster) \cdot P(non disaster)$

$P(\text{this tweet}\mid disaster) = P(\text{tweet words }\mid disaster)$ 

P(w|disaster)

In [119]:
np.exp(m.coef_).max()

0.07015357777016296

In [115]:
# m.coef_.min()

-10.376642461280849

In [116]:
#m.coef_.max()

-2.657068472021267

In [126]:
wp = pd.DataFrame(zip(top_n_words, np.exp(m.coef_[0])), columns=['word', 'proba'])

In [128]:
wp.sort_values("proba").head(10)

Unnamed: 0,word,proba
2499,thinks,3.1e-05
3154,applications,3.1e-05
3151,volga,3.1e-05
3150,enter,3.1e-05
3148,devalue,3.1e-05
3146,mcilroy,3.1e-05
3143,tastes,3.1e-05
3142,headset,3.1e-05
3141,bluedio,3.1e-05
3130,password,3.1e-05


In [129]:
wp.sort_values("proba").tail(10)

Unnamed: 0,word,proba
77,hiroshima,0.00271
44,california,0.002804
20,disaster,0.002866
5,people,0.003084
12,suicide,0.003427
2,via,0.003582
41,news,0.003956
3,fire,0.005171
4801,http,0.066073
2177,co,0.070154
