In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [2]:
train = pd.read_csv('train.csv')

In [71]:
test = pd.read_csv('test.csv')

In [51]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
len(train)

7613

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
id          3263 non-null int64
keyword     3237 non-null object
location    2158 non-null object
text        3263 non-null object
dtypes: int64(1), object(3)
memory usage: 102.0+ KB


In [8]:
train['location'].fillna('None',inplace = True)
train['keyword'].fillna('None',inplace = True)

In [72]:
test['location'].fillna('None',inplace = True)
test['keyword'].fillna('None',inplace = True)

In [10]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [138]:
import spacy

In [139]:
nlp = spacy.load('en_core_web_md')

OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

# Better way to create vectors?

In [15]:
vectors = np.array([nlp(tweet.text).vector for idx, tweet in train.iterrows()])

In [17]:
from sklearn.model_selection import train_test_split

In [19]:
vectors

array([[ 0.13452284,  0.23726209, -0.11959665, ...,  1.0300049 ,
         2.0431578 ,  1.0130912 ],
       [ 1.0309796 , -1.241845  , -0.70834446, ...,  2.0340476 ,
         1.3641878 , -0.49217844],
       [-0.07072449,  1.0874252 , -1.0262684 , ...,  0.14652216,
         0.01183708,  0.8379208 ],
       ...,
       [ 0.81655306, -0.525268  ,  0.08102124, ...,  0.07882321,
         0.07872479,  1.1387758 ],
       [ 1.3380736 , -0.34899577,  0.02064458, ..., -0.07607778,
         0.99538714,  1.0481292 ],
       [ 0.8571995 , -0.5512727 ,  0.12522113, ..., -0.12230115,
         1.0029335 ,  0.71867335]], dtype=float32)

In [21]:
train2 = pd.DataFrame(vectors)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(vectors,train['target'],test_size = .3, random_state = 42)

In [24]:
from sklearn.svm import LinearSVC

In [27]:
svc = LinearSVC(random_state = 42, dual = False)

In [28]:
svc.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [29]:
from sklearn.metrics import accuracy_score, classification_report

In [30]:
svc_pred = svc.predict(X_test)

In [31]:
print(classification_report(y_test,svc_pred))

              precision    recall  f1-score   support

           0       0.72      0.79      0.75      1318
           1       0.66      0.58      0.62       966

   micro avg       0.70      0.70      0.70      2284
   macro avg       0.69      0.68      0.68      2284
weighted avg       0.69      0.70      0.69      2284



In [33]:
print(accuracy_score(y_test,svc_pred))

0.696584938704028


# Could be better.... maybe try the centering approach? Pipeline?

# pipeline approach

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
text_clf = Pipeline([('tfidf',TfidfVectorizer()),
                    ('clf',LinearSVC()),
])

In [112]:
X = train['text']
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [42]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [43]:
predictions = text_clf.predict(X_test)

In [45]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82      1446
           1       0.77      0.74      0.75      1067

   micro avg       0.79      0.79      0.79      2513
   macro avg       0.79      0.79      0.79      2513
weighted avg       0.79      0.79      0.79      2513



In [48]:
print(accuracy_score(y_test,predictions))

0.7938718662952646


In [52]:
#create submission for easy pipeline

In [94]:
thing = text_clf.predict(test['text'])
submission = pd.DataFrame({
    'id': test.id,
    'target': thing
})
submission.to_csv('submission_pipeline_v1.csv', index = False)

# A bit better...centering approach? find some way to use locations/keywords?

scores .78425 on Kaggle leaderboard

# Centering approach with tfidf, using https://www.kaggle.com/saga21/disaster-tweets-comp-introduction-to-nlp/notebook as a reference

In [100]:
vectors

array([[ 0.13452284,  0.23726209, -0.11959665, ...,  1.0300049 ,
         2.0431578 ,  1.0130912 ],
       [ 1.0309796 , -1.241845  , -0.70834446, ...,  2.0340476 ,
         1.3641878 , -0.49217844],
       [-0.07072449,  1.0874252 , -1.0262684 , ...,  0.14652216,
         0.01183708,  0.8379208 ],
       ...,
       [ 0.81655306, -0.525268  ,  0.08102124, ...,  0.07882321,
         0.07872479,  1.1387758 ],
       [ 1.3380736 , -0.34899577,  0.02064458, ..., -0.07607778,
         0.99538714,  1.0481292 ],
       [ 0.8571995 , -0.5512727 ,  0.12522113, ..., -0.12230115,
         1.0029335 ,  0.71867335]], dtype=float32)

In [101]:
tfidf = TfidfVectorizer()

In [103]:
v2 = tfidf.fit_transform(train['text'])

In [105]:
v2.data

array([0.2530913 , 0.4187282 , 0.1893734 , ..., 0.29286755, 0.32283076,
       0.46333344])

In [106]:
vectors.shape

(7613, 96)

In [107]:
v2.shape

(7613, 21637)

In [110]:
#v2_mean = v2.mean(axis = 0)
#v2_centered = pd.DataFrame([vec - v2_mean for vec in v2])
# center count vectorizer first, then tfidf transform? dunno

# Center the vectors
vec_mean = vectors.mean(axis=0)
centered = pd.DataFrame([vec - vec_mean for vec in vectors])

In [109]:
v2_mean

matrix([[8.55190925e-04, 1.58998920e-04, 4.11528597e-05, ...,
         5.55664717e-05, 4.80836020e-05, 4.38917349e-05]])

In [111]:
vec_mean

array([ 0.43400463,  0.01175474, -0.1967308 ,  0.13403375,  1.1274575 ,
        0.753622  ,  0.8892787 ,  0.3574361 ,  1.2680839 ,  1.2098266 ,
        0.15303732, -0.11824041,  0.00519462, -0.5077297 , -0.7510572 ,
       -0.54077667, -0.39519203,  0.2712668 , -0.31405494, -0.55249166,
        0.37466276, -0.32662278, -0.18854298, -0.4474655 , -0.64347464,
        0.05979219, -0.43043375, -0.73664963,  0.6080096 , -0.675254  ,
        0.6287136 ,  0.09368932,  0.12316993, -0.30363876,  0.13973513,
       -1.1184596 ,  1.0485252 , -0.9474972 , -1.0091994 , -0.33984697,
        1.3346994 ,  0.2737795 , -0.01350499, -1.5115973 , -0.14355847,
       -0.08093356,  0.02371505, -0.39492685, -0.7177279 ,  0.50176126,
        0.6513506 , -0.75678104, -0.2604513 , -0.0474222 , -1.485926  ,
        0.48788255,  0.88382834,  0.70749426,  0.2435302 ,  0.24587278,
        0.65339845, -0.25248578,  0.75328124,  0.18466805,  0.6883527 ,
       -0.25942162,  0.54925036, -1.0541772 , -0.5782776 ,  0.61

In [123]:
cent_svc = LinearSVC(random_state = 21, dual = False)

In [127]:
X_train, X_valid, y_train, y_valid = train_test_split(centered[:len(train)], train['target'], 
                                                      test_size=0.1, random_state=21)

In [130]:
cent_svc.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=21, tol=0.0001,
     verbose=0)

In [133]:
cent_svc_pred = cent_svc.predict(X_valid)
print(classification_report(y_valid,cent_svc_pred))
print(accuracy_score(y_valid,cent_svc_pred))

              precision    recall  f1-score   support

           0       0.72      0.79      0.75       437
           1       0.68      0.58      0.62       325

   micro avg       0.70      0.70      0.70       762
   macro avg       0.70      0.69      0.69       762
weighted avg       0.70      0.70      0.70       762

0.7020997375328084


# the link's notebook scores 81.759%...and works better in the kaggle kernel.
# maybe 'en_core_web_lg' vs 'en_core_web_sm'?

# either way, the pipeline slightly outperforms this centering approach.