<h1> Disaster tweets: a hand-crafted keyword baseline </h1>

![Sketch of the ML pipeline](ml-pipeline-sketch.jpg)


In [72]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the train and test files without the 'keyword' and 'location' columns,
# plus the sample submission file.
df = pd.read_csv("data/train.csv").drop(columns=['keyword', 'location'])
df_test = pd.read_csv("data/test.csv").drop(columns=['keyword', 'location'])
df_sub = pd.read_csv("data/sample_submission.csv")

# Train-validation split

In [73]:
# Hold out the last `pct_val` fraction of the rows as the validation set.
# NOTE: this is a positional tail split; rows are taken in file order, with no shuffling.
pct_val = 0.1
n_val = int(pct_val * len(df))
# Split on an explicit position instead of `-n_val`: if n_val were 0, then
# `df.iloc[:-n_val]` would be empty and `df.iloc[-n_val:]` would be the whole frame.
n_train = len(df) - n_val
# Copy the slices so that later column assignments never act on views of `df`.
df_train, df_val = df.iloc[:n_train].copy(), df.iloc[n_train:].copy()
df_train.shape, df_val.shape

((6852, 3), (761, 3))

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Shuffled split via scikit-learn. The seed is fixed (42 is arbitrary) so that
# the same rows land in each part on every run.
df_train2, df_val2 = train_test_split(df, test_size=pct_val, random_state=42)
df_train2.shape

(6851, 3)

In [8]:
df_val2.shape

(762, 3)

In [9]:
df_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
df_val.head()

Unnamed: 0,id,text,target
6852,9820,@PTSD_Chat Yes. I feel the root of that is Sha...,1
6853,9822,Hiroshima: They told me to paint my story: Eig...,1
6854,9823,Photo: lavenderpoetrycafe: The Forgotten Histo...,1
6855,9826,Trauma injuries involving kids and sport usual...,1
6856,9828,Butt Trauma Extraordinaire,1


In [12]:
df_train['target'].value_counts(normalize=True)

0    0.574431
1    0.425569
Name: target, dtype: float64

In [13]:
df_val['target'].value_counts(normalize=True)

0    0.533509
1    0.466491
Name: target, dtype: float64

# Hand-crafted model

In [14]:
# Count vectorizer intuition

In [15]:
tweet   = "@PTSD_Chat Yes. I feel the root of that is S"

columns = ['the', 'yes', 'a', 'chaos', 'tornado']

x       = [1, 1, 0, 0, 0]

In [16]:
df_train.head(20)

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
5,8,#RockyFire Update => California Hwy. 20 closed...,1
6,10,#flood #disaster Heavy rain causes flash flood...,1
7,13,I'm on top of the hill and I can see a fire in...,1
8,14,There's an emergency evacuation happening now ...,1
9,15,I'm afraid that the tornado is coming to our a...,1


In [17]:
disaster_words = ['pain', 'trauma', 'tornado', 'crash', 'hurricane', 'flood', 'dead', 'death', 'fire', 'forest']

In [18]:
"a a a ".count("a")

3

In [19]:
def create_features(text, vocabulary=None):
    """Count how often each vocabulary word occurs in a piece of text.

    Args:
        text: String to featurize. It is lower-cased before counting.
        vocabulary: Iterable of words to count. When omitted, the notebook-level
            `disaster_words` list is used.

    Returns:
        dict mapping every vocabulary word to its number of occurrences in the
        lower-cased text.

    Note:
        Counting uses `str.count`, i.e. non-overlapping *substring* matches, so
        'fire' is also counted inside 'wildfires'. The vocabulary words
        themselves are not lower-cased.
    """
    if vocabulary is None:
        # Fall back to the notebook-level word list.
        vocabulary = disaster_words
    lowered = text.lower()
    return {word: lowered.count(word) for word in vocabulary}

In [23]:
tweet = "I'm in pain because of the tornado. Much death. So fire. Fire all around."
x = create_features(tweet)

In [24]:
x

{'pain': 1,
 'trauma': 0,
 'tornado': 1,
 'crash': 0,
 'hurricane': 0,
 'flood': 0,
 'dead': 0,
 'death': 1,
 'fire': 2,
 'forest': 0}

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
vect = CountVectorizer(vocabulary=disaster_words)

In [30]:
X = vect.fit_transform([tweet])

In [31]:
# `vect` was built with vocabulary=disaster_words, so column i of X counts
# disaster_words[i]. Labelling with that list avoids get_feature_names(),
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
df_X = pd.DataFrame(X.toarray(), columns=disaster_words)

In [33]:
df_X.iloc[0].to_dict()

{'pain': 1,
 'trauma': 0,
 'tornado': 1,
 'crash': 0,
 'hurricane': 0,
 'flood': 0,
 'dead': 0,
 'death': 1,
 'fire': 2,
 'forest': 0}

In [34]:
df_X.iloc[0].to_dict() == x

True

In [35]:
df_X

Unnamed: 0,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,1,0,1,0,0,0,0,1,2,0


In [36]:
X2 = vect.fit_transform([tweet, "The forest was so nice"])

In [37]:
# `vect` uses the fixed vocabulary `disaster_words`, so column i of X2 counts
# disaster_words[i]. This avoids get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df_X2 = pd.DataFrame(X2.toarray(), columns=disaster_words)
df_X2

Unnamed: 0,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,1,0,1,0,0,0,0,1,2,0
1,0,0,0,0,0,0,0,0,0,1


In [39]:
features_by_hand = df_train['text'].apply(create_features)

In [45]:
df_features = pd.DataFrame(features_by_hand.to_list())

In [47]:
df_train = df_train.join(df_features)

In [49]:
vect2 = CountVectorizer(vocabulary=disaster_words)

In [50]:
# Count the hand-picked vocabulary over every training tweet with CountVectorizer.
X3 = vect2.fit_transform(df_train['text'])
# `vect2` uses the fixed vocabulary `disaster_words`, so column i of X3 counts
# disaster_words[i]. This avoids get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df_features_sklearn = pd.DataFrame(X3.toarray(), columns=disaster_words)
df_features_sklearn

Unnamed: 0,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
6847,0,1,0,0,0,0,0,0,0,0
6848,0,1,0,0,0,0,0,0,0,0
6849,0,1,0,0,0,0,0,0,0,0
6850,0,1,0,0,0,0,0,0,0,0


In [56]:
df_features_sklearn.iloc[3]['fire']

0

In [57]:
df_features.iloc[3]['fire']

1

In [59]:
df_train.iloc[3]['text']

'13,000 people receive #wildfires evacuation orders in California '

In [55]:
(df_features_sklearn == df_features)

Unnamed: 0,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,False,True
4,True,True,True,True,True,True,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...
6847,True,True,True,True,True,True,True,True,True,True
6848,True,True,True,True,True,True,True,True,True,True
6849,True,True,True,True,True,True,True,True,True,True
6850,True,True,True,True,True,True,True,True,True,True


In [63]:
# Learn a vocabulary of at most 100 terms from the training tweets,
# ignoring English stop words.
vect3 = CountVectorizer(max_features=100, stop_words='english')
X4 = vect3.fit_transform(df_train['text'])
# `vocabulary_` maps term -> column index, so sorting the terms by index gives
# the column labels. This avoids get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
feature_names_large = sorted(vect3.vocabulary_, key=vect3.vocabulary_.get)
df_features_large = pd.DataFrame(X4.toarray(), columns=feature_names_large)
df_features_large.head()

Unnamed: 0,11,2015,accident,amp,army,atomic,attack,best,black,body,...,watch,water,way,work,world,year,years,youtube,û_,ûªs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
df_features_large.columns

Index(['11', '2015', 'accident', 'amp', 'army', 'atomic', 'attack', 'best',
       'black', 'body', 'bomb', 'bomber', 'bombing', 'buildings', 'burning',
       'california', 'car', 'city', 'collapse', 'crash', 'damage', 'day',
       'dead', 'death', 'did', 'disaster', 'don', 'emergency', 'families',
       'fatal', 'fear', 'fires', 'flames', 'flood', 'floods', 'forest', 'god',
       'going', 'good', 'got', 'gt', 'help', 'hiroshima', 'home', 'homes',
       'hot', 'http', 'https', 'injured', 'japan', 'just', 'killed', 'know',
       'latest', 'legionnaires', 'let', 'life', 'like', 'live', 'll', 'lol',
       'look', 'love', 'make', 'man', 'mass', 'need', 'new', 'news', 'nuclear',
       'old', 'people', 'police', 'read', 'really', 'reddit', 'right', 'rt',
       'say', 'school', 'storm', 'suicide', 'think', 'time', 'today', 'train',
       've', 'video', 'want', 'war', 'watch', 'water', 'way', 'work', 'world',
       'year', 'years', 'youtube', 'û_', 'ûªs'],
      dtype='object')

In [62]:
df_features_sklearn

Unnamed: 0,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
6847,0,1,0,0,0,0,0,0,0,0
6848,0,1,0,0,0,0,0,0,0,0
6849,0,1,0,0,0,0,0,0,0,0
6850,0,1,0,0,0,0,0,0,0,0


In [82]:
# Attach the CountVectorizer features to the training frame.
# Columns with the same names that are already present (for example from an
# earlier join, or from re-running this cell) are dropped first; otherwise
# `join` raises on overlapping column names.
df_train = (
    df_train
    .drop(columns=df_features_sklearn.columns, errors='ignore')
    .join(df_features_sklearn)
)
df_train.head()

Unnamed: 0,id,text,target,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,1,Our Deeds are the Reason of this #earthquake M...,1,0,0,0,0,0,0,0,0,0,0
1,4,Forest fire near La Ronge Sask. Canada,1,0,0,0,0,0,0,0,0,1,1
2,5,All residents asked to 'shelter in place' are ...,1,0,0,0,0,0,0,0,0,0,0
3,6,"13,000 people receive #wildfires evacuation or...",1,0,0,0,0,0,0,0,0,0,0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,0,0,0,0,0,0,0,0,0,0


In [74]:
# Featurize the validation tweets with the already-fitted vectorizer.
X_val = vect2.transform(df_val['text'])
# `vect2` uses the fixed vocabulary `disaster_words`, so column i counts
# disaster_words[i]. This avoids get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df_val_features = pd.DataFrame(X_val.toarray(), columns=disaster_words)
# Reset the index so rows line up positionally with the feature frame, and drop
# any feature columns left from a previous run so the cell can be re-executed.
df_val = (
    df_val
    .reset_index(drop=True)
    .drop(columns=df_val_features.columns, errors='ignore')
    .join(df_val_features)
)
df_val.head()

Unnamed: 0,id,text,target,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,9820,@PTSD_Chat Yes. I feel the root of that is Sha...,1,0,1,0,0,0,0,0,0,0,0
1,9822,Hiroshima: They told me to paint my story: Eig...,1,0,1,0,0,0,0,0,0,0,0
2,9823,Photo: lavenderpoetrycafe: The Forgotten Histo...,1,0,1,0,0,0,0,0,0,0,0
3,9826,Trauma injuries involving kids and sport usual...,1,0,1,0,0,0,0,0,0,0,0
4,9828,Butt Trauma Extraordinaire,1,0,1,0,0,0,0,0,0,0,0


In [75]:
# Featurize the test tweets with the already-fitted vectorizer.
X_test = vect2.transform(df_test['text'])
# `vect2` uses the fixed vocabulary `disaster_words`, so column i counts
# disaster_words[i]. This avoids get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df_test_features = pd.DataFrame(X_test.toarray(), columns=disaster_words)
# Reset the index so rows line up positionally with the feature frame, and drop
# any feature columns left from a previous run so the cell can be re-executed.
df_test = (
    df_test
    .reset_index(drop=True)
    .drop(columns=df_test_features.columns, errors='ignore')
    .join(df_test_features)
)
df_test.head()

Unnamed: 0,id,text,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,0,Just happened a terrible car crash,0,0,0,1,0,0,0,0,0,0
1,2,"Heard about #earthquake is different cities, s...",0,0,0,0,0,0,0,0,0,0
2,3,"there is a forest fire at spot pond, geese are...",0,0,0,0,0,0,0,0,1,1
3,9,Apocalypse lighting. #Spokane #wildfires,0,0,0,0,0,0,0,0,0,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,0,0,0,0,0,0,0,0,0,0


# Hand-crafted model over a count-vectorized representation

In [76]:
#disaster_words = ['pain', 'trauma', 'tornado', 'crash', 'hurricane', 'flood', 'dead', 'death', 'fire', 'forest']
features = disaster_words

In [84]:
def model(sample, feature_names=None):
    """Rule-based classifier: predict 1 as soon as any feature is non-zero.

    Args:
        sample: Object indexable by feature name (e.g. a DataFrame row passed
            by `DataFrame.apply(model, axis=1)`).
        feature_names: Iterable of feature names to inspect. When omitted, the
            notebook-level `features` list is used.

    Returns:
        1 if at least one inspected feature of `sample` is non-zero, else 0.
    """
    if feature_names is None:
        # Fall back to the notebook-level feature list.
        feature_names = features
    return int(any(sample[name] != 0 for name in feature_names))
    

In [83]:
df_train.head()

Unnamed: 0,id,text,target,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest
0,1,Our Deeds are the Reason of this #earthquake M...,1,0,0,0,0,0,0,0,0,0,0
1,4,Forest fire near La Ronge Sask. Canada,1,0,0,0,0,0,0,0,0,1,1
2,5,All residents asked to 'shelter in place' are ...,1,0,0,0,0,0,0,0,0,0,0
3,6,"13,000 people receive #wildfires evacuation or...",1,0,0,0,0,0,0,0,0,0,0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,0,0,0,0,0,0,0,0,0,0


In [85]:
df_train['prediction'] = df_train.apply(model, axis=1)

In [88]:
accuracy_score(df_train['target'], df_train['prediction'])

0.6004086398131933

In [89]:
df_val['prediction'] = df_val.apply(model, axis=1)

In [90]:
accuracy_score(df_val['target'], df_val['prediction'])

0.5755584756898817

In [95]:
df_test['target'] = df_test.apply(model, axis=1)

In [96]:
df_test.head()

Unnamed: 0,id,text,pain,trauma,tornado,crash,hurricane,flood,dead,death,fire,forest,target
0,0,Just happened a terrible car crash,0,0,0,1,0,0,0,0,0,0,1
1,2,"Heard about #earthquake is different cities, s...",0,0,0,0,0,0,0,0,0,0,0
2,3,"there is a forest fire at spot pond, geese are...",0,0,0,0,0,0,0,0,1,1,1
3,9,Apocalypse lighting. #Spokane #wildfires,0,0,0,0,0,0,0,0,0,0,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,0,0,0,0,0,0,0,0,0,0,0


In [97]:
df_sub.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [98]:
df_test[['id', 'target']].to_csv("handcrafted-model.csv", index=False)