In [1]:
import numpy as np 
import pandas as pd 
import re 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [2]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/faa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
train = pd.read_csv('./data/disaster/train.csv')

In [5]:
train.shape

(7613, 5)

In [6]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [8]:
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [9]:
train['text'].duplicated().sum()

110

In [10]:
train['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [11]:
X = train['text']
Y = train['target']

In [12]:
print(X) 
print(Y)

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object
0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64


In [13]:
test = pd.read_csv('./data/disaster/test.csv', encoding='ISO-8859-1')

In [14]:
test.shape

(3263, 4)

In [15]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [16]:
test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [17]:
port_stem = PorterStemmer()

In [20]:
def stemming(content): 
    stemmed_content = re.sub('[^a-zA-Z]',' ', content) 
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content) 
    return stemmed_content

In [21]:
train['text'] = train['text'].apply(stemming) 
test['text'] = test['text'].apply(stemming) 
print(train['text'])

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3             peopl receiv wildfir evacu order california
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608    two giant crane hold bridg collaps nearbi home...
7609    aria ahrari thetawniest control wild fire cali...
7610             utc km volcano hawaii http co zdtoyd ebj
7611    polic investig e bike collid car littl portug ...
7612    latest home raze northern california wildfir a...
Name: text, Length: 7613, dtype: object


In [23]:
print(test['text'])

0                                happen terribl car crash
1           heard earthquak differ citi stay safe everyon
2       forest fire spot pond gees flee across street ...
3                          apocalyps light spokan wildfir
4                      typhoon soudelor kill china taiwan
                              ...                        
3258         earthquak safeti lo angel safeti fasten xrwn
3259    storm ri wors last hurrican citi amp other har...
3260         green line derail chicago http co utbxlcbiuy
3261    meg issu hazard weather outlook hwo http co x ...
3262      cityofcalgari activ municip emerg plan yycstorm
Name: text, Length: 3263, dtype: object


In [24]:
vectorizer = TfidfVectorizer(max_features=600) 
vectorizer.fit(X) 

TfidfVectorizer(max_features=600)

In [25]:
X = vectorizer.transform(X) 

In [26]:
text = vectorizer.transform(test['text'])

In [27]:
print(X) 

  (0, 536)	0.3814006158603521
  (0, 502)	0.28049393121953003
  (0, 492)	0.15845138263717123
  (0, 380)	0.39435973032059596
  (0, 367)	0.18682534212295943
  (0, 329)	0.40389734755751416
  (0, 170)	0.45255881506816664
  (0, 35)	0.29507628836912997
  (0, 21)	0.3245212078358395
  (1, 350)	0.6298367069780924
  (1, 212)	0.6137414156440512
  (1, 202)	0.4760537756033752
  (2, 511)	0.1510040745720882
  (2, 379)	0.37798233372893636
  (2, 378)	0.28063039085910024
  (2, 357)	0.2668913773096814
  (2, 272)	0.29612258928806307
  (2, 178)	0.36448479021297747
  (2, 91)	0.22283325877396032
  (2, 62)	0.3309812971719058
  (2, 35)	0.4836116566731289
  (2, 21)	0.2659350227943865
  (3, 389)	0.48947301764725804
  (3, 272)	0.25712750608916923
  (3, 178)	0.6329747780486183
  :	:
  (7610, 367)	0.3402022815767392
  (7610, 266)	0.23494541420326454
  (7610, 112)	0.22160666508267315
  (7611, 577)	0.22960537419170535
  (7611, 397)	0.3171594020803176
  (7611, 309)	0.37819374750209866
  (7611, 304)	0.3394706719120937
 

In [29]:
print(text)

  (0, 125)	0.6936921484373234
  (0, 97)	0.720271617652959
  (1, 470)	0.717036082334008
  (1, 244)	0.6970360511703091
  (2, 433)	0.6511182069949734
  (2, 212)	0.5997150454120467
  (2, 202)	0.46517410162871037
  (3, 305)	1.0
  (4, 530)	0.5473092927470222
  (4, 291)	0.6009117588747461
  (4, 109)	0.5825440722544633
  (6, 471)	0.5223477255460575
  (6, 454)	0.6460854727080357
  (6, 304)	0.5565306959840531
  (9, 216)	1.0
  (10, 307)	1.0
  (15, 354)	0.3161723374389721
  (15, 326)	0.8847509785695656
  (15, 266)	0.11768569059908579
  (15, 202)	0.3017928087401779
  (15, 112)	0.11100422415162135
  (17, 528)	0.6987184147170306
  (17, 447)	0.6648172171163438
  (17, 266)	0.19220611220521974
  (17, 112)	0.18129383660774168
  :	:
  (3256, 143)	0.43191100024059154
  (3256, 20)	0.4420750093530315
  (3257, 136)	0.5942441817506415
  (3257, 97)	0.5763250742925107
  (3257, 77)	0.561002015323691
  (3259, 578)	0.31398535336494443
  (3259, 474)	0.2688029003236345
  (3259, 471)	0.2654125980543364
  (3259, 400)	0

In [30]:
from sklearn.tree import DecisionTreeClassifier
model1 = DecisionTreeClassifier(criterion='entropy', random_state=-0) 
model1.fit(X, Y)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [31]:
DT_pred = model1.predict(text) 

In [32]:
sub = pd.read_csv('./data/disaster/sample_submission.csv')

In [34]:
sub['target'] = DT_pred
sub.to_csv('./data/disaster/DecisionTree.csv', index=False)

In [35]:
sub

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,0
3261,10874,1
