In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import re
import string
import nltk
from nltk.stem.porter import *

In [3]:
entrenamiento_df = pd.read_csv('Archivos/entrenamiento_df.csv')
entrenamiento_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
test_df = pd.read_csv('Archivos/test_df.csv')
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [6]:
entrenamiento_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [8]:
#Reemplazo valores nulos de keyword por el mas frecuente

imp = SimpleImputer(strategy="most_frequent")
entrenamiento_df['keyword'] = imp.fit_transform(entrenamiento_df[['keyword']])
entrenamiento_df

Unnamed: 0,id,keyword,location,text,target
0,1,fatalities,,Our Deeds are the Reason of this #earthquake M...,1
1,4,fatalities,,Forest fire near La Ronge Sask. Canada,1
2,5,fatalities,,All residents asked to 'shelter in place' are ...,1
3,6,fatalities,,"13,000 people receive #wildfires evacuation or...",1
4,7,fatalities,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,fatalities,,Two giant cranes holding a bridge collapse int...,1
7609,10870,fatalities,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,fatalities,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,fatalities,,Police investigating after an e-bike collided ...,1


In [9]:
test_df['keyword'] = imp.transform(test_df[['keyword']])
test_df

Unnamed: 0,id,keyword,location,text
0,0,fatalities,,Just happened a terrible car crash
1,2,fatalities,,"Heard about #earthquake is different cities, s..."
2,3,fatalities,,"there is a forest fire at spot pond, geese are..."
3,9,fatalities,,Apocalypse lighting. #Spokane #wildfires
4,11,fatalities,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,fatalities,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,fatalities,,Storm in RI worse than last hurricane. My city...
3260,10868,fatalities,,Green Line derailment in Chicago http://t.co/U...
3261,10874,fatalities,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [10]:
#La columna location tiene demasiados nulos, mejor la elimino

entrenamiento_df = entrenamiento_df.drop('location', axis=1)
entrenamiento_df

Unnamed: 0,id,keyword,text,target
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1
2,5,fatalities,All residents asked to 'shelter in place' are ...,1
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,fatalities,Police investigating after an e-bike collided ...,1


In [11]:
test_df = test_df.drop('location', axis=1)
test_df

Unnamed: 0,id,keyword,text
0,0,fatalities,Just happened a terrible car crash
1,2,fatalities,"Heard about #earthquake is different cities, s..."
2,3,fatalities,"there is a forest fire at spot pond, geese are..."
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...


In [12]:
entrenamiento_df['hashtags_count'] = entrenamiento_df['text'].apply(lambda x: x.count('#'))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2
...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0


In [13]:
test_df['hashtags_count'] = test_df['text'].apply(lambda x: x.count('#'))
test_df

Unnamed: 0,id,keyword,text,hashtags_count
0,0,fatalities,Just happened a terrible car crash,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0
...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0


In [14]:
entrenamiento_df['longitud_text'] = entrenamiento_df['text'].transform(lambda x: len(x))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88
...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137


In [15]:
test_df['longitud_text'] = test_df['text'].transform(lambda x: len(x))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text
0,0,fatalities,Just happened a terrible car crash,0,34
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45
...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65


In [16]:
entrenamiento_df['palabras_count'] = entrenamiento_df['text'].transform(lambda x: len(x.split()))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16
...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19


In [17]:
test_df['palabras_count'] = test_df['text'].transform(lambda x: len(x.split()))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count
0,0,fatalities,Just happened a terrible car crash,0,34,6
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8
...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7


In [18]:
entrenamiento_df['mentions_count'] = entrenamiento_df['text'].apply(lambda x: x.count('@'))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0
...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0


In [19]:
test_df['mentions_count'] = test_df['text'].apply(lambda x: x.count('@'))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0
...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0


In [20]:
entrenamiento_df['palabras_unicas_count'] = entrenamiento_df['text'].apply(lambda x: len(set(str(x).split())))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15
...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19


In [21]:
test_df['palabras_unicas_count'] = test_df['text'].apply(lambda x: len(set(str(x).split())))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8
...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7


In [22]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
entrenamiento_df['stopwords_count'] = entrenamiento_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
entrenamiento_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/elnic10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7
...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5


In [23]:
test_df['stopwords_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2
...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0


In [24]:
entrenamiento_df['url_count'] = entrenamiento_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0
...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0


In [25]:
test_df['url_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0
...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1


In [26]:
entrenamiento_df['longitud_palabra_mean'] = entrenamiento_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125000
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.500000
...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1,6.636364
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0,5.300000
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1,7.250000
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0,6.263158


In [27]:
test_df['longitud_palabra_mean'] = test_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.250000
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.750000
...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0,6.000000
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0,5.086957
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1,8.333333
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1,8.428571


In [28]:
entrenamiento_df['punctuation_count'] = entrenamiento_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615,1
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429,1
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909,3
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125000,2
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.500000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1,6.636364,5
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0,5.300000,5
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1,7.250000,11
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0,6.263158,5


In [29]:
test_df['punctuation_count'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222,3
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263,2
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.250000,3
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.750000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0,6.000000,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0,5.086957,5
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1,8.333333,5
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1,8.428571,7


In [30]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punc(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

In [31]:
slang_abbrev_dict = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'B4N': 'Bye For Now',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': 'For What It\'s Worth',
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you',
    'ILU': 'I Love You',
    'IMHO': 'In My Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My Ass Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'OMG': 'Oh My God',
    'PITA': 'Pain In The Ass',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My Ass Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The Fuck',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait',
    '7K': 'Sick:-D Laugher'
}

def unslang(text):
    if text.upper() in slang_abbrev_dict.keys():
        return slang_abbrev_dict[text.upper()]
    else:
        return text

In [32]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

stemmer = PorterStemmer()

def remove_stopwords(text):
    text = [word for word in text if word not in stopwords]
    return text

def stemming(text):
    text = [stemmer.stem(word) for word in text]
    return text

In [33]:
for datas in [entrenamiento_df, test_df]:
    datas['cleaned_text'] = datas['text'].apply(lambda x : remove_url(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_html(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_emoji(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : unslang(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_punc(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : tokenization(x.lower()))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_stopwords(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : stemming(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : ' '.join(x))

entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615,1,deed reason earthquak may allah forgiv us
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429,1,forest fire near la rong sask canada
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909,3,resid ask shelter place notifi offic evacu she...
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125000,2,13000 peopl receiv wildfir evacu order califor...
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.500000,2,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1,6.636364,5,two giant crane hold bridg collaps nearbi home
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0,5.300000,5,ariaahrari thetawniest control wild fire calif...
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1,7.250000,11,m194 0104 utc5km volcano hawaii
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0,6.263158,5,polic investig ebik collid car littl portug eb...


In [34]:
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333,0,happen terribl car crash
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222,3,heard earthquak differ citi stay safe everyon
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263,2,forest fire spot pond gees flee across street ...
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.250000,3,apocalyps light spokan wildfir
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.750000,0,typhoon soudelor kill 28 china taiwan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0,6.000000,0,earthquak safeti lo angel ûò safeti fasten xrwn
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0,5.086957,5,storm ri wors last hurrican cityamp3oth hardes...
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1,8.333333,5,green line derail chicago
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1,8.428571,7,meg issu hazard weather outlook hwo


In [35]:
X = entrenamiento_df.drop(['id', 'text', 'target'], axis = 1)
y = entrenamiento_df['target']
display(X)
display(y)

Unnamed: 0,keyword,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,fatalities,1,69,13,0,13,6,0,4.384615,1,deed reason earthquak may allah forgiv us
1,fatalities,0,38,7,0,7,0,0,4.571429,1,forest fire near la rong sask canada
2,fatalities,0,133,22,0,20,11,0,5.090909,3,resid ask shelter place notifi offic evacu she...
3,fatalities,1,65,8,0,8,1,0,7.125000,2,13000 peopl receiv wildfir evacu order califor...
4,fatalities,2,88,16,0,15,7,0,4.500000,2,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...,...,...,...,...,...
7608,fatalities,0,83,11,0,11,2,1,6.636364,5,two giant crane hold bridg collaps nearbi home
7609,fatalities,0,125,20,2,17,9,0,5.300000,5,ariaahrari thetawniest control wild fire calif...
7610,fatalities,0,65,8,0,8,2,1,7.250000,11,m194 0104 utc5km volcano hawaii
7611,fatalities,0,137,19,0,19,5,0,6.263158,5,polic investig ebik collid car littl portug eb...


0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [36]:
test_df = test_df.drop(['id', 'text'], axis = 1)
test_df

Unnamed: 0,keyword,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,fatalities,0,34,6,0,6,2,0,4.833333,0,happen terribl car crash
1,fatalities,1,64,9,0,9,2,0,6.222222,3,heard earthquak differ citi stay safe everyon
2,fatalities,0,96,19,0,19,9,0,4.105263,2,forest fire spot pond gees flee across street ...
3,fatalities,2,40,4,0,4,0,0,9.250000,3,apocalyps light spokan wildfir
4,fatalities,0,45,8,0,8,2,0,4.750000,0,typhoon soudelor kill 28 china taiwan
...,...,...,...,...,...,...,...,...,...,...,...
3258,fatalities,0,55,8,0,7,0,0,6.000000,0,earthquak safeti lo angel ûò safeti fasten xrwn
3259,fatalities,0,139,23,0,22,6,0,5.086957,5,storm ri wors last hurrican cityamp3oth hardes...
3260,fatalities,0,55,6,0,6,1,1,8.333333,5,green line derail chicago
3261,fatalities,0,65,7,0,7,0,1,8.428571,7,meg issu hazard weather outlook hwo


In [37]:
#One hot encoding para keyword

encoder = OneHotEncoder(sparse = False)
transformed = encoder.fit_transform(X[['keyword']])
onehot_df = pd.DataFrame(transformed, columns = encoder.get_feature_names())
X = pd.concat([X, onehot_df], axis = 1).drop(['keyword'], axis = 1)
X

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
0,1,69,13,0,13,6,0,4.384615,1,deed reason earthquak may allah forgiv us,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,38,7,0,7,0,0,4.571429,1,forest fire near la rong sask canada,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,133,22,0,20,11,0,5.090909,3,resid ask shelter place notifi offic evacu she...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,65,8,0,8,1,0,7.125000,2,13000 peopl receiv wildfir evacu order califor...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,88,16,0,15,7,0,4.500000,2,got sent photo rubi alaska smoke wildfir pour ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,83,11,0,11,2,1,6.636364,5,two giant crane hold bridg collaps nearbi home,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7609,0,125,20,2,17,9,0,5.300000,5,ariaahrari thetawniest control wild fire calif...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7610,0,65,8,0,8,2,1,7.250000,11,m194 0104 utc5km volcano hawaii,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7611,0,137,19,0,19,5,0,6.263158,5,polic investig ebik collid car littl portug eb...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
transformed = encoder.transform(test_df[['keyword']])
onehot_df = pd.DataFrame(transformed, columns = encoder.get_feature_names())
test_df = pd.concat([test_df, onehot_df], axis = 1).drop(['keyword'], axis = 1)
test_df

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
0,0,34,6,0,6,2,0,4.833333,0,happen terribl car crash,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,64,9,0,9,2,0,6.222222,3,heard earthquak differ citi stay safe everyon,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,96,19,0,19,9,0,4.105263,2,forest fire spot pond gees flee across street ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,40,4,0,4,0,0,9.250000,3,apocalyps light spokan wildfir,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,45,8,0,8,2,0,4.750000,0,typhoon soudelor kill 28 china taiwan,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,55,8,0,7,0,0,6.000000,0,earthquak safeti lo angel ûò safeti fasten xrwn,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,0,139,23,0,22,6,0,5.086957,5,storm ri wors last hurrican cityamp3oth hardes...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,0,55,6,0,6,1,1,8.333333,5,green line derail chicago,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,0,65,7,0,7,0,1,8.428571,7,meg issu hazard weather outlook hwo,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
#TF IDF

tfidfvectorizer = TfidfVectorizer()
transformed = tfidfvectorizer.fit_transform(X['cleaned_text']).toarray()
tfidf_df = pd.DataFrame(transformed, columns = tfidfvectorizer.get_feature_names())
X = pd.concat([X, tfidf_df], axis = 1).drop(['cleaned_text'], axis = 1)
X

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,x0_ablaze,...,ûò800000,ûòthe,ûòåêcnbc,ûó,ûóbbc,ûóher,ûókodi,ûóneglig,ûótech,ûówe
0,1,69,13,0,13,6,0,4.384615,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,38,7,0,7,0,0,4.571429,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,133,22,0,20,11,0,5.090909,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,65,8,0,8,1,0,7.125000,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,88,16,0,15,7,0,4.500000,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,83,11,0,11,2,1,6.636364,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7609,0,125,20,2,17,9,0,5.300000,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7610,0,65,8,0,8,2,1,7.250000,11,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7611,0,137,19,0,19,5,0,6.263158,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
transformed = tfidfvectorizer.transform(test_df['cleaned_text']).toarray()
tfidf_df = pd.DataFrame(transformed, columns = tfidfvectorizer.get_feature_names())
test_df = pd.concat([test_df, tfidf_df], axis = 1).drop(['cleaned_text'], axis = 1)
test_df

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,x0_ablaze,...,ûò800000,ûòthe,ûòåêcnbc,ûó,ûóbbc,ûóher,ûókodi,ûóneglig,ûótech,ûówe
0,0,34,6,0,6,2,0,4.833333,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,64,9,0,9,2,0,6.222222,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,96,19,0,19,9,0,4.105263,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,40,4,0,4,0,0,9.250000,3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,45,8,0,8,2,0,4.750000,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,55,8,0,7,0,0,6.000000,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,0,139,23,0,22,6,0,5.086957,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,0,55,6,0,6,1,1,8.333333,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,0,65,7,0,7,0,1,8.428571,7,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
nb = MultinomialNB()
nb.fit(X_train, y_train)
print('Train Score', nb.score(X_train, y_train))
print('Test Score', nb.score(X_test, y_test))

Train Score 0.8160798738833421
Test Score 0.759453781512605


In [42]:
predicts = nb.predict(test_df)

In [43]:
submit = pd.read_csv('setDeDatos/nlp-getting-started/sample_submission.csv')
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [44]:
submit['target'].value_counts()

0    3263
Name: target, dtype: int64

In [45]:
submit['target'] = pd.DataFrame({'target':predicts})
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [46]:
submit['target'].value_counts()

0    2068
1    1195
Name: target, dtype: int64

In [47]:
submit.to_csv('Submits/submit_cleanedNB_TFIDF.csv', index=False)