In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import string
import nltk
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
from nltk.stem.porter import *
from sklearn.decomposition import TruncatedSVD

In [2]:
entrenamiento_df = pd.read_csv('Archivos/entrenamiento_df.csv')
entrenamiento_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
test_df = pd.read_csv('Archivos/test_df.csv')
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [4]:
entrenamiento_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [6]:
#Reemplazo valores nulos de keyword por el mas frecuente

imp = SimpleImputer(strategy="most_frequent")
entrenamiento_df['keyword'] = imp.fit_transform(entrenamiento_df[['keyword']])
entrenamiento_df

Unnamed: 0,id,keyword,location,text,target
0,1,fatalities,,Our Deeds are the Reason of this #earthquake M...,1
1,4,fatalities,,Forest fire near La Ronge Sask. Canada,1
2,5,fatalities,,All residents asked to 'shelter in place' are ...,1
3,6,fatalities,,"13,000 people receive #wildfires evacuation or...",1
4,7,fatalities,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,fatalities,,Two giant cranes holding a bridge collapse int...,1
7609,10870,fatalities,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,fatalities,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,fatalities,,Police investigating after an e-bike collided ...,1


In [7]:
test_df['keyword'] = imp.transform(test_df[['keyword']])
test_df

Unnamed: 0,id,keyword,location,text
0,0,fatalities,,Just happened a terrible car crash
1,2,fatalities,,"Heard about #earthquake is different cities, s..."
2,3,fatalities,,"there is a forest fire at spot pond, geese are..."
3,9,fatalities,,Apocalypse lighting. #Spokane #wildfires
4,11,fatalities,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,fatalities,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,fatalities,,Storm in RI worse than last hurricane. My city...
3260,10868,fatalities,,Green Line derailment in Chicago http://t.co/U...
3261,10874,fatalities,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [8]:
#La columna location tiene demasiados nulos, mejor la elimino

entrenamiento_df = entrenamiento_df.drop('location', axis=1)
entrenamiento_df

Unnamed: 0,id,keyword,text,target
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1
2,5,fatalities,All residents asked to 'shelter in place' are ...,1
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,fatalities,Police investigating after an e-bike collided ...,1


In [9]:
test_df = test_df.drop('location', axis=1)
test_df

Unnamed: 0,id,keyword,text
0,0,fatalities,Just happened a terrible car crash
1,2,fatalities,"Heard about #earthquake is different cities, s..."
2,3,fatalities,"there is a forest fire at spot pond, geese are..."
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...


In [10]:
entrenamiento_df['hashtags_count'] = entrenamiento_df['text'].apply(lambda x: x.count('#'))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2
...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0


In [11]:
test_df['hashtags_count'] = test_df['text'].apply(lambda x: x.count('#'))
test_df

Unnamed: 0,id,keyword,text,hashtags_count
0,0,fatalities,Just happened a terrible car crash,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0
...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0


In [12]:
entrenamiento_df['longitud_text'] = entrenamiento_df['text'].transform(lambda x: len(x))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88
...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137


In [13]:
test_df['longitud_text'] = test_df['text'].transform(lambda x: len(x))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text
0,0,fatalities,Just happened a terrible car crash,0,34
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45
...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65


In [14]:
entrenamiento_df['palabras_count'] = entrenamiento_df['text'].transform(lambda x: len(x.split()))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16
...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19


In [15]:
test_df['palabras_count'] = test_df['text'].transform(lambda x: len(x.split()))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count
0,0,fatalities,Just happened a terrible car crash,0,34,6
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8
...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7


In [16]:
entrenamiento_df['mentions_count'] = entrenamiento_df['text'].apply(lambda x: x.count('@'))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0
...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0


In [17]:
test_df['mentions_count'] = test_df['text'].apply(lambda x: x.count('@'))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0
...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0


In [18]:
entrenamiento_df['palabras_unicas_count'] = entrenamiento_df['text'].apply(lambda x: len(set(str(x).split())))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15
...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19


In [19]:
test_df['palabras_unicas_count'] = test_df['text'].apply(lambda x: len(set(str(x).split())))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8
...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7


In [20]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
entrenamiento_df['stopwords_count'] = entrenamiento_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
entrenamiento_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/elnic10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7
...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5


In [21]:
test_df['stopwords_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2
...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0


In [22]:
entrenamiento_df['url_count'] = entrenamiento_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0
...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0


In [23]:
test_df['url_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0
...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1


In [24]:
entrenamiento_df['longitud_palabra_mean'] = entrenamiento_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125000
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.500000
...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1,6.636364
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0,5.300000
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1,7.250000
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0,6.263158


In [25]:
test_df['longitud_palabra_mean'] = test_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.250000
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.750000
...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0,6.000000
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0,5.086957
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1,8.333333
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1,8.428571


In [26]:
entrenamiento_df['punctuation_count'] = entrenamiento_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615,1
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429,1
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909,3
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125000,2
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.500000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1,6.636364,5
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0,5.300000,5
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1,7.250000,11
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0,6.263158,5


In [27]:
test_df['punctuation_count'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333,0
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222,3
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263,2
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.250000,3
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.750000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0,6.000000,0
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0,5.086957,5
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1,8.333333,5
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1,8.428571,7


In [28]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punc(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

In [29]:
slang_abbrev_dict = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'B4N': 'Bye For Now',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': 'For What It\'s Worth',
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you',
    'ILU': 'I Love You',
    'IMHO': 'In My Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My Ass Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'OMG': 'Oh My God',
    'PITA': 'Pain In The Ass',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My Ass Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The Fuck',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait',
    '7K': 'Sick:-D Laugher'
}

def unslang(text):
    if text.upper() in slang_abbrev_dict.keys():
        return slang_abbrev_dict[text.upper()]
    else:
        return text

In [30]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

stemmer = PorterStemmer()

def remove_stopwords(text):
    text = [word for word in text if word not in stopwords]
    return text

def stemming(text):
    text = [stemmer.stem(word) for word in text]
    return text

In [31]:
for datas in [entrenamiento_df, test_df]:
    datas['cleaned_text'] = datas['text'].apply(lambda x : remove_url(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_html(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_emoji(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : unslang(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_punc(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : tokenization(x.lower()))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_stopwords(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : stemming(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : ' '.join(x))

entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615,1,deed reason earthquak may allah forgiv us
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429,1,forest fire near la rong sask canada
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909,3,resid ask shelter place notifi offic evacu she...
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125000,2,13000 peopl receiv wildfir evacu order califor...
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.500000,2,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,fatalities,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1,6.636364,5,two giant crane hold bridg collaps nearbi home
7609,10870,fatalities,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0,5.300000,5,ariaahrari thetawniest control wild fire calif...
7610,10871,fatalities,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1,7.250000,11,m194 0104 utc5km volcano hawaii
7611,10872,fatalities,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0,6.263158,5,polic investig ebik collid car littl portug eb...


In [32]:
test_df

Unnamed: 0,id,keyword,text,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,0,fatalities,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333,0,happen terribl car crash
1,2,fatalities,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222,3,heard earthquak differ citi stay safe everyon
2,3,fatalities,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263,2,forest fire spot pond gees flee across street ...
3,9,fatalities,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.250000,3,apocalyps light spokan wildfir
4,11,fatalities,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.750000,0,typhoon soudelor kill 28 china taiwan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,fatalities,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0,6.000000,0,earthquak safeti lo angel ûò safeti fasten xrwn
3259,10865,fatalities,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0,5.086957,5,storm ri wors last hurrican cityamp3oth hardes...
3260,10868,fatalities,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1,8.333333,5,green line derail chicago
3261,10874,fatalities,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1,8.428571,7,meg issu hazard weather outlook hwo


In [33]:
X = entrenamiento_df.drop(['id', 'text', 'target'], axis = 1)
y = entrenamiento_df['target']
display(X)
display(y)

Unnamed: 0,keyword,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,fatalities,1,69,13,0,13,6,0,4.384615,1,deed reason earthquak may allah forgiv us
1,fatalities,0,38,7,0,7,0,0,4.571429,1,forest fire near la rong sask canada
2,fatalities,0,133,22,0,20,11,0,5.090909,3,resid ask shelter place notifi offic evacu she...
3,fatalities,1,65,8,0,8,1,0,7.125000,2,13000 peopl receiv wildfir evacu order califor...
4,fatalities,2,88,16,0,15,7,0,4.500000,2,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...,...,...,...,...,...
7608,fatalities,0,83,11,0,11,2,1,6.636364,5,two giant crane hold bridg collaps nearbi home
7609,fatalities,0,125,20,2,17,9,0,5.300000,5,ariaahrari thetawniest control wild fire calif...
7610,fatalities,0,65,8,0,8,2,1,7.250000,11,m194 0104 utc5km volcano hawaii
7611,fatalities,0,137,19,0,19,5,0,6.263158,5,polic investig ebik collid car littl portug eb...


0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [34]:
test_df = test_df.drop(['id', 'text'], axis = 1)
test_df

Unnamed: 0,keyword,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text
0,fatalities,0,34,6,0,6,2,0,4.833333,0,happen terribl car crash
1,fatalities,1,64,9,0,9,2,0,6.222222,3,heard earthquak differ citi stay safe everyon
2,fatalities,0,96,19,0,19,9,0,4.105263,2,forest fire spot pond gees flee across street ...
3,fatalities,2,40,4,0,4,0,0,9.250000,3,apocalyps light spokan wildfir
4,fatalities,0,45,8,0,8,2,0,4.750000,0,typhoon soudelor kill 28 china taiwan
...,...,...,...,...,...,...,...,...,...,...,...
3258,fatalities,0,55,8,0,7,0,0,6.000000,0,earthquak safeti lo angel ûò safeti fasten xrwn
3259,fatalities,0,139,23,0,22,6,0,5.086957,5,storm ri wors last hurrican cityamp3oth hardes...
3260,fatalities,0,55,6,0,6,1,1,8.333333,5,green line derail chicago
3261,fatalities,0,65,7,0,7,0,1,8.428571,7,meg issu hazard weather outlook hwo


In [35]:
#One hot encoding para keyword

encoder = OneHotEncoder(sparse = False)
transformed = encoder.fit_transform(X[['keyword']])
onehot_df = pd.DataFrame(transformed, columns = encoder.get_feature_names())
X = pd.concat([X, onehot_df], axis = 1).drop(['keyword'], axis = 1)
X

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
0,1,69,13,0,13,6,0,4.384615,1,deed reason earthquak may allah forgiv us,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,38,7,0,7,0,0,4.571429,1,forest fire near la rong sask canada,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,133,22,0,20,11,0,5.090909,3,resid ask shelter place notifi offic evacu she...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,65,8,0,8,1,0,7.125000,2,13000 peopl receiv wildfir evacu order califor...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,88,16,0,15,7,0,4.500000,2,got sent photo rubi alaska smoke wildfir pour ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,83,11,0,11,2,1,6.636364,5,two giant crane hold bridg collaps nearbi home,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7609,0,125,20,2,17,9,0,5.300000,5,ariaahrari thetawniest control wild fire calif...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7610,0,65,8,0,8,2,1,7.250000,11,m194 0104 utc5km volcano hawaii,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7611,0,137,19,0,19,5,0,6.263158,5,polic investig ebik collid car littl portug eb...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
transformed = encoder.transform(test_df[['keyword']])
onehot_df = pd.DataFrame(transformed, columns = encoder.get_feature_names())
test_df = pd.concat([test_df, onehot_df], axis = 1).drop(['keyword'], axis = 1)
test_df

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,cleaned_text,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
0,0,34,6,0,6,2,0,4.833333,0,happen terribl car crash,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,64,9,0,9,2,0,6.222222,3,heard earthquak differ citi stay safe everyon,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,96,19,0,19,9,0,4.105263,2,forest fire spot pond gees flee across street ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,40,4,0,4,0,0,9.250000,3,apocalyps light spokan wildfir,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,45,8,0,8,2,0,4.750000,0,typhoon soudelor kill 28 china taiwan,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,55,8,0,7,0,0,6.000000,0,earthquak safeti lo angel ûò safeti fasten xrwn,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,0,139,23,0,22,6,0,5.086957,5,storm ri wors last hurrican cityamp3oth hardes...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,0,55,6,0,6,1,1,8.333333,5,green line derail chicago,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,0,65,7,0,7,0,1,8.428571,7,meg issu hazard weather outlook hwo,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
#Bag of Words para el texto, reduciendo la dimensionalidad con SVD

vectorizer = CountVectorizer(max_df = 0.5, min_df = 0.001)
count_matrix = vectorizer.fit_transform(X['cleaned_text'])
svd = TruncatedSVD(n_components = 100, random_state = 42)
transformed = svd.fit_transform(count_matrix)
bow_df = pd.DataFrame(transformed)
X = pd.concat([X, bow_df], axis = 1).drop(['cleaned_text'], axis = 1)
X

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,x0_ablaze,...,90,91,92,93,94,95,96,97,98,99
0,1,69,13,0,13,6,0,4.384615,1,0.0,...,0.042085,0.009170,-0.025569,-0.054090,0.090007,-0.044475,0.006980,-0.066704,0.121255,0.167966
1,0,38,7,0,7,0,0,4.571429,1,0.0,...,-0.127851,-0.102480,-0.008068,-0.083326,-0.031990,-0.077703,0.012828,-0.011679,0.123143,0.050880
2,0,133,22,0,20,11,0,5.090909,3,0.0,...,-0.009476,0.055680,0.001517,0.024796,-0.004601,0.119503,0.003549,-0.003830,-0.080904,0.043816
3,1,65,8,0,8,1,0,7.125000,2,0.0,...,0.065633,0.133666,-0.020094,0.027161,0.169223,0.069422,0.065957,-0.087421,-0.103379,0.081976
4,2,88,16,0,15,7,0,4.500000,2,0.0,...,0.057264,0.170978,0.115285,-0.116880,-0.032356,-0.141594,-0.086153,0.178350,0.097697,-0.024190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,83,11,0,11,2,1,6.636364,5,0.0,...,-0.039399,0.087481,-0.130953,-0.070793,-0.049512,0.022383,-0.095802,-0.184718,0.113096,-0.097487
7609,0,125,20,2,17,9,0,5.300000,5,0.0,...,0.178595,0.109997,0.183553,0.051309,0.149375,0.124732,-0.010233,-0.038208,-0.108181,-0.098354
7610,0,65,8,0,8,2,1,7.250000,11,0.0,...,-0.012148,0.031381,0.024795,-0.024511,0.059414,-0.022065,-0.005857,-0.066975,0.035344,0.117223
7611,0,137,19,0,19,5,0,6.263158,5,0.0,...,0.125147,-0.162566,0.253979,0.003558,0.055036,-0.072834,-0.208830,-0.174954,0.058478,-0.118365


In [38]:
count_matrix = vectorizer.transform(test_df['cleaned_text'])
transformed = svd.transform(count_matrix)
bow_df = pd.DataFrame(transformed)
test_df = pd.concat([test_df, bow_df], axis = 1).drop(['cleaned_text'], axis = 1)
test_df

Unnamed: 0,hashtags_count,longitud_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,x0_ablaze,...,90,91,92,93,94,95,96,97,98,99
0,0,34,6,0,6,2,0,4.833333,0,0.0,...,-0.068436,0.041659,0.048278,-0.138354,0.143883,-0.073693,0.031290,0.040446,-0.013780,-0.117345
1,1,64,9,0,9,2,0,6.222222,3,0.0,...,-0.002702,0.027568,0.057547,-0.022631,0.194848,0.062614,0.074146,-0.061645,-0.130015,0.225413
2,0,96,19,0,19,9,0,4.105263,2,0.0,...,-0.104052,0.042090,0.084979,0.014702,-0.155721,-0.004691,0.001273,-0.129142,0.122939,-0.117817
3,2,40,4,0,4,0,0,9.250000,3,0.0,...,0.008102,-0.031915,0.024307,0.050729,0.043847,0.006539,0.098550,-0.088659,0.038176,0.034863
4,0,45,8,0,8,2,0,4.750000,0,0.0,...,0.109335,0.010639,0.053875,0.057294,-0.063591,0.100343,0.082130,-0.124887,0.180426,0.112113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,55,8,0,7,0,0,6.000000,0,0.0,...,-0.028699,0.033536,-0.008920,-0.058059,0.136809,-0.023200,0.035400,-0.121111,0.059514,0.163377
3259,0,139,23,0,22,6,0,5.086957,5,0.0,...,0.136332,0.271629,-0.179611,0.317913,-0.202079,0.014635,0.324979,-0.207263,0.029508,-0.248993
3260,0,55,6,0,6,1,1,8.333333,5,0.0,...,-0.020520,0.014194,0.034448,-0.078340,-0.001244,0.050648,-0.022320,-0.029590,-0.023879,0.056699
3261,0,65,7,0,7,0,1,8.428571,7,0.0,...,0.160399,-0.045721,0.027293,-0.045845,0.013197,-0.101579,-0.267951,0.114358,0.095710,-0.159530


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

eval_set = [(X_train, y_train), (X_test, y_test)]

params = {
    'learning_rate':0.1,
    'n_estimators':1000,
    'max_depth':5,
    'min_child_weight':1,
    'gamma':0,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'objective':'binary:logistic',
    'nthread':4,
    'scale_pos_weight':1,
    'seed':27
}

#Busco la cantidad optima de arboles

xgboost = XGBClassifier(**params)

xgboost.fit(X_train, y_train, eval_set = eval_set, early_stopping_rounds = 50)

print('Train Score', xgboost.score(X_train, y_train))
print('Test Score', xgboost.score(X_test, y_test))

[0]	validation_0-error:0.25539	validation_1-error:0.27731
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.23507	validation_1-error:0.27048
[2]	validation_0-error:0.23261	validation_1-error:0.25840
[3]	validation_0-error:0.22543	validation_1-error:0.24580
[4]	validation_0-error:0.21405	validation_1-error:0.25052
[5]	validation_0-error:0.21124	validation_1-error:0.24895
[6]	validation_0-error:0.20424	validation_1-error:0.24212
[7]	validation_0-error:0.20529	validation_1-error:0.24422
[8]	validation_0-error:0.19653	validation_1-error:0.23582
[9]	validation_0-error:0.19863	validation_1-error:0.23372
[10]	validation_0-error:0.19828	validation_1-error:0.23319
[11]	validation_0-error:0.19180	validation_1-error:0.23582
[12]	validation_0-error:0.19495	validation_1-error:0.23739
[13]	validation_0-error:0.19040	validation_1-error:0.24107
[14]	validation_0-error

In [40]:
n_estimators = xgboost.get_booster().best_ntree_limit
params['n_estimators'] = n_estimators
n_estimators

65

In [41]:
%%time

param_test = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
cv = GridSearchCV(estimator = XGBClassifier(**params), param_grid = param_test,\
                  n_jobs=-1, scoring = 'f1')
cv.fit(X_train,y_train)
pd.DataFrame(cv.cv_results_), cv.best_params_, cv.best_score_

CPU times: user 8.65 s, sys: 297 ms, total: 8.94 s
Wall time: 2min 30s


(    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0        6.477475      0.170623         0.108559        0.034652   
 1        6.014943      0.174778         0.093745        0.011238   
 2        5.293064      0.568080         0.076185        0.019792   
 3        7.741161      0.219140         0.065270        0.008012   
 4        6.057823      0.434550         0.070459        0.009106   
 5        7.437628      0.719098         0.075873        0.002381   
 6       12.071527      0.664648         0.077902        0.007664   
 7       11.609182      0.623155         0.067920        0.012581   
 8       10.771276      1.274534         0.075311        0.006240   
 9       16.131578      0.886634         0.094913        0.013974   
 10      13.953627      1.361337         0.079884        0.005316   
 11      10.053705      2.428238         0.053736        0.018074   
 
    param_max_depth param_min_child_weight  \
 0                3                      1   
 1       

In [42]:
%%time

param_test = {
 'max_depth':[6,7,8],
 'min_child_weight':[1,2]
}
cv = GridSearchCV(estimator = XGBClassifier(**params), param_grid = param_test,\
                        n_jobs=-1, scoring = 'f1')
cv.fit(X_train,y_train)
pd.DataFrame(cv.cv_results_), cv.best_params_, cv.best_score_

CPU times: user 8.65 s, sys: 96.4 ms, total: 8.75 s
Wall time: 1min 26s


(   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0       9.617438      0.595955         0.072664        0.010980   
 1       9.805320      0.639467         0.080722        0.014102   
 2      12.012328      0.660455         0.074305        0.004494   
 3      11.018944      0.788283         0.078716        0.003688   
 4      12.630615      0.795383         0.074334        0.004190   
 5      10.260005      2.790450         0.051130        0.022231   
 
   param_max_depth param_min_child_weight  \
 0               6                      1   
 1               6                      2   
 2               7                      1   
 3               7                      2   
 4               8                      1   
 5               8                      2   
 
                                     params  split0_test_score  \
 0  {'max_depth': 6, 'min_child_weight': 1}           0.693333   
 1  {'max_depth': 6, 'min_child_weight': 2}           0.703003   
 2  {'m

In [43]:
params['max_depth'] = 7
params['min_child_weight'] = 2

In [44]:
%%time

param_test = {
 'gamma':[i/10.0 for i in range(0,5)]
}
cv = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=n_estimators, max_depth=7,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test, n_jobs=-1, scoring = 'f1')
cv.fit(X_train,y_train)
pd.DataFrame(cv.cv_results_), cv.best_params_, cv.best_score_

CPU times: user 8.64 s, sys: 63.5 ms, total: 8.71 s
Wall time: 1min 9s


(   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_gamma  \
 0      10.734821      0.733003         0.082224        0.006439           0   
 1      10.813193      0.651469         0.074723        0.004265         0.1   
 2      10.791720      1.250061         0.070501        0.007258         0.2   
 3       9.955178      1.487613         0.075962        0.004127         0.3   
 4       8.903181      2.387891         0.053421        0.021269         0.4   
 
            params  split0_test_score  split1_test_score  split2_test_score  \
 0  {'gamma': 0.0}           0.715556           0.729387           0.734289   
 1  {'gamma': 0.1}           0.714127           0.728033           0.741474   
 2  {'gamma': 0.2}           0.706667           0.728814           0.739274   
 3  {'gamma': 0.3}           0.705100           0.725367           0.737991   
 4  {'gamma': 0.4}           0.707965           0.734258           0.743229   
 
    split3_test_score  split4_test_score  

In [45]:
%%time

#Vuelvo a buscar el numero de arboles optimo para los nuevos hiperparametros
params['n_estimators'] = 1000

xgboost = XGBClassifier(**params)

xgboost.fit(X_train, y_train,eval_set = eval_set, early_stopping_rounds = 50)

print('Train Score', xgboost.score(X_train, y_train))
print('Test Score', xgboost.score(X_test, y_test))

[0]	validation_0-error:0.21895	validation_1-error:0.26365
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.19653	validation_1-error:0.26155
[2]	validation_0-error:0.19040	validation_1-error:0.24842
[3]	validation_0-error:0.18042	validation_1-error:0.23897
[4]	validation_0-error:0.17008	validation_1-error:0.23057
[5]	validation_0-error:0.16430	validation_1-error:0.23162
[6]	validation_0-error:0.15940	validation_1-error:0.23372
[7]	validation_0-error:0.15327	validation_1-error:0.22637
[8]	validation_0-error:0.15257	validation_1-error:0.22637
[9]	validation_0-error:0.14854	validation_1-error:0.22059
[10]	validation_0-error:0.14889	validation_1-error:0.22006
[11]	validation_0-error:0.14749	validation_1-error:0.21849
[12]	validation_0-error:0.14486	validation_1-error:0.21691
[13]	validation_0-error:0.14276	validation_1-error:0.21954
[14]	validation_0-error

In [46]:
n_estimators = xgboost.get_booster().best_ntree_limit
params['n_estimators'] = n_estimators
n_estimators

90

In [47]:
%%time

param_test = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

cv = GridSearchCV(estimator = XGBClassifier(**params), param_grid = param_test,\
                  n_jobs=-1, scoring = 'f1')
cv.fit(X_train,y_train)
pd.DataFrame(cv.cv_results_), cv.best_params_, cv.best_score_

CPU times: user 9.25 s, sys: 153 ms, total: 9.4 s
Wall time: 5min


(    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0        9.643356      2.382829         0.080233        0.007758   
 1       12.833365      1.621034         0.074400        0.010514   
 2       13.934335      0.935766         0.091235        0.022558   
 3       13.520961      1.252291         0.074003        0.009601   
 4       13.709084      1.993249         0.079113        0.005034   
 5       13.968434      1.195695         0.077733        0.012731   
 6       15.648760      0.885464         0.075828        0.012601   
 7       15.039387      1.174774         0.077131        0.007746   
 8       16.227426      0.925759         0.076019        0.002261   
 9       15.482177      1.270868         0.078821        0.007623   
 10      16.482877      1.061980         0.067182        0.005339   
 11      15.906068      0.758879         0.079616        0.010872   
 12      16.290434      0.380233         0.079688        0.008072   
 13      16.067984      0.264690  

In [48]:
%%time

param_test = {
 'colsample_bytree':[i/10.0 for i in range(4,7)],
 'subsample':[i/100.0 for i in range(85,100,5)]
}

cv = GridSearchCV(estimator = XGBClassifier(**params), param_grid = param_test,\
                  n_jobs=-1, scoring = 'f1')
cv.fit(X_train,y_train)
pd.DataFrame(cv.cv_results_), cv.best_params_, cv.best_score_

CPU times: user 7.85 s, sys: 154 ms, total: 8.01 s
Wall time: 2min 3s


(   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0       9.387490      1.633358         0.080788        0.009120   
 1       8.681468      0.579147         0.075009        0.010359   
 2       9.078828      0.864635         0.082587        0.009509   
 3      10.900328      1.067851         0.077966        0.008577   
 4      10.970867      0.630838         0.081556        0.004846   
 5      10.777778      0.974785         0.093706        0.021471   
 6      12.257357      0.847964         0.075163        0.005707   
 7      11.903859      0.720983         0.079391        0.016700   
 8      11.458119      1.626601         0.056121        0.024680   
 
   param_colsample_bytree param_subsample  \
 0                    0.4            0.85   
 1                    0.4             0.9   
 2                    0.4            0.95   
 3                    0.5            0.85   
 4                    0.5             0.9   
 5                    0.5            0.95   
 6 

In [49]:
params['subsample'] = 0.85

In [50]:
%%time

param_test = {
 'colsample_bytree':[i/100.0 for i in range(45,60,5)]
}

cv = GridSearchCV(estimator = XGBClassifier(**params), param_grid = param_test,\
                  n_jobs=-1, scoring = 'f1')
cv.fit(X_train,y_train)
pd.DataFrame(cv.cv_results_), cv.best_params_, cv.best_score_

CPU times: user 7.75 s, sys: 81.8 ms, total: 7.83 s
Wall time: 45.7 s


(   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      10.475254      0.784338         0.078667        0.006653   
 1      11.854054      0.325791         0.077756        0.010011   
 2      11.252456      3.050187         0.049751        0.020028   
 
   param_colsample_bytree                      params  split0_test_score  \
 0                   0.45  {'colsample_bytree': 0.45}           0.711111   
 1                    0.5   {'colsample_bytree': 0.5}           0.718404   
 2                   0.55  {'colsample_bytree': 0.55}           0.711454   
 
    split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
 0           0.732484           0.717778           0.690187           0.715217   
 1           0.741214           0.735327           0.693391           0.713666   
 2           0.726305           0.731654           0.688742           0.700431   
 
    mean_test_score  std_test_score  rank_test_score  
 0         0.713356        0.013643

In [51]:
params['colsample_bytree'] = 0.5

In [52]:
%%time

#Vemos los resultados con los hiperparametros tuneados
params['n_estimators'] = 1000

xgboost = XGBClassifier(**params)

xgboost.fit(X_train, y_train, eval_set = eval_set, early_stopping_rounds = 50)

print('Train Score', xgboost.score(X_train, y_train))
print('Test Score', xgboost.score(X_test, y_test))

[0]	validation_0-error:0.23594	validation_1-error:0.28309
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.20354	validation_1-error:0.26786
[2]	validation_0-error:0.19688	validation_1-error:0.26418
[3]	validation_0-error:0.18497	validation_1-error:0.24527
[4]	validation_0-error:0.18059	validation_1-error:0.24370
[5]	validation_0-error:0.16921	validation_1-error:0.24527
[6]	validation_0-error:0.16588	validation_1-error:0.23739
[7]	validation_0-error:0.16097	validation_1-error:0.23372
[8]	validation_0-error:0.15729	validation_1-error:0.22637
[9]	validation_0-error:0.15011	validation_1-error:0.22899
[10]	validation_0-error:0.14976	validation_1-error:0.22899
[11]	validation_0-error:0.14591	validation_1-error:0.22742
[12]	validation_0-error:0.14346	validation_1-error:0.22269
[13]	validation_0-error:0.14136	validation_1-error:0.22216
[14]	validation_0-error

In [53]:
n_estimators = xgboost.get_booster().best_ntree_limit
params['n_estimators'] = n_estimators
n_estimators

78

In [54]:
xgboost = XGBClassifier(**params)

xgboost.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=78, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=27, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=27, subsample=0.85, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [55]:
predicts = xgboost.predict(test_df)

In [56]:
submit = pd.read_csv('setDeDatos/nlp-getting-started/sample_submission.csv')
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [57]:
submit['target'].value_counts()

0    3263
Name: target, dtype: int64

In [58]:
submit['target'] = pd.DataFrame({'target':predicts})
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [59]:
submit['target'].value_counts()

0    2098
1    1165
Name: target, dtype: int64

In [60]:
submit.to_csv('Submits/submit_bow_xgboost.csv', index=False)