In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from category_encoders import LeaveOneOutEncoder
from category_encoders.wrapper import NestedCVWrapper

In [2]:
# Read both splits; the first CSV column is used as the index.
train_dataset = pd.read_csv('../dataset/train.csv', index_col=0)
test_dataset = pd.read_csv('../dataset/test.csv', index_col=0)

# Stack the splits (test first, then train) and order the rows by index so the
# keyword features below are computed over all rows at once.
# NOTE(review): a `dataset.drop('target', axis=1)` step used to sit here, commented
# out, yet the rendered outputs show no `target` column — confirm whether it
# should be restored.
dataset = pd.concat([test_dataset, train_dataset]).sort_index()
dataset

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,,Just happened a terrible car crash
1,,,Our Deeds are the Reason of this #earthquake M...
2,,,"Heard about #earthquake is different cities, s..."
3,,,"there is a forest fire at spot pond, geese are..."
4,,,Forest fire near La Ronge Sask. Canada
...,...,...,...
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
10872,,,Police investigating after an e-bike collided ...
10873,,,The Latest: More Homes Razed by Northern Calif...
10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [3]:
# 1 when the row has a keyword, 0 when the keyword is missing.
dataset['has_keyword'] = dataset['keyword'].notnull().astype('int64')

In [4]:
# Replace missing keywords with an explicit placeholder category.
NO_KEYWORD_PLACEHOLDER = 'no_keyword'
dataset['keyword'] = dataset['keyword'].fillna(NO_KEYWORD_PLACEHOLDER)

In [5]:
# Rows per keyword. Kept as a one-column DataFrame (column 'has_keyword') so the
# `keyword_count` variable has the same shape as before.
keyword_count = dataset[['keyword', 'has_keyword']].groupby('keyword').count()

# Vectorised lookup of each row's keyword count. The previous per-row
# `keyword_count.loc[x]` returned a one-element Series for every row, which made
# `apply` assemble a DataFrame and performed one label lookup per row.
dataset['keyword_count'] = dataset['keyword'].map(keyword_count['has_keyword'])

# Number of distinct keyword values (one group per keyword).
keyword_value_count = len(keyword_count)

# NOTE(review): despite the name, this is the keyword's row count divided by the
# number of distinct keywords, not an average — confirm this is the intended feature.
dataset['keyword_mean'] = dataset['keyword_count'] / keyword_value_count

In [99]:
# Flag rows whose text contains their keyword, compared case-insensitively.
dataset['keyword_in_text'] = [
    keyword.lower() in text.lower()
    for keyword, text in zip(dataset['keyword'], dataset['text'])
]

In [100]:
# Preview the first 50 rows to check the keyword columns added so far.
dataset.head(50)

Unnamed: 0_level_0,keyword,location,text,has_keyword,keyword_count,keyword_mean,keyword_in_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,no_keyword,,Just happened a terrible car crash,0,87,0.391892,False
1,no_keyword,,Our Deeds are the Reason of this #earthquake M...,0,87,0.391892,False
2,no_keyword,,"Heard about #earthquake is different cities, s...",0,87,0.391892,False
3,no_keyword,,"there is a forest fire at spot pond, geese are...",0,87,0.391892,False
4,no_keyword,,Forest fire near La Ronge Sask. Canada,0,87,0.391892,False
5,no_keyword,,All residents asked to 'shelter in place' are ...,0,87,0.391892,False
6,no_keyword,,"13,000 people receive #wildfires evacuation or...",0,87,0.391892,False
7,no_keyword,,Just got sent this photo from Ruby #Alaska as ...,0,87,0.391892,False
8,no_keyword,,#RockyFire Update => California Hwy. 20 closed...,0,87,0.391892,False
9,no_keyword,,Apocalypse lighting. #Spokane #wildfires,0,87,0.391892,False


In [80]:
# Encode `keyword` against the target with a leave-one-out encoder wrapped in
# nested cross-validation. Missing keywords get the same 'no_keyword'
# placeholder in both splits before encoding.
train_keywords = train_dataset[['keyword']].fillna("no_keyword")
test_keywords = test_dataset[['keyword']].fillna("no_keyword")

enc_nested = NestedCVWrapper(LeaveOneOutEncoder(cols=['keyword']), random_state=42)
X_train_enc, X_test_enc = enc_nested.fit_transform(
    train_keywords,
    train_dataset['target'],
    X_test=test_keywords,
)

# Re-attach the training index to the encoded training frame.
# NOTE(review): presumably needed because fit_transform does not keep the input
# index for the training output — confirm against the category_encoders docs.
X_train_enc = X_train_enc.set_index(train_dataset.index)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [83]:
# Inspect the encoded training keywords.
X_train_enc

Unnamed: 0_level_0,keyword
id,Unnamed: 1_level_1
1,0.660000
4,0.702128
5,0.702128
6,0.702128
7,0.652174
...,...
10869,0.652174
10870,0.652174
10871,0.652174
10872,0.652174


In [79]:
# Inspect the encoded keywords returned for the test split.
X_test_enc

Unnamed: 0_level_0,keyword
id,Unnamed: 1_level_1
0,0.688525
2,0.688525
3,0.688525
9,0.688525
11,0.688525
...,...
10861,0.688525
10865,0.688525
10868,0.688525
10874,0.688525


In [101]:
# Stack the encoded keyword of both splits and give the column its output name.
encoded_keyword = pd.concat([X_train_enc, X_test_enc]).rename(
    columns={'keyword': 'encoded_keyword'}
)

# Index-on-index merge (inner join, the `merge` default) of the hand-built
# keyword columns with the encoded keyword.
basic_columns = ['has_keyword', 'keyword_count', 'keyword_mean', 'keyword_in_text']
features = dataset[basic_columns].merge(
    encoded_keyword, left_index=True, right_index=True
)
features.head(50)

Unnamed: 0_level_0,has_keyword,keyword_count,keyword_mean,keyword_in_text,encoded_keyword
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,87,0.391892,False,0.688525
1,0,87,0.391892,False,0.66
2,0,87,0.391892,False,0.688525
3,0,87,0.391892,False,0.688525
4,0,87,0.391892,False,0.702128
5,0,87,0.391892,False,0.702128
6,0,87,0.391892,False,0.702128
7,0,87,0.391892,False,0.652174
8,0,87,0.391892,False,0.66
9,0,87,0.391892,False,0.688525


In [95]:
# Write the keyword feature table to disk, keeping the index as a CSV column.
features_path = '../features/features_basicas_keyword.csv'
features.to_csv(features_path, index=True)