In [1]:
import os
import pandas as pd
import re

DATA_PATH = "data/raw"

In [3]:
os.chdir("./../")

In [4]:
df_train = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

print ("Train set:", df_train.shape)
print ("Test set:", df_test.shape)

df_train.tail()

Train set: (7613, 5)
Test set: (3263, 4)


Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


## Keyword

In [7]:
def clean_keyword(keyword: str) -> str:
    if not isinstance(keyword, str):
        return None

    return keyword.replace("%20", " ").lower()

df_train["keyword"] = df_train["keyword"].apply(clean_keyword)
df_test["keyword"] = df_test["keyword"].apply(clean_keyword)

In [21]:
df_train#["keyword"].unique()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [25]:
df_train["location"].unique()

array([nan, 'Birmingham', 'Est. September 2012 - Bristol', ...,
       'Vancouver, Canada', 'London ', 'Lincoln'], dtype=object)

In [8]:
df_train[~df_train["location"].isna()]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0t...,0
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thoug...,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words tha...,0


In [9]:
def get_text_tags(text: str, regex=r"#") -> list:
    return re.findall(regex, text)


text = 'HAPPENING NOW - HATZOLAH EMS AMBULANCE RESPONDING WITH DUAL SIRENS AND\x89Û_ https://t.co/SeK6MQ6NJF https://t.co/SeK6MQ6NJF'
get_text_tags(text, regex=r"https://t.co/\w+")

['https://t.co/SeK6MQ6NJF', 'https://t.co/SeK6MQ6NJF']

In [10]:
regex = r"@\w+"

df_train["profile_tags"] = df_train["text"].apply(lambda x: get_text_tags(x, regex=regex))
df_test["profile_tags"] = df_train["text"].apply(lambda x: get_text_tags(x, regex=regex))
df_train[["text", "profile_tags"]]

Unnamed: 0,text,profile_tags
0,Our Deeds are the Reason of this #earthquake M...,[]
1,Forest fire near La Ronge Sask. Canada,[]
2,All residents asked to 'shelter in place' are ...,[]
3,"13,000 people receive #wildfires evacuation or...",[]
4,Just got sent this photo from Ruby #Alaska as ...,[]
...,...,...
7608,Two giant cranes holding a bridge collapse int...,[]
7609,@aria_ahrary @TheTawniest The out of control w...,"[@aria_ahrary, @TheTawniest]"
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,[]
7611,Police investigating after an e-bike collided ...,[]


In [11]:
regex = r"#\w+"
df_train["hash_tags"] = df_train["text"].apply(lambda x: get_text_tags(x, regex=regex))
df_test["hash_tags"] = df_train["text"].apply(lambda x: get_text_tags(x, regex=regex))
df_train[["text", "hash_tags"]]

Unnamed: 0,text,hash_tags
0,Our Deeds are the Reason of this #earthquake M...,[#earthquake]
1,Forest fire near La Ronge Sask. Canada,[]
2,All residents asked to 'shelter in place' are ...,[]
3,"13,000 people receive #wildfires evacuation or...",[#wildfires]
4,Just got sent this photo from Ruby #Alaska as ...,"[#Alaska, #wildfires]"
...,...,...
7608,Two giant cranes holding a bridge collapse int...,[]
7609,@aria_ahrary @TheTawniest The out of control w...,[]
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,[]
7611,Police investigating after an e-bike collided ...,[]


In [12]:
regex = r"https://t.co/\w+"
df_train["link_tags"] = df_train["text"].apply(lambda x: get_text_tags(x, regex=regex))
df_test["link_tags"] = df_train["text"].apply(lambda x: get_text_tags(x, regex=regex))
# df_train[["text", "link_tags"]].loc[200]

In [13]:
def count_character(text: str, character: str) -> int:
    return len([char for char in str(text) if char == character])

count_character("Oh my god!!!", "!")

3

In [14]:
df_test["question_marks"] = df_test["text"].apply(lambda x: count_character(x, "?"))
df_test["question_marks"] = df_test["text"].apply(lambda x: count_character(x, "?"))
df_test

Unnamed: 0,id,keyword,location,text,profile_tags,hash_tags,link_tags,question_marks
0,0,,,Just happened a terrible car crash,[],[#earthquake],[],0
1,2,,,"Heard about #earthquake is different cities, s...",[],[],[],0
2,3,,,"there is a forest fire at spot pond, geese are...",[],[],[],0
3,9,,,Apocalypse lighting. #Spokane #wildfires,[],[#wildfires],[],0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,[],"[#Alaska, #wildfires]",[],0
...,...,...,...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,[],[],[],0
3259,10865,,,Storm in RI worse than last hurricane. My city...,[],[],[],0
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,[],[],[],0
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,[],"[#ProphetMuhammad, #islam]",[],0


In [15]:
df_train["text"][200]

'HAPPENING NOW - HATZOLAH EMS AMBULANCE RESPONDING WITH DUAL SIRENS AND\x89Û_ https://t.co/SeK6MQ6NJF'