In [1]:
import pandas as pd
import numpy as np
import spacy
import re, string

In [2]:
train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

In [3]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## Lowercasing

In [4]:
# train set
train_df['keyword'] = train_df['keyword'].apply(
    lambda x: str.lower(x) if pd.isna(x) != True else x)

train_df['location'] = train_df['location'].apply(
    lambda x: str.lower(x) if pd.isna(x) != True else x)

train_df['text'] = train_df['text'].apply(lambda x: str.lower(x))

# test set
test_df['keyword'] = test_df['keyword'].apply(
    lambda x: str.lower(x) if pd.isna(x) != True else x)

test_df['location'] = test_df['location'].apply(
    lambda x: str.lower(x) if pd.isna(x) != True else x)

test_df['text'] = test_df['text'].apply(lambda x: str.lower(x))

In [5]:
def remove_entities(text):
    prefixes = ['@', '#']
    for seperator in string.punctuation:
        if seperator not in prefixes:
            text = text.replace(seperator, ' ')
            
    words = []
    for word in text.split():
        word = word.strip()
        if word[0] not in prefixes:
            words.append(word)
    
    return ' '.join(words)

In [6]:
# train set
train_df['keyword'] = train_df['keyword'].apply(
        lambda x: remove_entities(x) if pd.isna(x) != True else x)

train_df['location'] = train_df['location'].apply(
        lambda x: remove_entities(x) if pd.isna(x) != True else x)

train_df['text'] = train_df['text'].apply(lambda x: remove_entities(x))

# test set
test_df['keyword'] = test_df['keyword'].apply(
        lambda x: remove_entities(x) if pd.isna(x) != True else x)

test_df['location'] = test_df['location'].apply(
        lambda x: remove_entities(x) if pd.isna(x) != True else x)

test_df['text'] = test_df['text'].apply(lambda x: remove_entities(x))

In [7]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this may allah for...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13 000 people receive evacuation orders in cal...,1
4,7,,,just got sent this photo from ruby as smoke fr...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1
7609,10870,,,ahrary the out of control wild fires in califo...,1
7610,10871,,,m1 94 01 04 utc 5km s of volcano hawaii http t...,1
7611,10872,,,police investigating after an e bike collided ...,1


## Remove URL links

In [8]:
def url_remover(text):
    text = re.sub(
        r'/(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*         (?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])/igm', ' ', text)
    
    return text

In [9]:
# train set
train_df['keyword'] = train_df['keyword'].apply(
    lambda x: url_remover(x) if pd.isna(x) != True else x)

train_df['location'] = train_df['location'].apply(
    lambda x: url_remover(x) if pd.isna(x) != True else x)

train_df['text'] = train_df['text'].apply(lambda x: url_remover(x))

# test set
test_df['keyword'] = test_df['keyword'].apply(
    lambda x: url_remover(x) if pd.isna(x) != True else x)

test_df['location'] = test_df['location'].apply(
    lambda x: url_remover(x) if pd.isna(x) != True else x)

test_df['text'] = test_df['text'].apply(lambda x: url_remover(x))

In [11]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this may allah for...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13 000 people receive evacuation orders in cal...,1
4,7,,,just got sent this photo from ruby as smoke fr...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1
7609,10870,,,ahrary the out of control wild fires in califo...,1
7610,10871,,,m1 94 01 04 utc 5km s of volcano hawaii http t...,1
7611,10872,,,police investigating after an e bike collided ...,1
