In [1]:
# Load the "autoreload" extension so that code can change
%load_ext autoreload

# Always reload modules
%autoreload 2

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import nltk
import os
import pandas as pd
import re

In [3]:
# Get stop words
nltk.download('stopwords')
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eyosyasdagnachew/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def preprocess_text(text):
    """ Given a piece of text (a tweet), preprocesses text according to paper.
    
    Text preprocessing steps:
    - Remove stop words
    - Remove non-ASCII characters
    - Remove numbers
    - Remove URLs
    - Remove hashtags
    - Replace all punctuation marks with white-spaces

    NOTE: I would do more preprocessing (e.g. remove 'RT' and the mentioned account in retweets)
          but this is the only thing mentioned in the paper.
    """

    # Convert text to lowercase
    cleaned_text = text.lower()

    # Remove non-ASCII characters 
    cleaned_text = cleaned_text.encode(encoding="ascii", errors="ignore").decode()

    # Remove numbers
    cleaned_text = re.sub(r"[0-9]", "", cleaned_text)

    # Remove URLs
    cleaned_text = re.sub(r'http\S+', '', cleaned_text)

    # Remove hashtag signs
    cleaned_text = cleaned_text.replace("#", "")

    # Replace all punctuation marks with white-spaces
    cleaned_text = re.sub(r"[,.:;@#?!&$]+\ *", " ", cleaned_text)

    # Remove stop words
    tokenized_text = TweetTokenizer().tokenize(cleaned_text)
    cleaned_text = " ".join([token for token in tokenized_text if token not in stop_words])

    return cleaned_text

In [10]:
humanitarian_train_filepath = "../data/raw/crisismmd_datasplit_agreed_label/task_humanitarian_text_img_agreed_lab_train.tsv"
humanitarian_train_df = pd.read_csv(humanitarian_train_filepath, sep="\t")
humanitarian_train_df

Unnamed: 0,event_name,tweet_id,image_id,tweet_text,image,label,label_text,label_image,label_text_image
0,california_wildfires,917793137925459968,917793137925459968_0,RT @KAKEnews: California wildfires destroy mor...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive
1,california_wildfires,917793137925459968,917793137925459968_1,RT @KAKEnews: California wildfires destroy mor...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive
2,california_wildfires,917793137925459968,917793137925459968_2,RT @KAKEnews: California wildfires destroy mor...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive
3,california_wildfires,917815040962695168,917815040962695168_2,RT @TheAtlantic: Photos of California's destru...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive
4,california_wildfires,917828283047260161,917828283047260161_0,Why California's #wildfires are worse in the f...,data_image/california_wildfires/10_10_2017/917...,other_relevant_information,other_relevant_information,other_relevant_information,Positive
...,...,...,...,...,...,...,...,...,...
6121,hurricane_maria,922480484227088384,922480484227088384_0,Who’s on The One Show tonight? Sir David Jason...,data_image/hurricane_maria/23_10_2017/92248048...,not_humanitarian,not_humanitarian,not_humanitarian,Positive
6122,hurricane_maria,922929267247169536,922929267247169536_0,@MMFlint ....Trump and Friends find $$$ in Pue...,data_image/hurricane_maria/24_10_2017/92292926...,not_humanitarian,not_humanitarian,not_humanitarian,Positive
6123,hurricane_harvey,908172698790055936,908172698790055936_0,The song Donna and Harvey kissed to á½ D #Darv...,data_image/hurricane_harvey/14_9_2017/90817269...,not_humanitarian,not_humanitarian,not_humanitarian,Positive
6124,hurricane_harvey,906790326215614466,906790326215614466_0,Atlantic Hrcne Season 2017 Major Hrcne Irma Ca...,data_image/hurricane_harvey/10_9_2017/90679032...,other_relevant_information,other_relevant_information,other_relevant_information,Positive


In [8]:
humanitarian_train_df["preprocessed_text"] = humanitarian_train_df["tweet_text"].apply(lambda x: preprocess_text(str(x)))
humanitarian_train_df

Unnamed: 0,event_name,tweet_id,image_id,tweet_text,image,label,label_text,label_image,label_text_image,preprocessed_text
0,california_wildfires,917793137925459968,917793137925459968_0,RT @KAKEnews: California wildfires destroy mor...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive,rt kakenews california wildfires destroy struc...
1,california_wildfires,917793137925459968,917793137925459968_1,RT @KAKEnews: California wildfires destroy mor...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive,rt kakenews california wildfires destroy struc...
2,california_wildfires,917793137925459968,917793137925459968_2,RT @KAKEnews: California wildfires destroy mor...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive,rt kakenews california wildfires destroy struc...
3,california_wildfires,917815040962695168,917815040962695168_2,RT @TheAtlantic: Photos of California's destru...,data_image/california_wildfires/10_10_2017/917...,infrastructure_and_utility_damage,infrastructure_and_utility_damage,infrastructure_and_utility_damage,Positive,rt theatlantic photos california's destructive...
4,california_wildfires,917828283047260161,917828283047260161_0,Why California's #wildfires are worse in the f...,data_image/california_wildfires/10_10_2017/917...,other_relevant_information,other_relevant_information,other_relevant_information,Positive,california's wildfires worse fall
...,...,...,...,...,...,...,...,...,...,...
6121,hurricane_maria,922480484227088384,922480484227088384_0,Who’s on The One Show tonight? Sir David Jason...,data_image/hurricane_maria/23_10_2017/92248048...,not_humanitarian,not_humanitarian,not_humanitarian,Positive,whos one show tonight sir david jason joins al...
6122,hurricane_maria,922929267247169536,922929267247169536_0,@MMFlint ....Trump and Friends find $$$ in Pue...,data_image/hurricane_maria/24_10_2017/92292926...,not_humanitarian,not_humanitarian,not_humanitarian,Positive,mmflint trump friends find puerto rico trumpra...
6123,hurricane_harvey,908172698790055936,908172698790055936_0,The song Donna and Harvey kissed to á½ D #Darv...,data_image/hurricane_harvey/14_9_2017/90817269...,not_humanitarian,not_humanitarian,not_humanitarian,Positive,song donna harvey kissed darvey perfectmoment ...
6124,hurricane_harvey,906790326215614466,906790326215614466_0,Atlantic Hrcne Season 2017 Major Hrcne Irma Ca...,data_image/hurricane_harvey/10_9_2017/90679032...,other_relevant_information,other_relevant_information,other_relevant_information,Positive,atlantic hrcne season major hrcne irma cat mia...


In [9]:
humanitarian_train_df.iloc[8]["preprocessed_text"]

'playing new friend chai california fire evacuee starting us days'