### Data cleaning pipeline Demo

Import preprocessed sample dataset

In [1]:
import pandas as pd

df_prepro = pd.read_csv('../data/sample/corona_sample.csv', index_col = 0)

Clean the data with the following steps (in sequence):
- Remove username
- Remove emoticon
- Remove url
- Remove html
- Remove stopwords
- Perform lemmatisation
- Remove unknown words (including acronyms)
- Remove tweets that are shorter than minimum length
- Remove duplicated rows (based on the tweet text)
- Convert the created_date to date_time format

In [2]:
import re

def remove_username(entry):
    pattern = r'@.+?\s'
    output = re.sub(pattern, '', entry).strip()
    return output

df_prepro['tweet'] = df_prepro.tweet.apply(remove_username)

In [3]:
def remove_emoticon(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

df_prepro['tweet'] = df_prepro.tweet.apply(remove_emoticon)

In [4]:
def remove_url(entry):
    pattern = r'http\S+'
    output = re.sub(pattern, '', entry).strip()
    return output

df_prepro['tweet'] = df_prepro.tweet.apply(remove_url)

In [5]:
def remove_html(entry):
    pattern = r'<.+?>'
    output = re.sub(pattern, '', entry).strip()
    return output

df_prepro['tweet'] = df_prepro.tweet.apply(remove_html)

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_word = set(stopwords.words('english'))

def remove_stop_word(entry):    
    output = [w for w in word_tokenize(entry) if w.lower() not in stop_word]
    output = ' '.join(output)
    return output

df_prepro['tweet'] = df_prepro.tweet.apply(remove_stop_word)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger') 

lemmatizer = WordNetLemmatizer()

def lemmat(entry):
    tokens = [word for word in word_tokenize(entry.lower()) if (word.isalpha() or word.isnumeric())]
    tags = nltk.pos_tag_sents([tokens])
    output = []
    for i,tk in enumerate(tokens):
        tag = tags[0][i][1][0]
        try:
            word = lemmatizer.lemmatize(tk, pos=tag.lower())
        except KeyError:
            word = tk
        output.append(word)
    output = ' '.join(output)
    return output

df_prepro['tweet'] = df_prepro.tweet.apply(lemmat)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import words
nltk.download('words')

english_word = set(words.words())

def remove_unknown_word(entry):    
    output = [w for w in word_tokenize(entry) if w.lower() in english_word]
    output = ' '.join(output)
    return output

df_prepro['tweet'] = df_prepro.tweet.apply(remove_unknown_word)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [9]:
long_tweet = df_prepro['tweet'].str.len() > 33
df_prepro = df_prepro[long_tweet]

In [10]:
df_prepro = df_prepro.drop_duplicates(subset='tweet', keep='first')

In [11]:
df_prepro.loc[:,'created_date'] = pd.to_datetime(df_prepro['created_date'])

Clean sample dataset have 7277 rows

In [12]:
df_prepro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7277 entries, 0 to 38216
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   created_date  7277 non-null   datetime64[ns, UTC]
 1   tweet         7277 non-null   object             
 2   tweet_id      7277 non-null   int64              
 3   sentiment     7277 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(1)
memory usage: 284.3+ KB


In [11]:
df_prepro.head(100)

Unnamed: 0,created_date,tweet,tweet_id,sentiment
0,Thu Mar 19 19:52:18 +0000 2020,lot would actually benefit take good look intr...,1240727821028405249,0.350000
2,Thu Mar 19 19:52:16 +0000 2020,somewhere studio sing body dey kill person like,1240727810249043969,0.000000
3,Thu Mar 19 19:52:16 +0000 2020,appreciate move fight corona never fan hater j,1240727809300901888,0.000000
4,Thu Mar 19 19:52:17 +0000 2020,everything fine world decide take premier leag...,1240727817555447808,0.416667
5,Thu Mar 19 19:52:18 +0000 2020,here delete scene special corona virus,1240727819119968256,0.357143
...,...,...,...,...
144,Thu Mar 19 19:52:19 +0000 2020,seriously call stupid come country large numbe...,1240727822089351168,-0.263095
145,Thu Mar 19 19:52:20 +0000 2020,new twist classic greater good queen,1240727828989165571,0.375758
148,Thu Mar 19 19:52:20 +0000 2020,generation z want name folk love call everybody,1240727829668560903,0.500000
151,Thu Mar 19 19:52:20 +0000 2020,please team researcher look volunteer translat...,1240727828989202437,0.500000


### Validation of the data cleaning pipeline script
'cd ..' is to make relative import work for jupyter notebook

In [14]:
cd ..

C:\Users\simon\OneDrive\Desktop\team5\team5


Load preprocessed sample dataset and perform cleaning with data cleaning script

In [15]:
import pandas as pd
from src.datapipeline.clean_data import clean_pipeline

df_prepro = pd.read_csv('data/sample/corona_sample.csv', index_col = 0)

df_clean = clean_pipeline(df_prepro, min_length=33)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\simon\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


The dataset cleaned by the script also have 7277 rows

In [16]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7277 entries, 0 to 38216
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   created_date  7277 non-null   datetime64[ns, UTC]
 1   tweet         7277 non-null   object             
 2   tweet_id      7277 non-null   int64              
 3   sentiment     7277 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(1)
memory usage: 284.3+ KB


In [17]:
df_clean.head(100)

Unnamed: 0,created_date,tweet,tweet_id,sentiment
0,2020-03-19 19:52:18+00:00,lot would actually benefit take good look intr...,1240727821028405249,0.350000
2,2020-03-19 19:52:16+00:00,somewhere studio sing body dey kill person like,1240727810249043969,0.000000
3,2020-03-19 19:52:16+00:00,appreciate move fight corona never fan hater j,1240727809300901888,0.000000
4,2020-03-19 19:52:17+00:00,everything fine world decide take premier leag...,1240727817555447808,0.416667
5,2020-03-19 19:52:18+00:00,here delete scene special corona virus,1240727819119968256,0.357143
...,...,...,...,...
227,2020-03-19 19:52:23+00:00,interest get name spiked ring protein surface ...,1240727839848181761,0.500000
229,2020-03-19 19:52:22+00:00,share corona virus vaccine owe life,1240727836899430405,0.000000
233,2020-03-19 19:52:22+00:00,inform say corona virus die degree guess turn ...,1240727835091841026,0.000000
235,2020-03-19 19:52:22+00:00,burnt as really think various conspiracy coron...,1240727836421492738,0.100000
