In [67]:
# Executes package_import.ipynb in this kernel's namespace.
# NOTE(review): the cells below use pd, re, string and SpellChecker without
# importing them here, so they presumably come from that notebook — confirm.
%run package_import.ipynb

In [50]:
import os

from sqlalchemy import create_engine
import pymysql  # not referenced by name; it is the driver named in the mysql+pymysql:// URL

# Read the connection URL from the environment so credentials do not have to
# live in the notebook. The fallback is the local development URL used so far,
# which keeps the notebook working unchanged when the variable is not set.
DB_URL = os.environ.get(
    'DISASTER_TWEETS_DB_URL',
    'mysql+pymysql://root:@127.0.0.1/nlp_with_disaster_tweets',
)

# pool_recycle=3600: pooled connections older than one hour are replaced
sqlEngine = create_engine(DB_URL, pool_recycle=3600)
# shared connection used by the read_sql_query / to_sql cells below
dbConnection = sqlEngine.connect()

#### Data cleansing

#### combine train and test data for the following data cleansing process

In [51]:
# Load train and test rows into a single frame so every cleansing step below
# runs once over both. source_file records which table a row came from; the
# test half selects Null as target (presumably raw_test has no target column)
# so both halves of the UNION ALL have the same columns.
df_combined = pd.read_sql_query(
    sql='''
select *
    ,'train' as source_file
from raw_train
union all
select *
    ,Null as target
    ,'test' as source_file
from raw_test

''',
    con=dbConnection,
)
df_combined.shape

(10876, 6)

#### remove urls

In [52]:
def remove_url(text: str) -> str:
    '''
    Delete web links from a string.

    input:
    text: string that may contain links

    output:
    return the string with every link deleted; nothing is inserted in its
    place, so whitespace around a removed link is left untouched
    '''

    # a link is "http://", "https://" or "www." plus the run of
    # non-whitespace characters that follows it
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [53]:
# start the text_transformed column from the raw text with links stripped
df_combined['text_transformed'] = df_combined['text'].apply(remove_url)

#### remove html tags

In [54]:
def remove_html(text: str) -> str:
    '''
    Delete html tags from a string.

    input:
    text: string that may contain html tags

    output:
    return the string with every "<...>" span deleted
    '''

    # non-greedy: each match runs from a "<" to the nearest following ">"
    # on the same line ("." does not match a newline)
    return re.sub(r'<.*?>', r'', text)


In [55]:
# strip html tags from the text produced by the previous step
df_combined['text_transformed'] = df_combined['text_transformed'].apply(remove_html)

#### remove emojis

In [56]:
def remove_emojis(text: str) -> str:
    '''
    input:
    text: text where emojis are to be removed

    output:
    return text where emojis are removed

    note:
    the pattern used to contain one catch-all range, U+24C2 to U+1F251. That
    range also covers CJK ideographs, kana, Hangul and other ordinary letters,
    so non-Latin text was deleted together with the emojis. It is replaced by
    the individual symbol blocks below. Every code point removed here was
    also removed by the old pattern; letters are now left alone.
    '''

    emojis = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F000-\U0001F251"  # tiles/cards, enclosed alphanumerics (incl. flag letters), enclosed ideographs
                           "\U000024C2"             # circled M
                           "\U000025A0-\U000027BF"  # geometric shapes, miscellaneous symbols, dingbats
                           "\U00002934-\U00002935"  # curved arrows
                           "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, squares)
                           "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled congratulation / secret
                           "\U0000FE0F"             # variation selector-16 left behind by emoji such as U+2764 U+FE0F
                           "]+", flags=re.UNICODE)

    return emojis.sub(r'', text)

In [58]:
# strip emojis from the text produced by the previous step
df_combined['text_transformed'] = df_combined['text_transformed'].apply(remove_emojis)

#### remove punctuation

In [61]:
def remove_punct(text: str) -> str:
    '''
    Delete ASCII punctuation from a string.

    input:
    text: string that may contain punctuation

    output:
    return the string without any character listed in string.punctuation;
    all other characters (including non-ASCII punctuation) are kept in order
    '''

    kept_chars = [char for char in text if char not in string.punctuation]

    return ''.join(kept_chars)


In [62]:
# strip punctuation from the text produced by the previous step
df_combined['text_transformed'] = df_combined['text_transformed'].apply(remove_punct)

#### spelling correction 

In [68]:
spell = SpellChecker()

def correct_spellings(text: str) -> str:
    '''
    input:
    text: text whose misspelled words are to be corrected

    output:
    return the whitespace-separated words of text joined by single spaces,
    where each word reported by spell.unknown() is replaced by
    spell.correction(word); a word is kept unchanged when the spell checker
    offers no correction for it
    '''
    words = text.split()

    # Correct each distinct unknown word once; correction() is the expensive call.
    # `or word`: correction() can return None when it has no candidate (see the
    # pyspellchecker docs — version dependent), and ' '.join() would raise
    # TypeError on a None entry, so fall back to the original word.
    corrections = {
        word: spell.correction(word) or word
        for word in spell.unknown(words)
    }

    return ' '.join(corrections.get(word, word) for word in words)

In [79]:
# spell-correct the text produced by the previous step
df_combined['text_transformed'] = df_combined['text_transformed'].apply(correct_spellings)

In [81]:
# write the cleaned frame back to the database; an existing
# transformed_combination table is replaced and the frame index is not stored
df_combined.to_sql(
    name='transformed_combination',
    con=dbConnection,
    if_exists='replace',
    index=False,
)