In [136]:
# Generating dataframe for testing the cleaning functions
import re

import numpy as np
import pandas as pd
# One real message and one missing value, for exercising null handling.
null_message = ['a', None]
df = pd.DataFrame(null_message)

# Tab, newline, and the two-character sequence backslash + "s".  The backslash
# is doubled because "\s" is not a recognised string escape and produces an
# invalid-escape warning; the resulting value is unchanged.
escape_chars_message = ['\t', '\n', '\\s']

# A text emoticon, an emoji character, and a hex-escaped sequence.
# NOTE(review): under Python 3 the last literal is four separate code points
# (U+00F0 U+009F U+0098 U+0081), not the single emoji U+1F601 whose UTF-8
# bytes those are -- confirm which was intended before relying on it.
emojis_message = [':)', '😀', '\xF0\x9F\x98\x81']

# Polish (with a non-ASCII letter) and German sample sentences.
non_english_message = ['W Szczebrzeszynie chrząszcz brzmi w trzcinie',
                       'Das ist nicht mein bier']

# Bare and scheme-prefixed URLs, with and without a trailing slash or path.
url_message = ['www.zombo.com/', 'www.zombo.com', 'https://www.google.co.uk/',
               'http://www.google.co.uk/',
               'http://pandas.pydata.org/pandas-docs/stable/dsintro.html']

In [137]:
def remove_empty_messages(df):
    """
    Drop every row whose 'message' value is null and renumber the index.

    :param df - a pandas dataframe holding a 'message' column alongside any
    metadata columns
    :return: a new dataframe containing only the rows with a non-null
    'message', indexed from 0; empty or whitespace-only strings are kept
    """
    is_missing = df['message'].isnull()
    kept_rows = df.loc[~is_missing]
    return kept_rows.reset_index(drop=True)

In [138]:
# Fixture for remove_empty_messages: 'message' mixes real text, an empty
# string, blank strings and nulls; 'time' holds mixed-type metadata.
null_message = ['a', None, '', ' ', ' ', None]
null_time = [None, 2, 3, 'a', 'b', None]
df = pd.DataFrame(dict(message=null_message, time=null_time))

In [189]:
# Show the frame currently bound to `df`.
# NOTE(review): this cell's execution count (189) is out of sequence with its
# neighbours, so the stored output may not reflect the fixture built above.
df

Unnamed: 0,message,time
0,Hi Alex,1
1,Hi Ben Sent from my iPhone bye,2
2,Hi Clara Begin forwarded message bye,3
3,Hi Dawn \n--\n bye,4
4,Hi Elco Forwarded message bye,5
5,Hi Frances Sent from my iPad bye,6
6,Hi Grzegorz Sent from my Windows Phone bye,7
7,Hi Heidi Sent from my Samsung bye,8


In [140]:
# Drop rows with a null 'message' and rebind `df` to the re-indexed result.
df = remove_empty_messages(df)

In [144]:
# The first row surviving the null filter should carry the message 'a'.
assert remove_empty_messages(df)['message'][0] == 'a'

In [142]:
# Dimensions of `df` as a (rows, columns) tuple.
df.shape

(4, 2)

In [143]:
# Show the frame currently bound to `df`.
df

Unnamed: 0,message,time
0,a,
1,,3
2,,a
3,,b


In [146]:
# Run the filter once and check that single result, rather than re-filtering
# for every assertion.
nonnull_df = remove_empty_messages(df)
assert nonnull_df.shape == (4, 2)
assert nonnull_df['message'][0] == 'a'
# None is a singleton, so test identity instead of using `== None`.
assert nonnull_df['time'][0] is None

In [171]:
def remove_signatures_and_after(df):
    """
    Truncate each message at the first signature / forwarded-message marker.

    Everything from a marker onwards is discarded. Text before the marker is
    kept as is, including any whitespace that preceded the marker. Values
    that are not strings (e.g. None / NaN) are left untouched.

    :param df - a pandas dataframe with a column containing a string named
    message and columns of metadata
    :return: the same df object; its 'message' column is overwritten in place
    with the truncated text
    """
    sep = ['\n--\n', 'Begin forwarded message', 'Forwarded message',
           '------', 'Sent from my iPhone', 'Sent from my iPad',
           'Sent from my Windows Phone', 'Sent from my Samsung']

    def _cut_at_markers(text):
        # Missing values are not strings; pass them through rather than
        # failing on .split().
        if not isinstance(text, str):
            return text
        # Markers are applied in list order, keeping only what precedes the
        # first occurrence of each one.
        for marker in sep:
            text = text.split(marker, 1)[0]
        return text

    # One pass over the column instead of one pass per marker.
    df['message'] = df['message'].apply(_cut_at_markers)

    return df
        

In [253]:
# Inputs: one message padded with surplus whitespace, followed by one message
# per signature / forwarding marker.
sig_msg = [
    '        Hi      Alex              ',
    'Hi Ben Sent from my iPhone bye',
    'Hi Clara Begin forwarded message bye',
    'Hi Dawn \n--\n bye',
    'Hi Elco Forwarded message bye',
    'Hi Frances Sent from my iPad bye',
    'Hi Grzegorz Sent from my Windows Phone bye',
    'Hi Heidi Sent from my Samsung bye',
]

# Target values for comparison (the name suggests these are the expected
# cleaned messages).
sig_msg_pass = [
    'Hi Alex', 'Hi Ben', 'Hi Clara', 'Hi Dawn',
    'Hi Elco', 'Hi Frances', 'Hi Grzegorz', 'Hi Heidi',
]

# One integer per message, 1 through 8.
sig_time = list(range(1, 9))

df = pd.DataFrame(dict(message=sig_msg, time=sig_time))
df_pass = pd.DataFrame(dict(message=sig_msg_pass, time=sig_time))

In [254]:
# Inspect the first message; a bare expression shows the string's repr, so
# leading and trailing whitespace is visible.
df['message'][0]

'        Hi      Alex              '

In [217]:
# Show the comparison frame bound to `df_pass`.
df_pass

Unnamed: 0,message,time
0,Hi Alex,1
1,Hi Ben,2
2,Hi Clara,3
3,Hi Dawn,4
4,Hi Elco,5
5,Hi Frances,6
6,Hi Grzegorz,7
7,Hi Heidi,8


In [218]:
# Called for its side effect: the truncated text is assigned back into
# df['message'].
remove_signatures_and_after(df)

In [222]:
# Show the frame currently bound to `df`.
# NOTE(review): the stored output still contains the signature text, so it
# does not look like the result of the call in the previous cell -- probably
# out-of-order execution; re-run from a fresh kernel to confirm.
df

Unnamed: 0,message,time
0,Hi Alex,1
1,Hi Ben Sent from my iPhone bye,2
2,Hi Clara Begin forwarded message bye,3
3,Hi Dawn \n--\n bye,4
4,Hi Elco Forwarded message bye,5
5,Hi Frances Sent from my iPad bye,6
6,Hi Grzegorz Sent from my Windows Phone bye,7
7,Hi Heidi Sent from my Samsung bye,8


In [255]:
def remove_excess_whitespace(df):
    """
    Collapse each run of whitespace in a message to a single space, then trim
    leading and trailing whitespace.

    :param df - a dataframe with a series named 'message' containing strings
    :return: df - the same dataframe object; its 'message' column is
    overwritten in place with the normalised strings
    """
    # regex=True is spelled out because newer pandas releases treat the
    # pattern as a literal string by default.  The pattern is a raw string so
    # that "\s" reaches the regex engine without an invalid-escape warning.
    # Case-insensitive and multi-line flags are omitted: "\s+" has no letters
    # and no anchors, so they cannot change what it matches.
    collapsed = df['message'].str.replace(r'\s+', ' ', regex=True)
    df['message'] = collapsed.str.strip()

    return df

In [256]:
# remove_excess_whitespace assigns into df['message'] and returns the frame it
# was given, so this rebinding leaves `df` naming the same object.
df = remove_excess_whitespace(df)

In [257]:
# Inspect the first message as a repr to check its surrounding whitespace.
df['message'][0]

'Hi Alex'

In [234]:
# Same inspection as the previous cell (first message, shown as a repr).
# NOTE(review): this duplicates the cell above and carries an older execution
# count; it can probably be deleted.
df['message'][0]

'Hi Alex'

In [270]:
def remove_urls(df):
    """
    Takes a dataframe with a Series named 'message' consisting of strings and
    returns the same dataframe but with the message stripped of urls.

    Each match is replaced by a single space, after which whitespace is
    normalised with remove_excess_whitespace. The 'message' column of the
    frame passed in is overwritten in place.

    :param df: dataframe with a 'message' column of strings
    :return df: the same dataframe object with cleaned messages
    """
    # All patterns are raw strings so the backslashes reach the regex engine
    # unchanged and no invalid-escape warning is raised.
    subs = [
        # Not a URL: "On", whitespace, three letters, whitespace, 1-3 digits,
        # then everything up to the end of the message.
        # NOTE(review): presumably meant to drop quoted-reply headers such as
        # "On Mon 12 ... wrote:".  Matching is case-insensitive and unanchored,
        # so ordinary text of the same shape is truncated too -- confirm.
        r'On\s[A-Z][a-z]{2}\s[0-9]{1,3}[\s\S]*',
        # URLs followed by whitespace (the whitespace is consumed as well).
        r'https?://\S*\s+',
        r'www\.\S*\s+',
        # URLs that run to the end of a line.
        r'https?://\S*$',
        r'www\.\S*$',
    ]
    for pattern in subs:
        # regex=True is required: newer pandas releases default to literal
        # matching and reject case=/flags= unless regex matching is enabled.
        df['message'] = df['message'].str.replace(pattern, ' ', case=False,
                                                  flags=re.MULTILINE,
                                                  regex=True)

    df = remove_excess_whitespace(df)

    return df

In [271]:

# Messages with URLs in the middle, at the end, and glued to the preceding
# word.
url_message = [
    'I like www.zombo.com/, really I do',
    'I like www.zombo.com',
    'what is your favourite?https://www.google.co.uk/',
    'I lovehttp://www.google.co.uk/',
    'does this work http://pandas.pydata.org/pandas-docs/stable/dsintro.html I hope so',
]

# Target values for comparison (the name suggests these are the expected
# messages once URLs are stripped).
url_message_pass = [
    'I like really I do',
    'I like',
    'what is your favourite?',
    'I love',
    'does this work I hope so',
]

# One integer per message, 1 through 5.
url_time = list(range(1, 6))

df = pd.DataFrame(dict(message=url_message, time=url_time))
df_pass = pd.DataFrame(dict(message=url_message_pass, time=url_time))



In [272]:
# Show the URL fixture frame currently bound to `df`.
df

Unnamed: 0,message,time
0,"I like www.zombo.com/, really I do",1
1,I like www.zombo.com,2
2,what is your favourite?https://www.google.co.uk/,3
3,I lovehttp://www.google.co.uk/,4
4,does this work http://pandas.pydata.org/pandas...,5


In [273]:
# remove_urls overwrites df['message'] and returns the frame it was given, so
# `df_test` and `df` refer to the same object after this call.
df_test = remove_urls(df)

In [274]:
# Show the cleaned frame returned by remove_urls.
df_test

Unnamed: 0,message,time
0,I like really I do,1
1,I like,2
2,what is your favourite?,3
3,I love,4
4,does this work I hope so,5


In [275]:
# Show the comparison frame bound to `df_pass`.
df_pass

Unnamed: 0,message,time
0,I like really I do,1
1,I like,2
2,what is your favourite?,3
3,I love,4
4,does this work I hope so,5


In [278]:
# assert_frame_equal raises with a description of the first difference,
# whereas `assert a.equals(b)` only reports that the frames differ somewhere.
pd.testing.assert_frame_equal(df_test, df_pass)