In [1]:
import pandas as pd
import re


In [2]:
pd.set_option('max_colwidth', 400)

In [3]:
df = pd.read_csv('./data/CometLanding.csv')

In [4]:
''' Removes duplicates along with other basic data cleaning '''
def basicDataCleaning(df):
    df = df.drop_duplicates(subset=['status_url'], keep='first')
    df = df.replace({r'[^\x00-\x7F]+':''}, regex=True)
    #df = df.dropna()
    return df

In [5]:
df = basicDataCleaning(df)

In [6]:
''' Removes usernames that are not valid from the data set (according to Twitter specifications) '''
''' Usernames that are longer than 15 Characters are not valid '''
''' Usernames that are not alphanumeric are not valid with the exception of underscores '''
''' Usernames containing the words Twitter or Admin cannot be claimed. '''

def validateUserName(df):
    
    df = df.drop(df[df['from_user'].str.len() > 15].index)  
    df = df.drop(df[df['from_user'].apply(lambda x: re.search(r'[a-zA-Z0-9_]', x)) == None].index)
    df = df.drop(df[df['from_user'].apply(lambda x: re.search('Twitter', x, re.IGNORECASE) != None)].index)
    df = df.drop(df[df['from_user'].apply(lambda x: re.search('Admin', x, re.IGNORECASE) != None)].index)                 
    return df

In [7]:
df = validateUserName(df)

In [8]:
''' Refines language data '''
''' Turns data such as en-gb to en only to achieve consistency'''
''' Turns all data lower-case to achieve consistency'''

def refineLanguageData(df):
    df['user_lang'] = df['user_lang'].str.lower()
    df['user_lang'] = df["user_lang"].replace({'en-gb':'en'}, regex=True)
    return df

In [9]:
df = refineLanguageData(df)

In [10]:
'''Removes the data that do not follow twitter specifications for tweet length'''
'''A tweet could contain maximum 140 characters before November 8th 2017'''
'''Any data that do not match this specification is questionable'''
def validateTweetLength(df):
    
    df = df.drop(df[df['text'].str.len() > 140].index)
    return df

In [11]:
df = validateTweetLength(df)

In [12]:
df

Unnamed: 0,id_str,from_user,text,created_at,time,geo_coordinates,user_lang,in_reply_to_user_id_str,in_reply_to_screen_name,from_user_id_str,in_reply_to_status_id_str,source,profile_image_url,user_followers_count,user_friends_count,status_url,entities_str
2,5.409300e+17,MHuuskoL,RT @EUCouncil: After the #CometLanding - Astronaut @astro_luca discussed space policy with ministers today: http://t.co/ZjKAgpXhkt http://t,Fri Dec 05 18:05:36 +0000 2014,05/12/2014 18:05,,en,,,5.744348e+08,,"<a href=""http://twitter.com/#!/download/ipad"" rel=""nofollow"">Twitter for iPad</a>",http://pbs.twimg.com/profile_images/497351058826756096/UMRFSFMf_normal.jpeg,215.0,310.0,http://twitter.com/MHuuskoL/statuses/540929998388211713,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[25,38]}],""symbols"":[],""user_mentions"":[{""screen_name"":""EUCouncil"",""name"":""EU Council"",""id"":206717989,""id_str"":""206717989"",""indices"":[3,13]},{""screen_name"":""astro_luca"",""name"":""Luca Parmitano"",""id"":290876018,""id_str"":""290876018"",""indices"":[51,62]}],""urls"":[{""url"":""http://t.co/ZjKAgpXhkt"",""expanded_url"":""http://ow.ly/FqZCE"",""display_url"":""ow.ly/FqZC..."
3,5.409293e+17,SaraGomezAranci,RT @EUCouncil: After the #CometLanding - Astronaut @astro_luca discussed space policy with ministers today: http://t.co/ZjKAgpXhkt http://t,Fri Dec 05 18:03:00 +0000 2014,05/12/2014 18:03,,fr,,,1.057356e+09,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",http://pbs.twimg.com/profile_images/539170580478828544/d0diSAVk_normal.jpeg,741.0,310.0,http://twitter.com/SaraGomezAranci/statuses/540929346878599168,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[25,38]}],""symbols"":[],""user_mentions"":[{""screen_name"":""EUCouncil"",""name"":""EU Council"",""id"":206717989,""id_str"":""206717989"",""indices"":[3,13]},{""screen_name"":""astro_luca"",""name"":""Luca Parmitano"",""id"":290876018,""id_str"":""290876018"",""indices"":[51,62]}],""urls"":[{""url"":""http://t.co/ZjKAgpXhkt"",""expanded_url"":""http://ow.ly/FqZCE"",""display_url"":""ow.ly/FqZC..."
4,5.409292e+17,CBCDay6,RT @shaunmajumder: Feels good to be the @CBCDay6 champion beating #MiniMansbridge @davidcommon and @SusanKent #CometLanding #LetItSnow #MtR,Fri Dec 05 18:02:32 +0000 2014,05/12/2014 18:02,,en,,,1.772769e+08,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",http://pbs.twimg.com/profile_images/459069430371086336/VvLJFxdc_normal.jpeg,4331.0,2098.0,http://twitter.com/CBCDay6/statuses/540929229052203008,"{""hashtags"":[{""text"":""MiniMansbridge"",""indices"":[66,81]},{""text"":""CometLanding"",""indices"":[110,123]},{""text"":""LetItSnow"",""indices"":[124,134]},{""text"":""MtRushmore"",""indices"":[135,140]}],""symbols"":[],""user_mentions"":[{""screen_name"":""shaunmajumder"",""name"":""Shaun Majumder"",""id"":17680518,""id_str"":""17680518"",""indices"":[3,17]},{""screen_name"":""CBCDay6"",""name"":""CBC Day 6"",""id"":177276897,""id_str"":""17727..."
5,5.409287e+17,MBernacconi,RT @EUCouncil: After the #CometLanding - Astronaut @astro_luca discussed space policy with ministers today: http://t.co/ZjKAgpXhkt http://t,Fri Dec 05 18:00:28 +0000 2014,05/12/2014 18:00,,en,,,4.790749e+07,,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",http://pbs.twimg.com/profile_images/533194305905491968/TDcZsosT_normal.jpeg,317.0,478.0,http://twitter.com/MBernacconi/statuses/540928707947671555,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[25,38]}],""symbols"":[],""user_mentions"":[{""screen_name"":""EUCouncil"",""name"":""EU Council"",""id"":206717989,""id_str"":""206717989"",""indices"":[3,13]},{""screen_name"":""astro_luca"",""name"":""Luca Parmitano"",""id"":290876018,""id_str"":""290876018"",""indices"":[51,62]}],""urls"":[{""url"":""http://t.co/ZjKAgpXhkt"",""expanded_url"":""http://ow.ly/FqZCE"",""display_url"":""ow.ly/FqZC..."
6,5.409285e+17,LukeGolds,USA NEWS HOT NEWS 828 comet landing Philae touches down on the surface of a comet #cometlanding heyyouapp T... http://t.co/C4XSjBq7G2,Fri Dec 05 17:59:42 +0000 2014,05/12/2014 17:59,,en,,,6.196297e+08,,"<a href=""http://ifttt.com"" rel=""nofollow"">IFTTT</a>",http://pbs.twimg.com/profile_images/2344739619/zkh6mdlcodzt9j64znty_normal.jpeg,350.0,,http://twitter.com/LukeGolds/statuses/540928513906589696,"{""hashtags"":[{""text"":""cometlanding"",""indices"":[87,100]}],""symbols"":[],""user_mentions"":[],""urls"":[{""url"":""http://t.co/C4XSjBq7G2"",""expanded_url"":""http://dlvr.it/7knHzd"",""display_url"":""dlvr.it/7knHzd"",""indices"":[116,138]}]}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77314,5.324601e+17,ABForScience,This means that the actual landing will be around 3am aus Eastern daylight savings time #CometLanding,Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,2.333123e+09,,"<a href=""http://www.tweetcaster.com"" rel=""nofollow"">TweetCaster for Android</a>",http://pbs.twimg.com/profile_images/432069559336448000/ig-lYPQp_normal.png,155.0,212.0,http://twitter.com/ABForScience/statuses/532460149084930049,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[89,102]}],""symbols"":[],""user_mentions"":[],""urls"":[]}"
77315,5.324601e+17,atieyK,RT @ObservingSpace: Weve been waiting 10 years.. we see it in the telemetry. Separation went flawlessly Andrea Accomazzo #cometlanding h,Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,5.830372e+07,,"<a href=""https://twitter.com/download/android"" rel=""nofollow"">Twitter for Android Tablets</a>",http://pbs.twimg.com/profile_images/505731163713843200/yyQvaSgE_normal.jpeg,354.0,1088.0,http://twitter.com/atieyK/statuses/532460148451581952,"{""hashtags"":[{""text"":""cometlanding"",""indices"":[124,137]}],""symbols"":[],""user_mentions"":[{""screen_name"":""ObservingSpace"",""name"":""Observing Space"",""id"":1041442471,""id_str"":""1041442471"",""indices"":[3,18]}],""urls"":[],""media"":[{""id"":532460012690354200,""id_str"":""532460012690354176"",""indices"":[139,140],""media_url"":""http://pbs.twimg.com/media/B2Otj3mCMAAiq3S.png"",""media_url_https"":""https://pbs.twimg.co..."
77316,5.324601e+17,j0nny5,"RT @maxplanckpress: Accomazzo (flight director): ""Separation worked out definitely - philae is gone towards the comet"" #CometLanding (fm)",Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,1.791486e+07,,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",http://pbs.twimg.com/profile_images/89991673/jonnnn_normal.jpg,92.0,63.0,http://twitter.com/j0nny5/statuses/532460148254470144,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[119,132]}],""symbols"":[],""user_mentions"":[{""screen_name"":""maxplanckpress"",""name"":""Max Planck Society"",""id"":205195655,""id_str"":""205195655"",""indices"":[3,18]}],""urls"":[]}"
77317,5.324601e+17,nsentse,7 hours of waiting #CometLanding,Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,2.043400e+07,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",http://pbs.twimg.com/profile_images/3693655850/1d56869aa53a8c796d68e5cc0308bca6_normal.jpeg,200.0,267.0,http://twitter.com/nsentse/statuses/532460148238090240,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[19,32]}],""symbols"":[],""user_mentions"":[],""urls"":[]}"


In [13]:
''' Creates new CSV file with the cleaned dataset to use in analysis ''' 
def createCleanedCSV(df):
    df.to_csv("./data/CleanedCometLanding.csv", index=False)

In [14]:
createCleanedCSV(df)

In [None]:
def main():
    #77319 rows × 17 columns original data
    df = pd.read_csv('CometLanding.csv')
    df = basicDataCleaning(df)
    

if __name__ == "__main__":
    main()