In [1]:
import pandas as pd
import re
import collections
import csv
#Sentiment Analysis

In [2]:
#Different Types Of Tweets
#Source:https://help.twitter.com/en/using-twitter/types-of-tweets
#1-Mentions
#2-Replies
#3-General Tweets
#4-Retweets


In [3]:
#Reads in the refined data to perform data analysis
pd.set_option('max_colwidth', 400)
df = pd.read_csv('./data/CleanedCometLanding.csv')

In [4]:
''' This function returns the number of mention tweets'''
def getNumberOfMentionTweets(df): 
    counter = 0;
    textColumn = df['text']
    
    for (columnName, columnData) in textColumn.iteritems():

        match = re.search('RT @' , columnData)
        
        if match == None:
            match = re.search('@' , columnData)
            if match != None:
                counter = counter + 1
                    
    return counter

In [5]:
#https://www.geeksforgeeks.org/loop-or-iterate-over-all-or-certain-columns-of-a-dataframe-in-python-pandas/
''' This function returns the number of retweets'''
def getNumberOfRetweets(df):
    counter = 0;
    textColumn = df['text']
    
    for (columnName, columnData) in textColumn.iteritems():

        match = re.search('RT @' , columnData)
        
        if match != None:
            counter = counter + 1
            
    return counter    
        

In [20]:
''' This function returns the number of reply tweets '''

#Source:https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet
def getNumberOfReplies(df):
     
    replies = df["in_reply_to_user_id_str"].notnull().sum()  
    
    return replies
    

In [7]:
''' This function returns the number of general tweets'''
def getNumberOfGeneralTweets(df):
    mentions = getNumberOfMentionTweets(df)
    retweets = getNumberOfRetweets(df)
    replies = getNumberOfReplies(df)
    
    NumberOfGeneralTweets = len(df) - retweets - mentions - replies 
    return  NumberOfGeneralTweets

In [8]:
''' This function returns total number of tweets'''
def getTotalNumberOfTweets(df):
    return len(df)

In [9]:
''' This function returns the number of different users in the dataset'''
def getNumberOfDifferentUsers(df):
    differentUsers = df['from_user'].unique()
    return len(differentUsers)

In [10]:
''' This function analyses the basic user behaviour''' 
''' The function prints the average number of tweets, replies, mentions, and retweets sent by a user '''
def basicUserInteractionAnalysis(df):
    differentUsers = getNumberOfDifferentUsers(df)
    averageGeneralTweetsPerUser = getNumberOfGeneralTweets(df) / differentUsers
    averageMentionsPerUser = getNumberOfMentionTweets(df) / differentUsers
    averageRetweetsPerUser = getNumberOfRetweets(df) / differentUsers
    averageRepliesPerUser = getNumberOfReplies(df) / differentUsers
    
    print("The average number of general tweets per user is:", averageGeneralTweetsPerUser)
    print("The average number of mentions tweets per user is:", averageMentionsPerUser)
    print("The average number of retweets per user is:", averageRetweetsPerUser)
    print("The average number of replies per user is:", averageRepliesPerUser)
    

In [11]:
''' This function prints the 5 most populat hashtag in the dataset''' 
''' This function also saves all hashtags to another file for visualisation''' 
def getMostPopularHashtags(df):
    hashtags = [] 
    textColumn = df['text']
    mentionPattern = '@([a-zA-Z]+)'
    
    for (columnName, columnData) in textColumn.iteritems():  
        matchList = re.findall(mentionPattern , columnData)
        hashtags.extend(matchList)
    
    counter = collections.Counter(hashtags)
    
    newDataFrame = pd.DataFrame(hashtags)
    newDataFrame.to_csv('./data/HashtagsCI.csv',index = False)        
    
    return counter.most_common(5)

In [12]:
''' This function prints the 5 most popular hashtag in the dataset'''
''' On top of the function above, it ensures the hashtags contain different contextual data'''
def getMostPopularCaseInsensitiveHashtags(df):
    hashtags = [] 
    textColumn = df['text']
    mentionPattern = '@([a-zA-Z]+)'
    
    for (columnName, columnData) in textColumn.iteritems():  
        matchList = re.findall(mentionPattern , columnData)
        hashtags.extend(matchList)
    
        
    counter = collections.Counter(map(str.lower,hashtags))
    
    newDataFrame = pd.DataFrame(hashtags)
    newDataFrame.to_csv('./data/Hashtags.csv',index = False)    
    
    return counter.most_common(5)
            

In [13]:
''' This function '''
# Each time data is in user's local time, adjusting time data 
# to reference data from a single point of source(such as GMT) is not
# possible for this practical since timezones were not provided in the given CSV file

def getTweetDataAboutTime(df):
    hoursList = []
    days = []
    dates = []
    createdAtColumn = df['created_at']
    timeColumn = df['time']
    
    patternForHours = r'([01]\d|2[0-3]):([0-5]\d):([0-5]\d)'
    patternForDays = r'^Mon|Tue|Wed|Thu|Fri|Sat|Sun$'
    patternForDates = r'(3[01]|[12][0-9]|0[1-9])/(1[0-2]|0[1-9])/([0-9]{4})'
    #dd/mm/yyyy/[0-9]{4}$'
    
    for (columnName, columnData) in createdAtColumn.iteritems():  
        hoursList_tuple = re.findall(patternForHours , columnData )
        daysList  = re.findall(patternForDays, columnData )
        days.extend(daysList)
        hoursList.extend(hoursList_tuple)
    
    
    hours = [x[0] for x in hoursList]
      
    for (columnName, columnData) in timeColumn.iteritems():  
        dateList = re.findall(patternForDates , columnData)
        dates.extend(dateList)
       
        
    counterHours = collections.Counter(hours)
    counterDays = collections.Counter(days)
    counterDates = collections.Counter(dates)
    
    newDataFrame = pd.DataFrame(counterHours.most_common(5))
    newDataFrame.to_csv('./data/Hours.csv',index = False, header=False) 
    
    newDataFrame = pd.DataFrame(counterDays.most_common(5))
    newDataFrame.to_csv('./data/Days.csv',index = False , header=False)
    
    newDataFrame = pd.DataFrame(counterDates.most_common(5))
    newDataFrame.to_csv('./data/Date.csv',index = False , header=False)
    

In [14]:
#''' Replacement for switch case statement is to use a dictionary mapping'''
#def applicationParser(argument):
#    switcher = {
#        'Twitter Web Client' : 'Mobile Browser',
#        'Twitter for Websites': 'Browser',
#        'Twitter for iPad' : 'iPad App',
#        'Twitter for iPhone' : 'iPhone App' ,
#        'Twitter for Android': 'Android App',
#        'Twitter for BlackBerryÂ®': 'BlackBerry App'
#    }

#    return switcher.get(argument, "nothing")

In [15]:
''' This function returns the most popular 5 app used to send the tweets'''
''' This function also returns a seperate file with all the apps used for visualisation'''

def getMostPopularApplicationsUsed(df):
    applications = [] 
    whichApplicationColumn = df['source']
    
    patternForApplications = r'(?<=>).*(?=<)'
    
    for (columnName, columnData) in whichApplicationColumn.iteritems():

        matchList = re.findall(patternForApplications, str(columnData))
        applications.extend(matchList)
    
    counterApplications = collections.Counter(applications)
    
    newDataFrame = pd.DataFrame(counterApplications.most_common(5))
    newDataFrame.to_csv('./data/applications.csv',index = False, header=False) 

    
    return counterApplications.most_common(5)
    
    

In [16]:
def performDataAnalysis(df):
    print ('The number of mention tweets is' , getNumberOfMentionTweets(df))
    print ('The number of retweets is' , getNumberOfRetweets(df))
    print ('The number of reply tweets is' , getNumberOfReplies(df))
    print ('The number of general tweets is' , getNumberOfGeneralTweets(df))
    print ('The number of total number of tweets is' , getTotalNumberOfTweets(df))
    print ('The number of different users is' , getNumberOfDifferentUsers(df))
    basicUserInteractionAnalysis(df)
    print('5 most popular hashtags with their respective occurances are',getMostPopularHashtags(df))
    print('5 contextually different most popular hashtags with their respective occurances are' 
          ,getMostPopularCaseInsensitiveHashtags(df))
    getTweetDataAboutTime(df)
    print('5 most popular apps to send tweets are', getMostPopularApplicationsUsed(df))

In [17]:
performDataAnalysis(df)

The number of mention tweets is 6239
The number of retweets is 58284
The number of reply tweets is 1686
The number of general tweets is 8858
The number of total number of tweets is 75067
The number of different users is 49249
The average number of general tweets per user is: 0.17986152003086356
The average number of mentions tweets per user is: 0.12668277528477737
The average number of retweets per user is: 1.183455501634551
The average number of replies per user is: 0.03423419764868322
5 most popular hashtags with their respective occurances are [('Philae', 30553), ('ESA', 22531), ('esa', 4386), ('philae', 3258), ('esaoperations', 3116)]
5 contextually different most popular hashtags with their respective occurances are [('philae', 33811), ('esa', 26938), ('esaoperations', 3122), ('nasa', 2477), ('bbcbreaking', 2177)]
5 most popular apps to send tweets are [('Twitter Web Client', 27228), ('Twitter for iPhone', 13253), ('Twitter for Android', 12341), ('TweetDeck', 3958), ('Twitter for 

In [18]:
df

Unnamed: 0,id_str,from_user,text,created_at,time,geo_coordinates,user_lang,in_reply_to_user_id_str,in_reply_to_screen_name,from_user_id_str,in_reply_to_status_id_str,source,profile_image_url,user_followers_count,user_friends_count,status_url,entities_str
0,5.409300e+17,MHuuskoL,RT @EUCouncil: After the #CometLanding - Astronaut @astro_luca discussed space policy with ministers today: http://t.co/ZjKAgpXhkt http://t,Fri Dec 05 18:05:36 +0000 2014,05/12/2014 18:05,,en,,,5.744348e+08,,"<a href=""http://twitter.com/#!/download/ipad"" rel=""nofollow"">Twitter for iPad</a>",http://pbs.twimg.com/profile_images/497351058826756096/UMRFSFMf_normal.jpeg,215.0,310.0,http://twitter.com/MHuuskoL/statuses/540929998388211713,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[25,38]}],""symbols"":[],""user_mentions"":[{""screen_name"":""EUCouncil"",""name"":""EU Council"",""id"":206717989,""id_str"":""206717989"",""indices"":[3,13]},{""screen_name"":""astro_luca"",""name"":""Luca Parmitano"",""id"":290876018,""id_str"":""290876018"",""indices"":[51,62]}],""urls"":[{""url"":""http://t.co/ZjKAgpXhkt"",""expanded_url"":""http://ow.ly/FqZCE"",""display_url"":""ow.ly/FqZC..."
1,5.409293e+17,SaraGomezAranci,RT @EUCouncil: After the #CometLanding - Astronaut @astro_luca discussed space policy with ministers today: http://t.co/ZjKAgpXhkt http://t,Fri Dec 05 18:03:00 +0000 2014,05/12/2014 18:03,,fr,,,1.057356e+09,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",http://pbs.twimg.com/profile_images/539170580478828544/d0diSAVk_normal.jpeg,741.0,310.0,http://twitter.com/SaraGomezAranci/statuses/540929346878599168,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[25,38]}],""symbols"":[],""user_mentions"":[{""screen_name"":""EUCouncil"",""name"":""EU Council"",""id"":206717989,""id_str"":""206717989"",""indices"":[3,13]},{""screen_name"":""astro_luca"",""name"":""Luca Parmitano"",""id"":290876018,""id_str"":""290876018"",""indices"":[51,62]}],""urls"":[{""url"":""http://t.co/ZjKAgpXhkt"",""expanded_url"":""http://ow.ly/FqZCE"",""display_url"":""ow.ly/FqZC..."
2,5.409292e+17,CBCDay6,RT @shaunmajumder: Feels good to be the @CBCDay6 champion beating #MiniMansbridge @davidcommon and @SusanKent #CometLanding #LetItSnow #MtR,Fri Dec 05 18:02:32 +0000 2014,05/12/2014 18:02,,en,,,1.772769e+08,,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",http://pbs.twimg.com/profile_images/459069430371086336/VvLJFxdc_normal.jpeg,4331.0,2098.0,http://twitter.com/CBCDay6/statuses/540929229052203008,"{""hashtags"":[{""text"":""MiniMansbridge"",""indices"":[66,81]},{""text"":""CometLanding"",""indices"":[110,123]},{""text"":""LetItSnow"",""indices"":[124,134]},{""text"":""MtRushmore"",""indices"":[135,140]}],""symbols"":[],""user_mentions"":[{""screen_name"":""shaunmajumder"",""name"":""Shaun Majumder"",""id"":17680518,""id_str"":""17680518"",""indices"":[3,17]},{""screen_name"":""CBCDay6"",""name"":""CBC Day 6"",""id"":177276897,""id_str"":""17727..."
3,5.409287e+17,MBernacconi,RT @EUCouncil: After the #CometLanding - Astronaut @astro_luca discussed space policy with ministers today: http://t.co/ZjKAgpXhkt http://t,Fri Dec 05 18:00:28 +0000 2014,05/12/2014 18:00,,en,,,4.790749e+07,,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",http://pbs.twimg.com/profile_images/533194305905491968/TDcZsosT_normal.jpeg,317.0,478.0,http://twitter.com/MBernacconi/statuses/540928707947671555,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[25,38]}],""symbols"":[],""user_mentions"":[{""screen_name"":""EUCouncil"",""name"":""EU Council"",""id"":206717989,""id_str"":""206717989"",""indices"":[3,13]},{""screen_name"":""astro_luca"",""name"":""Luca Parmitano"",""id"":290876018,""id_str"":""290876018"",""indices"":[51,62]}],""urls"":[{""url"":""http://t.co/ZjKAgpXhkt"",""expanded_url"":""http://ow.ly/FqZCE"",""display_url"":""ow.ly/FqZC..."
4,5.409285e+17,LukeGolds,USA NEWS HOT NEWS 828 comet landing Philae touches down on the surface of a comet #cometlanding heyyouapp T... http://t.co/C4XSjBq7G2,Fri Dec 05 17:59:42 +0000 2014,05/12/2014 17:59,,en,,,6.196297e+08,,"<a href=""http://ifttt.com"" rel=""nofollow"">IFTTT</a>",http://pbs.twimg.com/profile_images/2344739619/zkh6mdlcodzt9j64znty_normal.jpeg,350.0,,http://twitter.com/LukeGolds/statuses/540928513906589696,"{""hashtags"":[{""text"":""cometlanding"",""indices"":[87,100]}],""symbols"":[],""user_mentions"":[],""urls"":[{""url"":""http://t.co/C4XSjBq7G2"",""expanded_url"":""http://dlvr.it/7knHzd"",""display_url"":""dlvr.it/7knHzd"",""indices"":[116,138]}]}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75062,5.324601e+17,ABForScience,This means that the actual landing will be around 3am aus Eastern daylight savings time #CometLanding,Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,2.333123e+09,,"<a href=""http://www.tweetcaster.com"" rel=""nofollow"">TweetCaster for Android</a>",http://pbs.twimg.com/profile_images/432069559336448000/ig-lYPQp_normal.png,155.0,212.0,http://twitter.com/ABForScience/statuses/532460149084930049,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[89,102]}],""symbols"":[],""user_mentions"":[],""urls"":[]}"
75063,5.324601e+17,atieyK,RT @ObservingSpace: Weve been waiting 10 years.. we see it in the telemetry. Separation went flawlessly Andrea Accomazzo #cometlanding h,Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,5.830372e+07,,"<a href=""https://twitter.com/download/android"" rel=""nofollow"">Twitter for Android Tablets</a>",http://pbs.twimg.com/profile_images/505731163713843200/yyQvaSgE_normal.jpeg,354.0,1088.0,http://twitter.com/atieyK/statuses/532460148451581952,"{""hashtags"":[{""text"":""cometlanding"",""indices"":[124,137]}],""symbols"":[],""user_mentions"":[{""screen_name"":""ObservingSpace"",""name"":""Observing Space"",""id"":1041442471,""id_str"":""1041442471"",""indices"":[3,18]}],""urls"":[],""media"":[{""id"":532460012690354200,""id_str"":""532460012690354176"",""indices"":[139,140],""media_url"":""http://pbs.twimg.com/media/B2Otj3mCMAAiq3S.png"",""media_url_https"":""https://pbs.twimg.co..."
75064,5.324601e+17,j0nny5,"RT @maxplanckpress: Accomazzo (flight director): ""Separation worked out definitely - philae is gone towards the comet"" #CometLanding (fm)",Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,1.791486e+07,,"<a href=""http://twitter.com"" rel=""nofollow"">Twitter Web Client</a>",http://pbs.twimg.com/profile_images/89991673/jonnnn_normal.jpg,92.0,63.0,http://twitter.com/j0nny5/statuses/532460148254470144,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[119,132]}],""symbols"":[],""user_mentions"":[{""screen_name"":""maxplanckpress"",""name"":""Max Planck Society"",""id"":205195655,""id_str"":""205195655"",""indices"":[3,18]}],""urls"":[]}"
75065,5.324601e+17,nsentse,7 hours of waiting #CometLanding,Wed Nov 12 09:09:26 +0000 2014,12/11/2014 09:09,,en,,,2.043400e+07,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",http://pbs.twimg.com/profile_images/3693655850/1d56869aa53a8c796d68e5cc0308bca6_normal.jpeg,200.0,267.0,http://twitter.com/nsentse/statuses/532460148238090240,"{""hashtags"":[{""text"":""CometLanding"",""indices"":[19,32]}],""symbols"":[],""user_mentions"":[],""urls"":[]}"
