In [30]:
import GetOldTweets3 as got
import pandas as pd
import itertools
import datetime
import time

## Custom functions

In [98]:
def time_print(time_secs):
    d = datetime.datetime(1,1,1) + datetime.timedelta(seconds = time_secs)
    date_str = "{}{}{}{}".format(str(d.day-1)+" days, " if d.day>1 else "",
                                str(d.hour)+" hours, " if d.hour>0 else "",
                                str(d.minute)+" min, " if d.minute>0 else "",
                                str(d.second)+" secs")
    return date_str

def keywords_prep_sent(activity_words,product_words,n):
    with open('sentiment_words.txt', 'w') as f:
        for item in activity_words:
            f.write("%s\n" % item)
        for item in product_words:
            f.write("%s\n" % item)
    '''
    # Only used with the product frequency part
    with open('activity_words.txt', 'w') as f:
        for item in activity_words:
            f.write("%s\n" % item)

    with open('product_words.txt', 'w') as f:
        for item in product_words:
            f.write("%s\n" % item)
    '''     
    # Behavior words are all permutations of activity and product words
    bwords = list(map(" ".join, itertools.product(activity_words,product_words)))

    print("Number of activity words: {}".format(len(activity_words)))
    print("Number of product words: {}".format(len(product_words)))
    print("Number of permutations: {}".format(len(bwords)))
    print("Number of rows to be pulled: {}".format(len(bwords)*n) 
          if n>0
          else "All possible tweets for each keyword will be pulled")
    return bwords

def keywords_prep_beha(behaviour_list,n):
    with open('behaviour_words.txt', 'w') as f:
        for item in behaviour_list:
            for i in item:
                f.write("%s\n" % i)

    bwords = []
    for item in behaviour_list:
        activity = item.pop(0)
        for i in item:
            bwords.append(activity + " " + i)

    print("Number of permutations: {}".format(len(bwords)))
    print("Number of rows to be pulled: {}".format(len(bwords)*n) 
          if n>0
          else "All possible tweets for each keyword will be pulled")
    return bwords

## GetOldTweets3 enabler function

In [32]:
def get_tweets(query, top_only, start_date, end_date, max_tweets):
   
    # specifying tweet search criteria
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                                .setTopTweets(top_only)\
                                                .setSince(start_date)\
                                                .setUntil(end_date)\
                                                .setMaxTweets(max_tweets)
    
    # scraping tweets based on criteria
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    
    # creating list of tweets with the tweet attributes specified in the list comprehension
    text_tweets = [[tw.text,
                    tw.date,
                    tw.retweets,
                    tw.favorites,
                    tw.mentions,
                    tw.hashtags] for tw in tweet]
    
    # creating dataframe, assigning column names to list of tweets corresponding to tweet attributes
    tweets_df = pd.DataFrame(text_tweets, 
                            columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    
    return tweets_df

## Scraper function for custom words

In [74]:
def custom_scraper(behaviour_words,fixedk,limit_keyword):
    tweets_temp = pd.DataFrame(columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    total_loop_time = 0
    avg_loop_time = 0
    len_behaviour_words = len(behaviour_words)
    for i,word in enumerate(behaviour_words):
        start = time.time()

        custom_query = " ".join(fixedk)+" "+word
        tweet_out = get_tweets(custom_query,
                                top_only = False,
                                start_date = "2020-04-01",
                                end_date = "2020-08-01",
                                max_tweets = limit_keyword)
        tweets_temp = tweets_temp.append(tweet_out, ignore_index=True)

        end = time.time()
        total_loop_time += end-start
        avg_loop_time = total_loop_time/(i+1)
        time_left = (avg_loop_time) * (len_behaviour_words-i)

        out1 = "{} in {}: Query \"{}\" ({:.2f} secs) (Rows: {})".format(i,len_behaviour_words,custom_query,
                                                                        (end-start),len(tweet_out))
        out2 = "ETA: {}".format(time_print(time_left))
        print("{:<80s}{:<35s}".format(out1,out2))

    print("\nTotal query time: {}".format(time_print(total_loop_time)))
    return tweets_temp

##  Basic scrape code (not in use)

In [None]:
# Between a specific range of dates, with end date not inclusive
# And specifying that we want a max number of tweets in max_tweets.

# The main query word goes here, case insensitive
query = "lockdown"
tweets_df = get_tweets(query,
                        top_only = False,
                        start_date = "2020-04-01",
                        end_date = "2020-08-01",
                        max_tweets = 100000)
tweets_df.head(5)

## Pulling for Part 1 - Sentiment analysis

In [102]:
activity_words = ["bought","purchased","thinking of","looking for",
"thinking about","want to buy","recommend","suggest","think of","ordered"]

product_words = ["iphone","samsung galaxy","nokia","redmi","xiaomi",
                "oppo","realme","vivo","oneplus","huawei","honor","motorola"]

# Tweets for each behaviour words, <1 means all possible tweets
tweets_per_keyword = 0

sentiment_words = keywords_prep_sent(activity_words,product_words,tweets_per_keyword)
sentiment_words[:5]

Number of activity words: 10
Number of product words: 12
Number of permutations: 120
All possible tweets for each keyword will be pulled


['bought iphone',
 'bought samsung galaxy',
 'bought nokia',
 'bought redmi',
 'bought xiaomi']

In [107]:
fixed_key = ["lockdown"]
tweets_sentiment = custom_scraper(sentiment_words,fixed_key,tweets_per_keyword)

0 in 120: Query "lockdown bought iphone" (16.75 secs) (Rows: 125)               ETA: 33 min, 30 secs               
1 in 120: Query "lockdown bought samsung galaxy" (3.39 secs) (Rows: 8)          ETA: 19 min, 58 secs               
2 in 120: Query "lockdown bought nokia" (4.60 secs) (Rows: 17)                  ETA: 16 min, 12 secs               
3 in 120: Query "lockdown bought redmi" (5.02 secs) (Rows: 24)                  ETA: 14 min, 30 secs               
4 in 120: Query "lockdown bought xiaomi" (3.17 secs) (Rows: 19)                 ETA: 12 min, 43 secs               
5 in 120: Query "lockdown bought oppo" (3.24 secs) (Rows: 6)                    ETA: 11 min, 33 secs               
6 in 120: Query "lockdown bought realme" (6.38 secs) (Rows: 32)                 ETA: 11 min, 32 secs               
7 in 120: Query "lockdown bought vivo" (2.85 secs) (Rows: 8)                    ETA: 10 min, 41 secs               
8 in 120: Query "lockdown bought oneplus" (6.20 secs) (Rows: 40)        

71 in 120: Query "lockdown want to buy motorola" (1.23 secs) (Rows: 0)          ETA: 2 min, 38 secs                
72 in 120: Query "lockdown recommend iphone" (2.87 secs) (Rows: 6)              ETA: 2 min, 34 secs                
73 in 120: Query "lockdown recommend samsung galaxy" (1.03 secs) (Rows: 0)      ETA: 2 min, 30 secs                
74 in 120: Query "lockdown recommend nokia" (2.76 secs) (Rows: 3)               ETA: 2 min, 26 secs                
75 in 120: Query "lockdown recommend redmi" (1.02 secs) (Rows: 0)               ETA: 2 min, 22 secs                
76 in 120: Query "lockdown recommend xiaomi" (1.43 secs) (Rows: 0)              ETA: 2 min, 18 secs                
77 in 120: Query "lockdown recommend oppo" (3.07 secs) (Rows: 4)                ETA: 2 min, 14 secs                
78 in 120: Query "lockdown recommend realme" (2.87 secs) (Rows: 2)              ETA: 2 min, 11 secs                
79 in 120: Query "lockdown recommend vivo" (1.42 secs) (Rows: 0)        

In [108]:
#Sorting by date
tweets_sentiment.sort_values(by="Date", inplace=True, ascending=True)
print("Number of rows: {}".format(tweets_sentiment.shape[0]))

tweets_sentiment.drop_duplicates(subset=["Text"], inplace=True)
tweets_sentiment.reset_index(drop=True, inplace=True)
print("Number of rows after deleted duplicates: {}".format(tweets_sentiment.shape[0]))

tweets_sentiment.to_csv("tweets_sentiment.csv")
tweets_sentiment.head(5)

Number of rows: 1144
Number of rows after deleted duplicates: 1002


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,"@XiaomiIndia @Xiaomi we hve ordered Mi 40"" TV ...",2020-04-01 03:59:59+00:00,0,0,@XiaomiIndia @Xiaomi,
1,"@Xiaomi Mi A2 Packed on 2018, July Purchased O...",2020-04-01 06:27:55+00:00,0,0,@Xiaomi,
2,@flipkartsupport recently just before the coun...,2020-04-01 08:17:43+00:00,0,0,@flipkartsupport,
3,#covid19UK I find that a lot of teenagers seem...,2020-04-01 10:26:16+00:00,0,0,,#covid19UK
4,Lockdown making me spend so much money - Just ...,2020-04-01 13:50:51+00:00,0,2,,


## Pulling for Part 2 - Behaviour analysis

In [103]:
behaviour_list = []
behaviour_list.append(["watching","netflix","amazon prime","hotstar"])
behaviour_list.append(["playing","video game","mobile game","xbox","playstation"])
behaviour_list.append(["shopping","flipkart","amazon"])

# Tweets for each behaviour keyword, <1 means all possible tweets
tweets_per_keyword = 0

behaviour_words = keywords_prep_beha(behaviour_list,tweets_per_keyword)
behaviour_words[:5]

Number of permutations: 9
All possible tweets for each keyword will be pulled


['watching netflix',
 'watching amazon prime',
 'watching hotstar',
 'playing video game',
 'playing mobile game']

In [105]:
fixed_key = ["lockdown"]
tweets_behaviour = custom_scraper(behaviour_words,fixed_key,tweets_per_keyword)

0 in 9: Query "lockdown watching netflix" (394.23 secs) (Rows: 3521)            ETA: 59 min, 8 secs                
1 in 9: Query "lockdown watching amazon prime" (26.61 secs) (Rows: 213)         ETA: 28 min, 3 secs                
2 in 9: Query "lockdown watching hotstar" (13.72 secs) (Rows: 109)              ETA: 16 min, 53 secs               
3 in 9: Query "lockdown playing video game" (108.67 secs) (Rows: 883)           ETA: 13 min, 34 secs               
4 in 9: Query "lockdown playing mobile game" (4.51 secs) (Rows: 23)             ETA: 9 min, 7 secs                 
5 in 9: Query "lockdown playing xbox" (41.00 secs) (Rows: 341)                  ETA: 6 min, 32 secs                
6 in 9: Query "lockdown playing playstation" (21.44 secs) (Rows: 170)           ETA: 4 min, 21 secs                
7 in 9: Query "lockdown shopping flipkart" (16.23 secs) (Rows: 117)             ETA: 2 min, 36 secs                
8 in 9: Query "lockdown shopping amazon" (59.92 secs) (Rows: 502)       

In [106]:
#Sorting by date
tweets_behaviour.sort_values(by="Date", inplace=True, ascending=True)
print("Number of rows: {}".format(tweets_behaviour.shape[0]))

tweets_behaviour.drop_duplicates(subset=["Text"], inplace=True)
tweets_behaviour.reset_index(drop=True, inplace=True)
print("Number of rows after deleted duplicates: {}".format(tweets_behaviour.shape[0]))

tweets_behaviour.to_csv("tweets_behaviour.csv")
tweets_behaviour.head(5)

Number of rows: 5879
Number of rows after deleted duplicates: 5512


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,"To make this stop, we need a complete lockdown...",2020-04-01 00:17:50+00:00,0,1,,
1,Lockdown http://wpsbrittanyp.blogspot.com/2020...,2020-04-01 02:31:54+00:00,0,0,,
2,Day 8 of Lockdown: Recommending ‘USS Indianapo...,2020-04-01 02:32:13+00:00,2,8,,
3,@netflix @hulu @PrimeVideo yes I'm still watch...,2020-04-01 02:33:29+00:00,0,0,@netflix @hulu @PrimeVideo,#lockdown
4,could've been at a @dodgers game with a michi ...,2020-04-01 02:57:57+00:00,0,1,@Dodgers,
