In [1]:
import GetOldTweets3 as got
import pandas as pd
import itertools
import datetime
import time

## Custom functions

In [2]:
def time_print(time_secs):
    d = datetime.datetime(1,1,1) + datetime.timedelta(seconds = time_secs)
    date_str = "{}{}{}{}".format(str(d.day-1)+" days, " if d.day>1 else "",
                                str(d.hour)+" hours, " if d.hour>0 else "",
                                str(d.minute)+" min, " if d.minute>0 else "",
                                str(d.second)+" secs")
    return date_str

def keywords_prep_part2(behaviour_list,n):
    
    temp_set = set([k for i in behaviour_list for j in i for k in j.split()])
    with open('behaviour_words.txt', 'w') as f:
        for item in list(temp_set):
            f.write("%s\n" % item)

    bwords = []
    for item in behaviour_list:
        activity = item.pop(0)
        for i in item:
            bwords.append(activity + " " + i)

    print("Number of permutations: {}".format(len(bwords)))
    print("Number of max rows to be pulled: {}".format(len(bwords)*n) 
          if n>0
          else "All possible tweets for each keyword will be pulled")
    return bwords

## GetOldTweets3 enabler function

In [4]:
def get_tweets(query, top_only, start_date, end_date, max_tweets):
   
    # specifying tweet search criteria
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                                .setTopTweets(top_only)\
                                                .setSince(start_date)\
                                                .setUntil(end_date)\
                                                .setMaxTweets(max_tweets)
    
    # scraping tweets based on criteria
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    
    # creating list of tweets with the tweet attributes specified in the list comprehension
    text_tweets = [[tw.text,
                    tw.date,
                    tw.retweets,
                    tw.favorites,
                    tw.mentions,
                    tw.hashtags] for tw in tweet]
    
    # creating dataframe, assigning column names to list of tweets corresponding to tweet attributes
    tweets_df = pd.DataFrame(text_tweets, 
                            columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    
    return tweets_df

## Scraper function for custom words

In [5]:
def custom_scraper(queries,fixedk,limit_keyword):
    tweets_temp = pd.DataFrame(columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    total_loop_time = 0
    avg_loop_time = 0
    len_queries = len(queries)
    for i,word in enumerate(queries):
        start = time.time()

        custom_query = " ".join(fixedk)+" "+word
        tweet_out = get_tweets(custom_query,
                                top_only = False,
                                start_date = "2020-04-01",
                                end_date = "2020-08-01",
                                max_tweets = limit_keyword)
        tweets_temp = tweets_temp.append(tweet_out, ignore_index=True)

        end = time.time()
        total_loop_time += end-start
        avg_loop_time = total_loop_time/(i+1)
        time_left = (avg_loop_time) * (len_queries-i)

        out1 = "{} in {}: Query \"{}\" ({:.2f} secs) (Rows: {})".format(i,len_queries,custom_query,
                                                                        (end-start),len(tweet_out))
        out2 = "ETA: {}".format(time_print(time_left))
        print("{:<80s}{:<35s}".format(out1,out2))

    print("\nTotal query time: {}".format(time_print(total_loop_time)))
    return tweets_temp

##  Basic scrape code (not in use)

In [None]:
# Between a specific range of dates, with end date not inclusive
# And specifying that we want a max number of tweets in max_tweets.

# The main query word goes here, case insensitive
query = "lockdown"
tweets_df = get_tweets(query,
                        top_only = False,
                        start_date = "2020-04-01",
                        end_date = "2020-08-01",
                        max_tweets = 1)
tweets_df.head(5)

## Pulling for Part 2 - Behaviour analysis

In [3]:
behaviour_list = []
behaviour_list.append(["watched","movie","netflix","amazon prime","hotstar","youtube","online"])
behaviour_list.append(["watching","movie","netflix","amazon prime","hotstar","youtube","online"])
behaviour_list.append(["played","game","xbox","playstation","android","game online"])
behaviour_list.append(["playing","game","xbox","playstation","android","game online"])
behaviour_list.append(["shopping","flipkart","amazon","myntra","jabong","online"])
behaviour_list.append(["ordered","zomato","swiggy","food online"])
behaviour_list.append(["listening","music","saavn","spotify","gaana","youtube music","music online"])

# Tweets for each query keyword, <1 means all possible tweets
tweets_per_keyword = 4000

behaviour_words = keywords_prep_part2(behaviour_list,tweets_per_keyword)
behaviour_words[:5]

Number of permutations: 36
Number of max rows to be pulled: 144000


['watched movie',
 'watched netflix',
 'watched amazon prime',
 'watched hotstar',
 'watched youtube']

In [14]:
fixed_key = ["lockdown"]
tweets_behaviour = custom_scraper(behaviour_words,fixed_key,tweets_per_keyword)

0 in 36: Query "lockdown watched movie" (566.27 secs) (Rows: 4000)              ETA: 5 hours, 39 min, 45 secs      
1 in 36: Query "lockdown watched netflix" (776.26 secs) (Rows: 4000)            ETA: 6 hours, 31 min, 34 secs      
2 in 36: Query "lockdown watched amazon prime" (75.44 secs) (Rows: 539)         ETA: 4 hours, 27 min, 50 secs      
3 in 36: Query "lockdown watched hotstar" (31.51 secs) (Rows: 233)              ETA: 3 hours, 19 min, 18 secs      
4 in 36: Query "lockdown watched youtube" (182.58 secs) (Rows: 1042)            ETA: 2 hours, 54 min, 5 secs       
5 in 36: Query "lockdown watched online" (617.26 secs) (Rows: 4000)             ETA: 3 hours, 13 min, 41 secs      
6 in 36: Query "lockdown watching movie" (54.79 secs) (Rows: 374)               ETA: 2 hours, 44 min, 34 secs      
7 in 36: Query "lockdown watching netflix" (460.87 secs) (Rows: 3528)           ETA: 2 hours, 47 min, 3 secs       
8 in 36: Query "lockdown watching amazon prime" (29.69 secs) (Rows: 211)

In [15]:
#Sorting by date
tweets_behaviour.sort_values(by="Date", inplace=True, ascending=True)
print("Number of rows: {}".format(tweets_behaviour.shape[0]))
tweets_behaviour.drop_duplicates(subset=["Text"], inplace=True)
tweets_behaviour.reset_index(drop=True, inplace=True)
print("Number of rows after deleted duplicates: {}".format(tweets_behaviour.shape[0]))

tweets_behaviour.to_csv("tweets_behaviour.csv")
tweets_behaviour.head(5)

Number of rows: 38057
Number of rows after deleted duplicates: 33741


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,"Since the moderated lockdown, I haven’t been l...",2020-04-01 00:02:05+00:00,0,0,,
1,so it's online and I thought I could look up t...,2020-04-01 00:03:09+00:00,0,1,,
2,what have i been doing during this time? cooki...,2020-04-01 00:06:12+00:00,0,0,,
3,"To make this stop, we need a complete lockdown...",2020-04-01 00:17:50+00:00,0,1,,
4,I’ve been listening to my music on shuffle sin...,2020-04-01 00:23:46+00:00,0,1,,
