In [1]:
import GetOldTweets3 as got
import pandas as pd
import itertools
import datetime
import time

In [82]:
def time_print(time_secs):
    d = datetime.datetime(1,1,1) + datetime.timedelta(seconds = time_secs)
    date_str = "{}{}{}{}".format(str(d.day-1)+" days, " if d.day>1 else "",
                                str(d.hour)+" hours, " if d.hour>0 else "",
                                str(d.minute)+" min, " if d.minute>0 else "",
                                str(d.second)+" secs")
    return date_str

### Basic GetOldTweets3 function

In [83]:
def get_tweets(query, top_only, start_date, end_date, max_tweets):
   
    # specifying tweet search criteria
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                                .setTopTweets(top_only)\
                                                .setSince(start_date)\
                                                .setUntil(end_date)\
                                                .setMaxTweets(max_tweets)
    
    # scraping tweets based on criteria
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    
    # creating list of tweets with the tweet attributes specified in the list comprehension
    text_tweets = [[tw.text,
                    tw.date,
                    tw.retweets,
                    tw.favorites,
                    tw.mentions,
                    tw.hashtags] for tw in tweet]
    
    # creating dataframe, assigning column names to list of tweets corresponding to tweet attributes
    tweets_df = pd.DataFrame(text_tweets, 
                            columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    
    return tweets_df

### Custom scraper function to use with behaviour words

In [84]:
def custom_scraper(behaviour_words,limit_keyword):
    tweets_temp = pd.DataFrame(columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    total_loop_time = 0
    avg_loop_time = 0
    len_behaviour_words = len(behaviour_words)
    for i,word in enumerate(behaviour_words):
        start = time.time()

        custom_query = "lockdown " + word
        tweet_out = get_tweets(custom_query,
                                top_only = False,
                                start_date = "2020-04-01",
                                end_date = "2020-08-01",
                                max_tweets = limit_keyword)
        tweets_temp = tweets_temp.append(tweet_out, ignore_index=True)

        end = time.time()
        total_loop_time += end-start
        avg_loop_time = total_loop_time/(i+1)
        time_left = (avg_loop_time) * (len_behaviour_words-i)

        out1 = "{} in {}: Query \"{}\" ({:.2f} secs) (Rows: {})".format(i,len_behaviour_words,word,
                                                                        (end-start),len(tweet_out))
        out2 = "ETA: {}".format(time_print(time_left))
        print("{:<80s}{:<35s}".format(out1,out2))

    print("\nTotal query time: {}".format(time_print(total_loop_time)))
    return tweets_temp

###  Base segment contaning basic scrape code (not in use)

In [85]:
# Only including top tweets,
# Between a specific range of dates, with end date not inclusive
# And specifying that we want a max number of tweets in max_tweets.

# The main query word goes here, case insensitive
query = "lockdown"

tweets_df = get_tweets(query,
                        top_only = False,
                        start_date = "2020-04-01",
                        end_date = "2020-08-01",
                        max_tweets = 10)
tweets_df.head(5)

Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,"In a world where celebrities are worshiped, I ...",2020-07-31 23:59:57+00:00,0,0,,
1,That weed Nengi stopped blazing truly is putti...,2020-07-31 23:59:56+00:00,0,0,,#bbnaija2020lockdown
2,I truly feel sorry for the United States. Dr F...,2020-07-31 23:59:54+00:00,0,3,,
3,Our half ass lockdown has us already at 150k. ...,2020-07-31 23:59:51+00:00,0,0,,
4,They are this time around as their numbers sta...,2020-07-31 23:59:50+00:00,1,0,,


### Main segment with relevant inputs

In [86]:
activity_words = ["bought","purchased","thinking of","looking for",
"thinking about","want to buy","recommend","suggest","think of"]

product_words = ["phone","smartphone","mobile","iphone","android","samsung","nokia","galaxy","redmi","xiaomi",
                "oppo","realme","vivo","oneplus","huawei","honor","motorola"]

with open('behaviour_words.txt', 'w') as f:
    for item in activity_words:
        f.write("%s\n" % item)
    for item in product_words:
        f.write("%s\n" % item)

# Behavior words are all permutations of activity and product words
behaviour_words = list(map(" ".join, itertools.product(activity_words,product_words)))

# Tweets for each behaviour words, <1 means all possible tweets
num_tweets_keyword = 0

print("Number of activity words: {}".format(len(activity_words)))
print("Number of product words: {}".format(len(product_words)))
print("Number of permutations: {}".format(len(behaviour_words)))
print("Number of rows to be pulled: {}".format(len(behaviour_words)*num_tweets_keyword) 
      if num_tweets_keyword>0
      else "All possible tweets for each keyword will be pulled")

Number of activity words: 10
Number of product words: 17
Number of permutations: 170
All possible tweets for each keyword will be pulled


### Pulling tweets for our behaviour words

In [87]:
tweets_df_custom = custom_scraper(behaviour_words,num_tweets_keyword)

0 in 170: Query "bought phone" (60.88 secs) (Rows: 496)                         ETA: 2 hours, 52 min, 29 secs      
1 in 170: Query "bought smartphone" (3.07 secs) (Rows: 17)                      ETA: 1 hours, 30 min, 3 secs       
2 in 170: Query "bought mobile" (15.57 secs) (Rows: 120)                        ETA: 1 hours, 14 min, 12 secs      
3 in 170: Query "bought iphone" (15.97 secs) (Rows: 125)                        ETA: 1 hours, 6 min, 26 secs       
4 in 170: Query "bought android" (3.17 secs) (Rows: 20)                         ETA: 54 min, 35 secs               
5 in 170: Query "bought samsung" (9.93 secs) (Rows: 66)                         ETA: 49 min, 46 secs               
6 in 170: Query "bought nokia" (3.17 secs) (Rows: 17)                           ETA: 43 min, 38 secs               
7 in 170: Query "bought galaxy" (4.51 secs) (Rows: 26)                          ETA: 39 min, 29 secs               
8 in 170: Query "bought redmi" (5.32 secs) (Rows: 24)                   

71 in 170: Query "looking for iphone" (7.93 secs) (Rows: 47)                    ETA: 13 min, 37 secs               
72 in 170: Query "looking for android" (6.59 secs) (Rows: 41)                   ETA: 13 min, 27 secs               
73 in 170: Query "looking for samsung" (2.52 secs) (Rows: 8)                    ETA: 13 min, 11 secs               
74 in 170: Query "looking for nokia" (2.46 secs) (Rows: 3)                      ETA: 12 min, 55 secs               
75 in 170: Query "looking for galaxy" (2.86 secs) (Rows: 6)                     ETA: 12 min, 41 secs               
76 in 170: Query "looking for redmi" (2.41 secs) (Rows: 3)                      ETA: 12 min, 26 secs               
77 in 170: Query "looking for xiaomi" (1.75 secs) (Rows: 1)                     ETA: 12 min, 11 secs               
78 in 170: Query "looking for oppo" (2.13 secs) (Rows: 4)                       ETA: 11 min, 56 secs               
79 in 170: Query "looking for realme" (1.85 secs) (Rows: 3)             

142 in 170: Query "suggest nokia" (2.26 secs) (Rows: 3)                         ETA: 2 min, 41 secs                
143 in 170: Query "suggest galaxy" (2.56 secs) (Rows: 8)                        ETA: 2 min, 35 secs                
144 in 170: Query "suggest redmi" (2.27 secs) (Rows: 3)                         ETA: 2 min, 29 secs                
145 in 170: Query "suggest xiaomi" (2.95 secs) (Rows: 2)                        ETA: 2 min, 22 secs                
146 in 170: Query "suggest oppo" (3.58 secs) (Rows: 5)                          ETA: 2 min, 16 secs                
147 in 170: Query "suggest realme" (2.39 secs) (Rows: 13)                       ETA: 2 min, 10 secs                
148 in 170: Query "suggest vivo" (0.95 secs) (Rows: 0)                          ETA: 2 min, 4 secs                 
149 in 170: Query "suggest oneplus" (5.77 secs) (Rows: 17)                      ETA: 1 min, 58 secs                
150 in 170: Query "suggest huawei" (2.56 secs) (Rows: 2)                

### Tweets Dataframe as output

In [88]:
#Sorting by date
tweets_df_custom = tweets_df_custom.sort_values(by="Date", ascending=True)
print("Number of rows: {}".format(tweets_df_custom.shape[0]))
tweets_df_custom.head()

Number of rows: 5726


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
2835,Lockdown Day 7: Banks begin loan moratorium me...,2020-04-01 00:07:13+00:00,0,0,,
2834,It’s not a surprise that people had to get gra...,2020-04-01 00:33:33+00:00,0,1,,
495,bought a new phone bc I ran out of storage for...,2020-04-01 01:17:18+00:00,0,1,,
3483,"Looking forward to ending the lockdown, Britai...",2020-04-01 01:40:23+00:00,0,0,,
494,u know i bought stardew valley on my phone bc ...,2020-04-01 02:45:03+00:00,0,0,,


In [89]:
# Removing duplicate rows
tweets_df_custom = tweets_df_custom.drop_duplicates(subset=["Text"])
print("Number of rows: {}".format(tweets_df_custom.shape[0]))

# tweets_df.to_csv("tweets_raw.csv")
tweets_df_custom.to_csv("tweets_raw_berk.csv")

Number of rows: 4750
