In [2]:
import GetOldTweets3 as got
import pandas as pd
import itertools
import datetime
import time

In [3]:
def time_print(time_secs):
    d = datetime.datetime(1,1,1) + datetime.timedelta(seconds = time_secs)
    date_str = "{}{}{}{}".format(str(d.day-1)+" days, " if d.day>1 else "",
                                str(d.hour)+" hours, " if d.hour>0 else "",
                                str(d.minute)+" min, " if d.minute>0 else "",
                                str(d.second)+" secs")
    return date_str

### Basic GetOldTweets3 function

In [4]:
def get_tweets(query, top_only, start_date, end_date, max_tweets):
   
    # specifying tweet search criteria
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                                .setTopTweets(top_only)\
                                                .setSince(start_date)\
                                                .setUntil(end_date)\
                                                .setMaxTweets(max_tweets)
    
    # scraping tweets based on criteria
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    
    # creating list of tweets with the tweet attributes specified in the list comprehension
    text_tweets = [[tw.text,
                    tw.date,
                    tw.retweets,
                    tw.favorites,
                    tw.mentions,
                    tw.hashtags] for tw in tweet]
    
    # creating dataframe, assigning column names to list of tweets corresponding to tweet attributes
    tweets_df = pd.DataFrame(text_tweets, 
                            columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    
    return tweets_df

### Custom scraper function to use with behaviour words

In [20]:
def custom_scraper(behaviour_words,limit_keyword):
    tweets_temp = pd.DataFrame(columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    total_loop_time = 0
    avg_loop_time = 0
    len_behaviour_words = len(behaviour_words)
    for i,word in enumerate(behaviour_words):
        start = time.time()

        custom_query = "lockdown " + word
        tweet_out = get_tweets(custom_query,
                                top_only = False,
                                start_date = "2020-04-01",
                                end_date = "2020-08-01",
                                max_tweets = limit_keyword)
        tweets_temp = tweets_temp.append(tweet_out, ignore_index=True)

        end = time.time()
        total_loop_time += end-start
        avg_loop_time = total_loop_time/(i+1)
        time_left = (avg_loop_time) * (len_behaviour_words-i)

        out1 = "{} in {}: Query \"{}\" ({:.2f} secs) (Rows: {})".format(i,len_behaviour_words,word,
                                                                        (end-start),len(tweet_out))
        out2 = "ETA: {}".format(time_print(time_left))
        print("{:<80s}{:<35s}".format(out1,out2))

    print("\nTotal query time: {}".format(time_print(total_loop_time)))
    return tweets_temp

###  Base segment contaning basic scrape code (not in use)

In [85]:
# Only including top tweets,
# Between a specific range of dates, with end date not inclusive
# And specifying that we want a max number of tweets in max_tweets.

# The main query word goes here, case insensitive
query = "lockdown"

tweets_df = get_tweets(query,
                        top_only = False,
                        start_date = "2020-04-01",
                        end_date = "2020-08-01",
                        max_tweets = 10)
tweets_df.head(5)

Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,"In a world where celebrities are worshiped, I ...",2020-07-31 23:59:57+00:00,0,0,,
1,That weed Nengi stopped blazing truly is putti...,2020-07-31 23:59:56+00:00,0,0,,#bbnaija2020lockdown
2,I truly feel sorry for the United States. Dr F...,2020-07-31 23:59:54+00:00,0,3,,
3,Our half ass lockdown has us already at 150k. ...,2020-07-31 23:59:51+00:00,0,0,,
4,They are this time around as their numbers sta...,2020-07-31 23:59:50+00:00,1,0,,


### Main segment with relevant inputs

In [18]:
activity_words = ["bought","purchased","thinking of","looking for",
"thinking about","want to buy","recommend","suggest","think of"]
# ordered

product_words = ["iphone","android","samsung","nokia","galaxy","redmi","xiaomi",
                "oppo","realme","vivo","oneplus","huawei","honor","motorola"]

with open('behaviour_words.txt', 'w') as f:
    for item in activity_words:
        f.write("%s\n" % item)
    for item in product_words:
        f.write("%s\n" % item)

with open('activity_words.txt', 'w') as f:
    for item in activity_words:
        f.write("%s\n" % item)

with open('product_words.txt', 'w') as f:
    for item in product_words:
        f.write("%s\n" % item)
        
# Behavior words are all permutations of activity and product words
behaviour_words = list(map(" ".join, itertools.product(activity_words,product_words)))

# Tweets for each behaviour words, <1 means all possible tweets
num_tweets_keyword = 15

print("Number of activity words: {}".format(len(activity_words)))
print("Number of product words: {}".format(len(product_words)))
print("Number of permutations: {}".format(len(behaviour_words)))
print("Number of rows to be pulled: {}".format(len(behaviour_words)*num_tweets_keyword) 
      if num_tweets_keyword>0
      else "All possible tweets for each keyword will be pulled")

Number of activity words: 9
Number of product words: 14
Number of permutations: 126
Number of rows to be pulled: 1890


### Pulling tweets for our behaviour words

In [19]:
tweets_df_custom = custom_scraper(behaviour_words,num_tweets_keyword)

0 in 126: Query "bought iphone" (1.87 secs) (Rows: 15)                          ETA: 3 min, 55 secs                
1 in 126: Query "bought android" (1.98 secs) (Rows: 15)                         ETA: 4 min, 0 secs                 
2 in 126: Query "bought samsung" (3.03 secs) (Rows: 15)                         ETA: 4 min, 43 secs                
3 in 126: Query "bought nokia" (2.08 secs) (Rows: 15)                           ETA: 4 min, 35 secs                
4 in 126: Query "bought galaxy" (2.28 secs) (Rows: 15)                          ETA: 4 min, 33 secs                
5 in 126: Query "bought redmi" (1.53 secs) (Rows: 15)                           ETA: 4 min, 17 secs                
6 in 126: Query "bought xiaomi" (1.80 secs) (Rows: 15)                          ETA: 4 min, 9 secs                 
7 in 126: Query "bought oppo" (2.06 secs) (Rows: 15)                            ETA: 4 min, 7 secs                 
8 in 126: Query "bought realme" (2.17 secs) (Rows: 15)                  

71 in 126: Query "want to buy android" (2.20 secs) (Rows: 15)                   ETA: 2 min, 1 secs                 
72 in 126: Query "want to buy samsung" (2.24 secs) (Rows: 15)                   ETA: 1 min, 59 secs                
73 in 126: Query "want to buy nokia" (1.67 secs) (Rows: 15)                     ETA: 1 min, 57 secs                
74 in 126: Query "want to buy galaxy" (2.05 secs) (Rows: 15)                    ETA: 1 min, 54 secs                
75 in 126: Query "want to buy redmi" (2.09 secs) (Rows: 15)                     ETA: 1 min, 52 secs                
76 in 126: Query "want to buy xiaomi" (4.25 secs) (Rows: 15)                    ETA: 1 min, 51 secs                
77 in 126: Query "want to buy oppo" (1.64 secs) (Rows: 15)                      ETA: 1 min, 49 secs                
78 in 126: Query "want to buy realme" (1.59 secs) (Rows: 15)                    ETA: 1 min, 46 secs                
79 in 126: Query "want to buy vivo" (1.57 secs) (Rows: 15)              

### Tweets Dataframe as output

In [8]:
#Sorting by date
tweets_df_custom = tweets_df_custom.sort_values(by="Date", ascending=True)
print("Number of rows: {}".format(tweets_df_custom.shape[0]))
tweets_df_custom.head()

Number of rows: 1416


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
629,"@XiaomiIndia @Xiaomi we hve ordered Mi 40"" TV ...",2020-04-01 03:59:59+00:00,0,0,@XiaomiIndia @Xiaomi,
532,@OnlineGpsc I had purchased my online class fr...,2020-04-01 05:48:42+00:00,0,0,@OnlineGpsc,
209,@SamsungIndia @Samsung @amazon Worst experienc...,2020-04-01 06:05:48+00:00,0,0,@SamsungIndia @Samsung @amazon @amazon,
628,"@Xiaomi Mi A2 Packed on 2018, July Purchased O...",2020-04-01 06:27:55+00:00,0,0,@Xiaomi,
688,@flipkartsupport recently just before the coun...,2020-04-01 08:17:43+00:00,0,0,@flipkartsupport,


In [9]:
# Removing duplicate rows
tweets_df_custom = tweets_df_custom.drop_duplicates(subset=["Text"])
print("Number of rows: {}".format(tweets_df_custom.shape[0]))

# tweets_df.to_csv("tweets_raw.csv")
tweets_df_custom.to_csv("tweets_raw.csv")

Number of rows: 1217
