In [1]:
import GetOldTweets3 as got
import pandas as pd
import itertools
import datetime
import time

## Custom functions

In [2]:
def time_print(time_secs):
    d = datetime.datetime(1,1,1) + datetime.timedelta(seconds = time_secs)
    date_str = "{}{}{}{}".format(str(d.day-1)+" days, " if d.day>1 else "",
                                str(d.hour)+" hours, " if d.hour>0 else "",
                                str(d.minute)+" min, " if d.minute>0 else "",
                                str(d.second)+" secs")
    return date_str

def keywords_prep_part1(activity_words,product_words,n):
    
    temp_set = set([j for i in itertools.chain(activity_words,product_words) for j in i.split()])
    
    with open('sentiment_words.txt', 'w') as f:
        for item in list(temp_set):
            f.write("%s\n" % item)
    
    # Behavior words are all permutations of activity and product words
    bwords = list(map(" ".join, itertools.product(activity_words,product_words)))

    print("Number of activity words: {}".format(len(activity_words)))
    print("Number of product words: {}".format(len(product_words)))
    print("Number of permutations: {}".format(len(bwords)))
    print("Number of max rows to be pulled: {}".format(len(bwords)*n) 
          if n>0
          else "All possible tweets for each keyword will be pulled")
    return bwords

## GetOldTweets3 enabler function

In [4]:
def get_tweets(query, top_only, start_date, end_date, max_tweets):
   
    # specifying tweet search criteria
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                                .setTopTweets(top_only)\
                                                .setSince(start_date)\
                                                .setUntil(end_date)\
                                                .setMaxTweets(max_tweets)
    
    # scraping tweets based on criteria
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    
    # creating list of tweets with the tweet attributes specified in the list comprehension
    text_tweets = [[tw.text,
                    tw.date,
                    tw.retweets,
                    tw.favorites,
                    tw.mentions,
                    tw.hashtags] for tw in tweet]
    
    # creating dataframe, assigning column names to list of tweets corresponding to tweet attributes
    tweets_df = pd.DataFrame(text_tweets, 
                            columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    
    return tweets_df

## Scraper function for custom words

In [5]:
def custom_scraper(queries,fixedk,limit_keyword):
    tweets_temp = pd.DataFrame(columns = ['Text','Date','Retweets','Favorites','Mentions','HashTags'])
    total_loop_time = 0
    avg_loop_time = 0
    len_queries = len(queries)
    for i,word in enumerate(queries):
        start = time.time()

        custom_query = " ".join(fixedk)+" "+word
        tweet_out = get_tweets(custom_query,
                                top_only = False,
                                start_date = "2020-04-01",
                                end_date = "2020-08-01",
                                max_tweets = limit_keyword)
        tweets_temp = tweets_temp.append(tweet_out, ignore_index=True)

        end = time.time()
        total_loop_time += end-start
        avg_loop_time = total_loop_time/(i+1)
        time_left = (avg_loop_time) * (len_queries-i)

        out1 = "{} in {}: Query \"{}\" ({:.2f} secs) (Rows: {})".format(i,len_queries,custom_query,
                                                                        (end-start),len(tweet_out))
        out2 = "ETA: {}".format(time_print(time_left))
        print("{:<80s}{:<35s}".format(out1,out2))

    print("\nTotal query time: {}".format(time_print(total_loop_time)))
    return tweets_temp

##  Basic scrape code (not in use)

In [None]:
# Between a specific range of dates, with end date not inclusive
# And specifying that we want a max number of tweets in max_tweets.

# The main query word goes here, case insensitive
query = "lockdown"
tweets_df = get_tweets(query,
                        top_only = False,
                        start_date = "2020-04-01",
                        end_date = "2020-08-01",
                        max_tweets = 1)
tweets_df.head(5)

## Pulling for Part 1

In [3]:
activity_words = []

with open("useful_sentiments_small.txt","r") as f:
    lines = [line.strip() for line in f]
for i in lines:
    activity_words.append(i)

product_words = ["zomato","swiggy"]

# Tweets for each query words, <1 means all possible tweets
tweets_per_keyword = 0

sentiment_words = keywords_prep_part1(activity_words,product_words,tweets_per_keyword)
sentiment_words[:5]

Number of activity words: 146
Number of product words: 2
Number of permutations: 292
All possible tweets for each keyword will be pulled


['accessible zomato',
 'accessible swiggy',
 'advantage zomato',
 'advantage swiggy',
 'advantages zomato']

In [37]:
fixed_key = ["lockdown"]
tweets_sentiment = custom_scraper(sentiment_words,fixed_key,tweets_per_keyword)

0 in 310: Query "lockdown a+ zomato" (89.94 secs) (Rows: 455)                   ETA: 7 hours, 44 min, 41 secs      
1 in 310: Query "lockdown a+ swiggy" (102.32 secs) (Rows: 610)                  ETA: 8 hours, 15 min, 4 secs       
2 in 310: Query "lockdown accessible zomato" (2.86 secs) (Rows: 1)              ETA: 5 hours, 33 min, 52 secs      
3 in 310: Query "lockdown accessible swiggy" (2.46 secs) (Rows: 1)              ETA: 4 hours, 12 min, 44 secs      
4 in 310: Query "lockdown advantage zomato" (4.28 secs) (Rows: 20)              ETA: 3 hours, 25 min, 54 secs      
5 in 310: Query "lockdown advantage swiggy" (6.37 secs) (Rows: 26)              ETA: 2 hours, 56 min, 25 secs      
6 in 310: Query "lockdown advantages zomato" (5.26 secs) (Rows: 24)             ETA: 2 hours, 34 min, 31 secs      
7 in 310: Query "lockdown advantages swiggy" (6.61 secs) (Rows: 27)             ETA: 2 hours, 18 min, 56 secs      
8 in 310: Query "lockdown afraid zomato" (2.77 secs) (Rows: 1)          

71 in 310: Query "lockdown crazy swiggy" (3.10 secs) (Rows: 4)                  ETA: 20 min, 26 secs               
72 in 310: Query "lockdown defect zomato" (1.28 secs) (Rows: 0)                 ETA: 20 min, 8 secs                
73 in 310: Query "lockdown defect swiggy" (0.99 secs) (Rows: 0)                 ETA: 19 min, 50 secs               
74 in 310: Query "lockdown defective zomato" (2.77 secs) (Rows: 1)              ETA: 19 min, 38 secs               
75 in 310: Query "lockdown defective swiggy" (1.17 secs) (Rows: 0)              ETA: 19 min, 21 secs               
76 in 310: Query "lockdown delay zomato" (2.46 secs) (Rows: 1)                  ETA: 19 min, 9 secs                
77 in 310: Query "lockdown delay swiggy" (3.28 secs) (Rows: 6)                  ETA: 18 min, 59 secs               
78 in 310: Query "lockdown delayed zomato" (2.46 secs) (Rows: 1)                ETA: 18 min, 47 secs               
79 in 310: Query "lockdown delayed swiggy" (3.07 secs) (Rows: 3)        

142 in 310: Query "lockdown fair zomato" (2.75 secs) (Rows: 1)                  ETA: 9 min, 52 secs                
143 in 310: Query "lockdown fair swiggy" (2.67 secs) (Rows: 1)                  ETA: 9 min, 48 secs                
144 in 310: Query "lockdown fantastic zomato" (1.23 secs) (Rows: 0)             ETA: 9 min, 41 secs                
145 in 310: Query "lockdown fantastic swiggy" (2.77 secs) (Rows: 3)             ETA: 9 min, 37 secs                
146 in 310: Query "lockdown fast zomato" (6.08 secs) (Rows: 6)                  ETA: 9 min, 36 secs                
147 in 310: Query "lockdown fast swiggy" (3.85 secs) (Rows: 10)                 ETA: 9 min, 33 secs                
148 in 310: Query "lockdown faster zomato" (3.10 secs) (Rows: 1)                ETA: 9 min, 29 secs                
149 in 310: Query "lockdown faster swiggy" (2.14 secs) (Rows: 0)                ETA: 9 min, 24 secs                
150 in 310: Query "lockdown friendly zomato" (1.07 secs) (Rows: 0)      

213 in 310: Query "lockdown lost swiggy" (3.42 secs) (Rows: 10)                 ETA: 5 min, 11 secs                
214 in 310: Query "lockdown mad zomato" (1.26 secs) (Rows: 0)                   ETA: 5 min, 6 secs                 
215 in 310: Query "lockdown mad swiggy" (1.31 secs) (Rows: 0)                   ETA: 5 min, 2 secs                 
216 in 310: Query "lockdown maddening zomato" (1.42 secs) (Rows: 0)             ETA: 4 min, 58 secs                
217 in 310: Query "lockdown maddening swiggy" (1.27 secs) (Rows: 0)             ETA: 4 min, 54 secs                
218 in 310: Query "lockdown miserable zomato" (3.04 secs) (Rows: 1)             ETA: 4 min, 51 secs                
219 in 310: Query "lockdown miserable swiggy" (1.17 secs) (Rows: 0)             ETA: 4 min, 47 secs                
220 in 310: Query "lockdown nasty zomato" (2.50 secs) (Rows: 1)                 ETA: 4 min, 44 secs                
221 in 310: Query "lockdown nasty swiggy" (2.68 secs) (Rows: 2)         

284 in 310: Query "lockdown stupid zomato" (2.93 secs) (Rows: 1)                ETA: 1 min, 18 secs                
285 in 310: Query "lockdown stupid swiggy" (2.70 secs) (Rows: 2)                ETA: 1 min, 15 secs                
286 in 310: Query "lockdown stupidest zomato" (2.42 secs) (Rows: 1)             ETA: 1 min, 12 secs                
287 in 310: Query "lockdown stupidest swiggy" (1.19 secs) (Rows: 0)             ETA: 1 min, 9 secs                 
288 in 310: Query "lockdown stupidity zomato" (1.07 secs) (Rows: 0)             ETA: 1 min, 6 secs                 
289 in 310: Query "lockdown stupidity swiggy" (2.66 secs) (Rows: 1)             ETA: 1 min, 3 secs                 
290 in 310: Query "lockdown surprise zomato" (2.54 secs) (Rows: 2)              ETA: 1 min, 0 secs                 
291 in 310: Query "lockdown surprise swiggy" (2.50 secs) (Rows: 7)              ETA: 57 secs                       
292 in 310: Query "lockdown surprised zomato" (3.04 secs) (Rows: 6)     

In [38]:
#Sorting by date
tweets_sentiment.sort_values(by="Date", inplace=True, ascending=True)
print("Number of rows: {}".format(tweets_sentiment.shape[0]))

tweets_sentiment.drop_duplicates(subset=["Text"], inplace=True)
tweets_sentiment.reset_index(drop=True, inplace=True)
print("Number of rows after deleted duplicates: {}".format(tweets_sentiment.shape[0]))

tweets_sentiment.to_csv("tweets_sentiment.csv")
tweets_sentiment.head(5)

Number of rows: 2157
Number of rows after deleted duplicates: 1383


Unnamed: 0,Text,Date,Retweets,Favorites,Mentions,HashTags
0,Things realized during Lockdown: 1. Cooking is...,2020-04-01 05:10:32+00:00,0,0,,
1,"I received two messages today, one from @swigg...",2020-04-01 05:13:18+00:00,0,6,@swiggy_in @NetMeds,
2,Whats wrong with zomato i am unable to order a...,2020-04-01 05:19:26+00:00,0,0,@zomatocare @Zomato @ZomatoIN @iamsrk,#Lockdown21 #lockdownindia
3,@MumbaiPolice @ThaneCityPolice It’s not milita...,2020-04-01 09:22:14+00:00,0,0,@MumbaiPolice @ThaneCityPolice @Zomato,
4,#AskZee sir I am a zomato delivery boy. due lo...,2020-04-01 09:23:03+00:00,0,0,,#AskZee
