In [1]:
import tweepy
import pandas as pd
import numpy as np
import os, re
import json
from textblob import TextBlob
from datetime import date, datetime
# Display options: never wrap wide frames across several lines, and never truncate
# the contents of a cell (tweet text is long and should be shown in full).
pd.set_option('display.expand_frame_repr', False)
try:
    # pandas >= 1.0 spells "no limit" as None; -1 was deprecated there and is no
    # longer accepted by newer releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas only accepts integers for this option and uses -1 for "no limit".
    pd.set_option('display.max_colwidth', -1)

### Twitter API Configuration

In [2]:
# Credentials are read from the environment so that real keys never have to be typed
# into (or saved with) the notebook. An unset variable falls back to "", which is the
# value this cell used to hardcode.
auth = tweepy.OAuthHandler(os.environ.get("TWITTER_CONSUMER_KEY", ""),
                           os.environ.get("TWITTER_CONSUMER_SECRET", ""))
auth.set_access_token(os.environ.get("TWITTER_ACCESS_TOKEN", ""),
                      os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", ""))
# Wait out rate-limit windows instead of failing, and print a notice while sleeping.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [3]:
# Load the input CSV; the distinct values of its `name` column are the search terms.
input_path = 'D:\\Data\\Box-Office-Forecasting'
data = pd.read_csv(
    os.path.join(input_path, 'all-not-null.csv'),
    sep=';',
    header=0,
    encoding='utf8',
    engine='python',
)

# Titles left out of the search: it is hard to determine the relevance of a tweet
# that matches them.
removed_movies = ['Up', 'Made', 'Life', 'LOL', 'Yes', 'Super', 'Beautiful', 'Red', 'Final', 'Stay',
                  'May', 'War', 'Deal', '9', 'Special', 'Alone', 'O', 'Committed', 'Brother']

movie_names = [title for title in data['name'].unique() if title not in removed_movies]

### Search Movies

In [4]:
def save_tweet_ids(tweet_id_min_max, path='D:\\Data\\Twitter\\tweet_id_min_max.json'):
    """Write the per-movie tweet-id bookkeeping dict to disk as JSON.

    Args:
        tweet_id_min_max: JSON-serialisable dict to persist.
        path: Destination file. The default is the location this notebook has
            always written to.
    """
    # The default path is spelled with escaped backslashes throughout; the previous
    # literal depended on '\D' and '\T' being unrecognised escape sequences, which
    # newer Python versions warn about.
    with open(path, 'w') as fp:
        json.dump(tweet_id_min_max, fp)

def get_tweet_ids(path='D:\\Data\\Twitter\\tweet_id_min_max.json'):
    """Load the per-movie tweet-id bookkeeping dict from its JSON file.

    Args:
        path: File to read. The default is the location this notebook has
            always read from.

    Returns:
        The parsed JSON content, or an empty dict when the file does not exist
        yet (e.g. on the very first run, before anything has been saved).
    """
    try:
        with open(path) as f:
            return json.load(f)
    except FileNotFoundError:
        # Nothing has been recorded yet; start with empty bookkeeping.
        return {}
    
def max_tweet_id(tweet_id_min_max, movie):
    """Return the stored ``max_id`` for ``movie``, or 0 when none is recorded.

    Args:
        tweet_id_min_max: Bookkeeping dict shaped like ``{movie: {'max_id': id}}``.
        movie: Key to look up.

    Returns:
        The recorded id, or 0 if the movie (or its 'max_id' entry) is missing.
    """
    try:
        return tweet_id_min_max[movie]['max_id']
    except (KeyError, TypeError):
        # KeyError: movie or 'max_id' not present. TypeError: the container or the
        # entry is not subscriptable (e.g. None). Anything else, including
        # KeyboardInterrupt, is deliberately allowed to propagate.
        return 0
    
def get_tweets(movie, max_id, last_batch_min_id):
    """Fetch one page (up to 100) of English tweets matching ``"<movie>" movie``.

    Paging goes backwards in the timeline: each call asks for tweets older than
    the oldest tweet of the previous batch.

    Args:
        movie: Title to search for; it is quoted in the query.
        max_id: Lower bound passed to the API as ``since_id``. 0 means no lower
            bound is sent.
        last_batch_min_id: Smallest tweet id collected so far in this paging
            session. 0 means this is the first page and no upper bound is sent.

    Returns:
        The result of ``api.search`` for the assembled query.
    """
    search_args = {
        'q': '"' + movie + '"' + ' movie',
        'count': 100,
        'lang': 'en',
        'tweet_mode': 'extended',
    }
    if max_id != 0:
        search_args['since_id'] = max_id
    if last_batch_min_id != 0:
        # The API's max_id bound is inclusive; subtract 1 so the oldest tweet of the
        # previous batch is not returned a second time.
        search_args['max_id'] = last_batch_min_id - 1

    return api.search(**search_args)

In [5]:
ids = []
movies = []
tweet_texts = []
tweet_dates = []
# Per-movie record of the newest tweet id already collected; it is updated in memory
# and written back after every movie, so it only needs to be loaded once.
tweet_id_min_max = get_tweet_ids()
for movie in movie_names:
    max_id = max_tweet_id(tweet_id_min_max, movie)
    last_batch_min_id = 0
    movie_ids = []
    seen_ids = set()
    while True:
        tweets = get_tweets(movie, max_id, last_batch_min_id)
        new_in_batch = 0
        for tweet in tweets:
            # A page can repeat the boundary tweet of the previous page; collect each
            # tweet id only once.
            if tweet.id in seen_ids:
                continue
            seen_ids.add(tweet.id)
            new_in_batch += 1

            movies.append(movie)
            movie_ids.append(tweet.id)
            ids.append(tweet.id)
            # Statuses fetched with tweet_mode='extended' expose the untruncated text
            # as `full_text`; fall back to `text` for payloads that only have that.
            tweet_body = getattr(tweet, 'full_text', None)
            if tweet_body is None:
                tweet_body = tweet.text
            tweet_texts.append(tweet_body)
            tweet_dates.append(tweet.created_at)

        if new_in_batch == 0:
            # Nothing new on this page: the backwards walk for this movie is done.
            break
        last_batch_min_id = min(movie_ids)

    print("Movie Name: {0}, #Tweets: {1}, Time: {2}".format(movie, len(movie_ids), str(datetime.now())))
    if len(movie_ids) > 0:
        max_id = max(movie_ids)
    # Persist after every movie so an interrupted run keeps the ids reached so far.
    tweet_id_min_max[movie] = {'max_id': max_id}
    save_tweet_ids(tweet_id_min_max)

Movie Name: Cast Away, #Tweets: 12, Time: 2019-03-07 17:31:34.760982
Movie Name: Gladiator, #Tweets: 58, Time: 2019-03-07 17:31:35.988532
Movie Name: American Psycho, #Tweets: 32, Time: 2019-03-07 17:31:37.177908
Movie Name: Miss Congeniality, #Tweets: 5, Time: 2019-03-07 17:31:38.272932
Movie Name: Bring It On, #Tweets: 120, Time: 2019-03-07 17:31:40.525828
Movie Name: Hollow Man, #Tweets: 11, Time: 2019-03-07 17:31:41.874300
Movie Name: Pitch Black, #Tweets: 9, Time: 2019-03-07 17:31:43.176998
Movie Name: Reindeer Games, #Tweets: 2, Time: 2019-03-07 17:31:44.160525
Movie Name: Battlefield Earth, #Tweets: 11, Time: 2019-03-07 17:31:45.345265
Movie Name: Dancer in the Dark, #Tweets: 4, Time: 2019-03-07 17:31:46.215973
Movie Name: Best in Show, #Tweets: 0, Time: 2019-03-07 17:31:46.879406
Movie Name: High Fidelity, #Tweets: 3, Time: 2019-03-07 17:31:47.900293
Movie Name: Bedazzled, #Tweets: 6, Time: 2019-03-07 17:31:48.869885
Movie Name: Dinosaur, #Tweets: 65, Time: 2019-03-07 17:31:50.

Rate limit reached. Sleeping for: 790


KeyboardInterrupt: 

In [None]:
# Combine the parallel per-tweet lists into one frame, one row per collected tweet.
tweets_df = pd.DataFrame(dict(movie=movies, tweet_id=ids, text=tweet_texts, date=tweet_dates))
# Number of rows, shown as the cell output.
tweets_df.shape[0]

### Pre-processing

In [None]:
def remove_pattern(text, pattern_regex):
    """Return ``text`` with every match of ``pattern_regex`` removed.

    Args:
        text: String to clean.
        pattern_regex: Regular expression (string or compiled) whose matches
            are deleted.

    Returns:
        The cleaned string.
    """
    # Substitute with the pattern itself. Feeding each matched substring back into
    # re.sub as a new pattern would reinterpret characters such as '(' or '.' inside
    # the match as regex syntax.
    return re.sub(pattern_regex, '', text)

# Strip @mentions (with an optional trailing ':') and the standalone "RT" retweet
# marker, each together with one following whitespace character. The cleaned text
# overwrites the 'text' column. `\bRT\b` only matches RT as a whole token, so capital
# letters inside other words are left alone.
tweets_df['text'] = tweets_df['text'].str.replace(r"@[\w:]*\s?|\bRT\b\s?", "", regex=True)

#### Removing links (http | https), strange characters, and duplicates

In [None]:
# Drop every whitespace-separated token that contains 'http' and re-join the
# remaining tokens with single spaces.
cleaned_tweets = [
    ' '.join(token for token in text.split() if 'http' not in token)
    for text in tweets_df['text']
]

tweets_df['text'] = cleaned_tweets

In [None]:
# Keep only ASCII letters, spaces and the characters # . , ; ' ! ?
# regex=True is stated explicitly: without it, newer pandas treats the pattern as a
# literal string and removes nothing.
tweets_df['text'] = tweets_df['text'].str.replace("[^a-zA-Z#.,;'!? ]", "", regex=True)
# keep=False discards every row whose cleaned text occurs more than once; no copy is
# kept. NOTE(review): confirm that dropping all copies, rather than keeping one, is intended.
tweets_df = tweets_df.drop_duplicates(subset=['text'], keep=False)

#### Save

In [None]:
output_path = 'D:\\Data\\Twitter'
output_file = os.path.join(output_path, 'tweets_' + datetime.now().strftime("%Y-%m-%d") + '.csv')
# The file is opened in append mode, so several runs on the same day share one file.
# Write the column header only when the file is new or empty; otherwise every extra
# run would insert another header line in the middle of the data.
write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
tweets_df.to_csv(output_file, sep='~', mode='a', encoding='utf8', index=False,
                 header=write_header)  # appends