In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the cleaned tweets CSV into a DataFrame, then preview its first five rows
tweets = pd.read_csv( filepath_or_buffer="data/tweets-clean.csv" )
tweets.head( n=5 )

Unnamed: 0,text,created_at,year,month,day,hour,minute
0,Be sure to tune in and watch Donald Trump on L...,2009-05-04 18:54:25,2009,5,4,18,54
1,Donald Trump will be appearing on The View tom...,2009-05-05 01:00:10,2009,5,5,1,0
2,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 13:38:08,2009,5,8,13,38
3,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 20:40:15,2009,5,8,20,40
4,My persona will never be that of a wallflower ...,2009-05-12 14:07:28,2009,5,12,14,7


### Group Tweets by Year and Month

In [21]:
# One group per distinct ( year, month ) pair
tweets_grp = tweets.groupby( by=[ "year", "month" ] )

### Get Grouped Tweets by Tuple Index ( year, month )

In [22]:
# Look up a single group by its ( year, month ) tuple key and preview the text and timestamp columns
# see: https://stackoverflow.com/questions/14734533/how-to-access-pandas-groupby-dataframe-by-key
tweets_grp.get_group( ( 2009, 5 ) ).loc[ :, [ "text", "created_at" ] ].head()

Unnamed: 0,text,created_at
0,Be sure to tune in and watch Donald Trump on L...,2009-05-04 18:54:25
1,Donald Trump will be appearing on The View tom...,2009-05-05 01:00:10
2,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 13:38:08
3,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 20:40:15
4,My persona will never be that of a wallflower ...,2009-05-12 14:07:28


### Get Tweets for One Month As List

In [26]:
# Text of every tweet in the ( 2009, 5 ) group, as a plain Python list
tweets_one_month = tweets_grp.get_group( ( 2009, 5 ) ).loc[ :, "text" ].tolist()
# show the second-to-last tweet of that month
print( tweets_one_month[ -2 ] )

Read an excerpt from Think Like A Champion by Donald J. Trump: http://bit.ly/11FHpR


### Get List of Key Tuples

In [6]:
# Iterating a groupby yields ( key, sub-frame ) pairs; keep only the ( year, month ) keys, in iteration order
tweets_grp_keys = [ year_month for year_month, _ in tweets_grp ]

# how many months there are, and what the first few keys look like
print( len( tweets_grp_keys ) )
print( tweets_grp_keys[ :7 ] )

112
[(2009, 5), (2009, 6), (2009, 7), (2009, 8), (2009, 9), (2009, 10), (2009, 11)]


### Get One Month of Tweets as List

In [7]:
# Select the "text" column on the groupby first, then pull out the ( 2009, 5 ) group as a list
one_month = tweets_grp[ "text" ].get_group( ( 2009, 5 ) ).tolist()
# first tweet of that month
one_month[ 0 ]

'Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!'

### Iterate Grouped Tweets 12 Months at a Time

In [8]:
# first two ( year, month ) keys
print( tweets_grp_keys[ 0 ] )
print( tweets_grp_keys[ 1 ] )
# only a cell's last expression is displayed, so the count must be printed explicitly
# (a bare `len( ... )` in the middle of the cell is evaluated and then discarded)
print( len( tweets_grp_keys ) )
# look at last key
tweets_grp_keys[ -1 ]

(2009, 5)
(2009, 6)


(2018, 8)

In [9]:
def get_tweets_by_windows( window_size=12, debug=False, early_exit=False, early_exit_idx=42, verbose=False, grouped=None, keys=None ):

    """ Builds rolling windows of `window_size` consecutive months of tweets w/ dates.

    Window 0 covers months 0 .. window_size - 1, window 1 covers months 1 .. window_size, and so on:
    every window starts exactly one month after the previous one.
    NOTE: windowing stops once the right edge of the window reaches the last month.  It does *not*
    continue with shorter, partially filled windows.  If there are fewer months than `window_size`
    the result is an empty list.

    Args:
        window_size:    number of consecutive months per window (default 12); must be >= 1.
        debug:          print one line every time a window is completed.
        early_exit:     debugging aid: stop early, see `early_exit_idx`.
        early_exit_idx: with `early_exit`, stop right after the first month visit whose 0-based running
                        count (across all windows) exceeds this value.  A window that is only partially
                        filled at that point is dropped.
        verbose:        together with `debug`, also print one line per month visited.
        grouped:        groupby object that is looked up with `get_group( key )` and exposes the
                        "text" and "created_at" columns.  Defaults to the notebook-level `tweets_grp`.
        keys:           ordered list of group keys.  Defaults to the notebook-level `tweets_grp_keys`.

    Returns:
        A list of windows.  Each window is a list with one `( dates, tweets )` tuple per month, where
        `dates` comes from "created_at" and `tweets` from "text".  Every window owns its own list objects.

    Raises:
        ValueError: if `window_size` is smaller than 1.
    """

    if window_size < 1:
        raise ValueError( "window_size must be >= 1, got {}".format( window_size ) )

    # fall back to the notebook-level grouping when the caller does not supply one
    if grouped is None: grouped = tweets_grp
    if keys is None: keys = tweets_grp_keys

    # each month appears in up to `window_size` windows: fetch it from the groupby only once
    month_cache = {}

    def _month_lists( tweets_key ):
        # returns the cached ( dates, tweets ) lists for one month, loading them on first use
        if tweets_key not in month_cache:
            month_frame = grouped.get_group( tweets_key )
            month_cache[ tweets_key ] = ( month_frame[ "created_at" ].tolist(), month_frame[ "text" ].tolist() )
        return month_cache[ tweets_key ]

    all_windows = []
    # running count of month visits across all windows; drives early_exit and the debug output
    counter = 0
    # index of the last window start that still leaves room for a full window (negative => no windows)
    last_window_start = len( keys ) - window_size

    for window_start in range( last_window_start + 1 ):

        tweets_in_window = []

        for tweets_key in keys[ window_start : window_start + window_size ]:

            months_dates, months_tweets = _month_lists( tweets_key )

            if debug and verbose: print( "window_start {}, tweets_key {}, tweet count {}".format( window_start, tweets_key, len( months_tweets ) ) )

            # copy the cached lists so that windows never share mutable state
            tweets_in_window.append( ( list( months_dates ), list( months_tweets ) ) )

            # window complete?  store it *before* the early exit check so a just-finished window is kept
            if len( tweets_in_window ) == window_size:
                all_windows.append( tweets_in_window )
                if debug: print( "{}) Counter {}, starting new window at {}".format( 1 if window_start == 0 else 2, counter, window_start + 1 ) )

            # debuggery: allows us to test and bail early
            if early_exit and counter > early_exit_idx:
                return all_windows
            counter += 1

    return all_windows

# NOTE: deliberately run without the %time magic: prefixing this line with it started raising a
# SyntaxError ( first seen Thursday, December 6th @ 12:37 ) after having worked for days.
tweets_by_year = get_tweets_by_windows( window_size=12, debug=False )

In [27]:
def get_tweets_by_months( debug=False, grouped=None, keys=None ):

    """ Calculates and returns a monthly list of tweets w/ dates.

    Args:
        debug:   print the key and tweet count of every month as it is processed.
        grouped: groupby object that is looked up with `get_group( key )` and exposes the
                 "text" and "created_at" columns.  Defaults to the notebook-level `tweets_grp`.
        keys:    iterable of group keys, in the order the months should appear in the result.
                 Defaults to the notebook-level `tweets_grp_keys`.

    Returns:
        A list with one `( dates, tweets )` tuple per key, where `dates` comes from "created_at"
        and `tweets` from "text".
    """

    # fall back to the notebook-level grouping when the caller does not supply one
    if grouped is None: grouped = tweets_grp
    if keys is None: keys = tweets_grp_keys

    all_months = []

    for tweets_key in keys:

        # look the month up once and read both columns from the same sub-frame
        month_frame = grouped.get_group( tweets_key )
        months_tweets = month_frame[ "text" ].tolist()
        months_dates  = month_frame[ "created_at" ].tolist()

        if debug: print( "Tweets_key {}, tweet count {}".format( tweets_key, len( months_tweets ) ) )

        all_months.append( ( months_dates, months_tweets ) )

    return all_months

%time tweets_by_month = get_tweets_by_months( False )

CPU times: user 94.1 ms, sys: 4.01 ms, total: 98.1 ms
Wall time: 93.6 ms


In [11]:
# #one_window = tweets_by_year[ 0 ]
# one_month = tweets_by_month[ -1 ]
# # unzips list: https://stackoverflow.com/questions/12974474/how-to-unzip-a-list-of-tuples-into-individual-lists
# one_month_list = list( zip( *one_month ) )

In [12]:
# print( one_month_list[ 0 ][ -2 ] )
# print( one_month_list[ 1 ][ -2 ] )

### Write Both Lists to File

In [28]:
import pickle

# destinations for the two pickled collections
tweets_window_file_name = 'data/tweets-by-window.data'
tweets_month_file_name = 'data/tweets-by-month.data'

# rolling windows -> binary pickle
with open( tweets_window_file_name, mode='wb' ) as filehandle:
    pickle.dump( tweets_by_year, filehandle )

# per-month lists -> binary pickle
with open( tweets_month_file_name, mode='wb' ) as filehandle:
    pickle.dump( tweets_by_month, filehandle )
    

In [29]:
# clear both names first, so the lengths printed at the end reflect what was loaded from disk
tweets_by_year = tweets_by_month = None

# rolling windows <- binary pickle
with open( tweets_window_file_name, mode='rb' ) as filehandle:
    tweets_by_year = pickle.load( filehandle )

# per-month lists <- binary pickle
with open( tweets_month_file_name, mode='rb' ) as filehandle:
    tweets_by_month = pickle.load( filehandle )

# number of windows, then number of months
print( len( tweets_by_year ) )
print( len( tweets_by_month ) )

101
112
