In [1]:
import pandas as pd
import numpy as np

In [2]:
tweets = pd.read_csv( "data/tweets-clean.csv" )
tweets.head()

Unnamed: 0,text,created_at,year,month,day,hour,minute
0,Be sure to tune in and watch Donald Trump on L...,2009-05-04 18:54:25,2009,5,4,18,54
1,Donald Trump will be appearing on The View tom...,2009-05-05 01:00:10,2009,5,5,1,0
2,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 13:38:08,2009,5,8,13,38
3,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 20:40:15,2009,5,8,20,40
4,My persona will never be that of a wallflower ...,2009-05-12 14:07:28,2009,5,12,14,7


### Group Tweets by Year and Month

In [3]:
tweets_grp = tweets.groupby( [ "year", "month" ] )

### Get Grouped Tweets by Tuple Index ( year, month )

In [4]:
# get tweet and date by tuple index
# from: https://stackoverflow.com/questions/14734533/how-to-access-pandas-groupby-dataframe-by-key
tweets_grp.get_group( ( 2009, 5 ) )[ ["text", "created_at"] ].head()

Unnamed: 0,text,created_at
0,Be sure to tune in and watch Donald Trump on L...,2009-05-04 18:54:25
1,Donald Trump will be appearing on The View tom...,2009-05-05 01:00:10
2,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 13:38:08
3,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 20:40:15
4,My persona will never be that of a wallflower ...,2009-05-12 14:07:28


### Get Tweets for One Month As List

In [5]:
tweets_one_month = tweets_grp.get_group( ( 2009, 5 ) )[ "text" ].tolist()
tweets_one_month[ -1 ]

'You have to know when to call it quits and when to keep moving forward. --Donald J. Trump http://www.trumpthinklikeachampion.com'

### Get List of Keys (Tuples)

In [6]:
tweets_grp_keys = []

for year_month, grouped in tweets_grp:
    #year, month = year_month # get tuple as atomic values
    #print( year, month )
    tweets_grp_keys.append( year_month )
    
print( len( tweets_grp_keys ) )
print( tweets_grp_keys[ :7 ] )

112
[(2009, 5), (2009, 6), (2009, 7), (2009, 8), (2009, 9), (2009, 10), (2009, 11)]


### Get One Month of Tweets as List

In [7]:
one_month = tweets_grp.get_group( (2009, 5) )[ "text" ].tolist()
one_month[ 0 ]

'Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!'

### Iterate Grouped Tweets 12 Months at a Time

In [43]:
print( tweets_grp_keys[ 0 ] )
print( tweets_grp_keys[ 1 ] )
len( tweets_grp_keys )

(2009, 5)
(2009, 6)


112

In [47]:
stop_key = tweets_grp_keys[ -1 ]
stop_key

(2018, 8)

In [92]:
def get_tweets_by_windows( window_size=12, debug=False, verbose=False ):
    
    """ Calculates and returns a list of tweets w/ dates w/in a rolling 12 month window"""
    
    window_start = 0
    
    months_tweets = None
    months_dates  = None

    tweets_in_window = []
    all_windows = []
    #all_months = []
    zipped_tweets = None

    counter = 0
    month_counter = 0
    tweets_key = None
    # get last key
    stop_key = tweets_grp_keys[ -1 ]
    
    while tweets_key != stop_key: 

        # get previously calculated key from list
        tweets_key = tweets_grp_keys[ window_start + month_counter ]
        # get one months worth of tweets...
        months_tweets = tweets_grp.get_group( tweets_key )[ "text" ].tolist()
        # ...and dates
        months_dates  = tweets_grp.get_group( tweets_key )[ "created_at" ].tolist()

        if debug and verbose: print( "window_start {}, tweets_key {}, tweet count {}".format( window_start, tweets_key, len( months_tweets ) ) )

        # zip up tweets and dates for intermediate storage
        zipped_tweets = zip( months_dates, months_tweets )

        # append to 12 month window list...
        tweets_in_window.append( zipped_tweets )
        
        # have we finished one year's worth of months?
        # first case: our first window
        if counter == window_size - 1:
            
            window_start = 1
            month_counter = 0
            # append window to list of all windows
            all_windows.append( tweets_in_window )
            # reset window for next 12 months of tweets
            tweets_in_window = []
            if debug: print( "1) Counter {}, starting new window at {}".format( counter, window_start ) )
            
        elif ( counter > window_size and ( counter + 1 ) % window_size == 0 ):

            # reset the beginning of the window to 
            window_start = ( counter + 1 ) // window_size
            month_counter = 0

            # append window to list of all windows
            all_windows.append( tweets_in_window )
            # reset window for next 12 months of tweets
            tweets_in_window = []

            if debug: print( "2) Counter {}, starting new window at {}".format( counter, window_start ) )
        
        else:
            
            month_counter += 1
        
#         if counter > 49:
#             break
        counter += 1
        
        
    return all_windows

%time tweets_by_year = get_tweets_by_windows( debug=False )



CPU times: user 935 ms, sys: 11 ms, total: 946 ms
Wall time: 971 ms


In [98]:
def get_tweets_by_months( debug=False ):
    
    """ Calculates and returns a monthly list of tweets w/ dates"""
    
    #window_start = 0
    
    months_tweets = None
    months_dates  = None
    
    all_months = []
    zipped_tweets = None
    
    for tweets_key in tweets_grp_keys:

        # get one months worth of tweets...
        months_tweets = tweets_grp.get_group( tweets_key )[ "text" ].tolist()
        # ...and dates
        months_dates  = tweets_grp.get_group( tweets_key )[ "created_at" ].tolist()

        if debug: print( "Tweets_key {}, tweet count {}".format( tweets_key, len( months_tweets ) ) )

        # zip up tweets and dates for intermediate storage
        zipped_tweets = zip( months_dates, months_tweets )

        # ...and to a list of months
        all_months.append( zipped_tweets )
        
    return all_months

%time tweets_by_month = get_tweets_by_months( False )



CPU times: user 94.2 ms, sys: 6.47 ms, total: 101 ms
Wall time: 101 ms


In [89]:
last_year = tweets_by_year[ -1 ]
last_month = last_year[ -1 ]
list( zip( *last_month ) 
#dates, tweets = zip( *last_month )
#dates, tweets = zip( *last_year[ -1 ] )

SyntaxError: unexpected EOF while parsing (<ipython-input-89-52d4d6d5266c>, line 5)

In [99]:
#one_window = tweets_by_year[ 0 ]
one_month = tweets_by_month[ 0 ]
one_month_list = list( zip( *one_month ) )
one_month_list
# unzips list: https://stackoverflow.com/questions/12974474/how-to-unzip-a-list-of-tuples-into-individual-lists
# dates, months = zip( *one_month )
# dates

[('2009-05-04 18:54:25',
  '2009-05-05 01:00:10',
  '2009-05-08 13:38:08',
  '2009-05-08 20:40:15',
  '2009-05-12 14:07:28',
  '2009-05-12 19:21:55',
  '2009-05-13 17:38:28',
  '2009-05-14 16:30:40',
  '2009-05-15 14:13:13',
  '2009-05-16 22:22:45',
  '2009-05-17 15:00:03',
  '2009-05-18 14:26:00',
  '2009-05-19 17:43:39',
  '2009-05-20 13:25:39',
  '2009-05-20 22:29:47',
  '2009-05-22 02:59:39',
  '2009-05-22 16:28:34',
  '2009-05-23 16:11:19',
  '2009-05-26 14:42:01',
  '2009-05-27 14:18:52',
  '2009-05-28 18:03:34'),
 ('Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!',
  'Donald Trump will be appearing on The View tomorrow morning to discuss Celebrity Apprentice and his new book Think Like A Champion!',
  'Donald Trump reads Top Ten Financial Tips on Late Show with David Letterman: http://tinyurl.com/ooafwn - Very funny!',
  'New Blog Post: Celebrity Apprentice Finale and Lessons Learned Along the Way: http://tiny

In [104]:
print( one_month_list[ 0 ][ 0 ] )
print( one_month_list[ 1 ][ 0 ] )

2009-05-04 18:54:25
Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!


In [93]:
tweets_one_month = None

tweets_grp_keys = []
for year_month, grouped in tweets_grp:
    #year, month = year_month
    #print( year, month )
    tweets_grp_keys.append( year_month )
    
print( len( tweets_grp_keys ) )
print( type( tweets_grp ) )

print( tweets_grp_keys )

# for month, grouped in tweets_grp:
#     #print( grouped.head() )
#     print( month )
#     tweets_one_month = grouped[ [ "text", "created_at" ] ]
#     print( tweets_one_month.head() )
    
    
    
    

112
<class 'pandas.core.groupby.DataFrameGroupBy'>
[(2009, 5), (2009, 6), (2009, 7), (2009, 8), (2009, 9), (2009, 10), (2009, 11), (2009, 12), (2010, 1), (2010, 2), (2010, 3), (2010, 4), (2010, 5), (2010, 6), (2010, 7), (2010, 8), (2010, 9), (2010, 10), (2010, 11), (2010, 12), (2011, 1), (2011, 2), (2011, 3), (2011, 4), (2011, 5), (2011, 6), (2011, 7), (2011, 8), (2011, 9), (2011, 10), (2011, 11), (2011, 12), (2012, 1), (2012, 2), (2012, 3), (2012, 4), (2012, 5), (2012, 6), (2012, 7), (2012, 8), (2012, 9), (2012, 10), (2012, 11), (2012, 12), (2013, 1), (2013, 2), (2013, 3), (2013, 4), (2013, 5), (2013, 6), (2013, 7), (2013, 8), (2013, 9), (2013, 10), (2013, 11), (2013, 12), (2014, 1), (2014, 2), (2014, 3), (2014, 4), (2014, 5), (2014, 6), (2014, 7), (2014, 8), (2014, 9), (2014, 10), (2014, 11), (2014, 12), (2015, 1), (2015, 2), (2015, 3), (2015, 4), (2015, 5), (2015, 6), (2015, 7), (2015, 8), (2015, 9), (2015, 10), (2015, 11), (2015, 12), (2016, 1), (2016, 2), (2016, 3), (2016, 4), (20