### Import and Clean Tweets

In [43]:
import pandas as pd
import numpy as np

In [40]:
# Read only the tweet text and its timestamp; other CSV columns are not loaded.
tweets = pd.read_csv(
    filepath_or_buffer="data/tweets.csv",
    usecols=[ "text", "created_at" ],
)
tweets.head()

Unnamed: 0,text,created_at
0,Be sure to tune in and watch Donald Trump on L...,05-04-2009 18:54:25
1,Donald Trump will be appearing on The View tom...,05-05-2009 01:00:10
2,Donald Trump reads Top Ten Financial Tips on L...,05-08-2009 13:38:08
3,New Blog Post: Celebrity Apprentice Finale and...,05-08-2009 20:40:15
4,My persona will never be that of a wallflower ...,05-12-2009 14:07:28


In [46]:
# Parse the timestamp, then derive year / month / day / hour columns as int32.
tweets.created_at = pd.to_datetime( tweets.created_at, infer_datetime_format=True, errors='coerce' )

# errors='coerce' turns unparseable timestamps into NaT. The .dt accessors
# yield NaN for those rows and astype( np.int32 ) raises on NaN, so rows
# without a usable timestamp have to go *before* the integer casts.
tweets.dropna( subset=[ "created_at" ], inplace=True )

# One pass per date part instead of four copy-pasted assign/cast pairs;
# column order (year, month, day, hour) is preserved.
for part in ( "year", "month", "day", "hour" ):
    tweets[ part ] = getattr( tweets.created_at.dt, part ).astype( np.int32 )

tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31632 entries, 0 to 31631
Data columns (total 6 columns):
text          31632 non-null object
created_at    31632 non-null datetime64[ns]
year          31632 non-null int32
month         31632 non-null int32
day           31632 non-null int32
hour          31632 non-null int32
dtypes: datetime64[ns](1), int32(4), object(1)
memory usage: 1.2+ MB


In [45]:
# Non-null values per column, as a baseline before incomplete rows are dropped.
tweets.count( axis=0 )

text          31632
created_at    31632
year          31632
month         31632
day           31632
hour          31632
dtype: int64

In [47]:
# Remove every row that has a missing value in any column (modifies `tweets` in place).
tweets.dropna( axis=0, how="any", inplace=True )

In [48]:
# Per-column non-null counts again, to see how many rows are left.
tweets.count( axis=0 )

text          31632
created_at    31632
year          31632
month         31632
day           31632
hour          31632
dtype: int64

In [75]:
# Per-column non-null counts.
# NOTE(review): the saved output of this cell lists a `date` column and 31629
# rows, which does not match the frame built in this notebook -- it looks like
# a leftover from an earlier run; re-run (or remove) this cell to refresh it.
tweets.count( axis=0 )

text     31629
date     31629
year     31629
month    31629
dtype: int64

In [27]:
# Number of tweets in each calendar year, most frequent year first.
tweets.year.value_counts()

2013    7173
2015    6641
2014    5340
2016    3784
2012    3190
2017    2601
2018    1940
2011     765
2010     142
2009      56
Name: year, dtype: int64

In [49]:
# Number of tweets posted in each hour of the day (0-23), busiest hour first.
tweets.hour.value_counts()

20    2586
19    2374
18    2023
21    1783
13    1767
16    1653
14    1646
12    1645
15    1614
2     1585
1     1495
17    1483
11    1458
0     1290
3     1268
22    1240
23    1174
10     946
4      713
9      474
5      470
8      348
6      324
7      273
Name: hour, dtype: int64

In [35]:
# Tweets per (year, month) pair, indexed by a year/month MultiIndex.
(
    tweets
    .groupby( [ 'year', 'month' ] )
    [ 'month' ]
    .count()
)

year  month
2009  5         21
      6         11
      7          5
      8          7
      9          3
      10         4
      11         3
      12         2
2010  1          4
      2          4
      3         10
      4         18
      5         17
      6         17
      7         13
      8         16
      9         13
      10         9
      11        11
      12        10
2011  1          9
      2         16
      3         36
      4          5
      5         13
      6         16
      7         73
      8         94
      9        110
      10       106
              ... 
2016  3        404
      4        251
      5        298
      6        234
      7        378
      8        299
      9        257
      10       467
      11       173
      12       138
2017  1        214
      2        154
      3        147
      4        152
      5        155
      6        212
      7        244
      8        267
      9        306
      10       290
      11       261


In [50]:
# The raw timestamp is no longer needed once year/month/day/hour exist,
# so remove it from the frame (in place) ahead of the CSV export.
tweets.drop( labels='created_at', axis='columns', inplace=True )

In [51]:
# Preview the first five rows of the cleaned frame.
tweets.head( n=5 )

Unnamed: 0,text,year,month,day,hour
0,Be sure to tune in and watch Donald Trump on L...,2009,5,4,18
1,Donald Trump will be appearing on The View tom...,2009,5,5,1
2,Donald Trump reads Top Ten Financial Tips on L...,2009,5,8,13
3,New Blog Post: Celebrity Apprentice Finale and...,2009,5,8,20
4,My persona will never be that of a wallflower ...,2009,5,12,14


In [52]:
# Persist the cleaned frame. The row index is written as the first (unnamed)
# column, so readers of this file should pass index_col=0 or drop that column.
tweets.to_csv( path_or_buf="data/tweets-clean.csv", index=True )