Skip to content
Browse files

Adding changes to README and documentation.

  • Loading branch information...
1 parent 9bdfef5 commit da721f4cf874b66e553308bde17f3e4ce665b0b7 @dbalasuriya committed Jun 6, 2012
Showing with 73 additions and 6 deletions.
  1. +40 −0 README
  2. +28 −6 analyse_tweets.py
  3. +5 −0 load_tweets.py
View
40 README
@@ -0,0 +1,40 @@
+Data Analysis for the #takemymoneyHBO trend on Twitter
+=====================================================
+
+I was curious about what the average amount of money
+people would be willing to pay for a standalone
+HBOGO streaming service, so I wrote two small Python
+scripts that use the Twitter search API to retrieve
+the 1500 most recent tweets, store them in a Python
+shelf, and then analyse them to determine the average
+amount.
+
+There are limitations to this approach, since there are
+certainly more than 1500 tweets with this hashtag. I
+made the following decisions about how I handled the data:
+
+* RTs were ignored, because I'm interested in each person's
+ personal opinion.
+* I looked for the phrase 'pay $' in the tweet, and extracted
+ the number following the '$'. If there was no number following,
+ the tweet was ignored (e.g. some people tweeted statements
+ like 'I would pay $$$').
+* Money amounts >$50 were ignored, since some people tweeted
+ statements like 'I would pay $1000000'.
+
+Currently the script only returns the average amount and the
+number of data points available of the 1500 downloaded (after
+RTs, etc. are removed).
+
+
+Results
+=======
+
+Wednesday 5:10AM GMT/UTC +0:00 - $12.06, from 1063 data points.
+
+Remix This
+==========
+These (very simple) scripts are released under the CC-BY license,
+so download them, run them yourself, and modify them to extract
+more interesting data - e.g. draw graphs.
+
View
34 analyse_tweets.py
@@ -1,28 +1,50 @@
import shelve
import re
+'''
+Opens the python shelf created by load_tweets.py
+and extracts the money amount from each tweet, verifying
+that each tweet is not a RT, contains 'pay $x' in it,
+and is not a repeated data point. Amounts over $50 are
+ignored, because some tweets contain statements like
+'I would pay $100000'.
+
+RTs are ignored because we want each person's individual
+opinion.
+'''
+
+
+money_re = re.compile('pay \$([^ ]*) ')
+
def get_average(tweet_dict):
    '''
    Return the arithmetic mean of the values stored in tweet_dict.

    tweet_dict maps a key to a numeric money amount; only the values
    are used. An empty dictionary yields 0.0 rather than raising
    ZeroDivisionError, so the caller can still print a result when no
    tweets survived filtering.
    '''
    if not tweet_dict:
        return 0.0
    # .values() works on both Python 2 and Python 3 (itervalues() is
    # Python 2 only); float() forces true division even if every
    # amount happens to be an int.
    return sum(tweet_dict.values()) / float(len(tweet_dict))
analysis_dict = {}
-money_re = re.compile('pay \$([^ ]*) ')
-
shelf = shelve.open('hbotweets.dat')
-
for tweet in shelf['tweets']:
+ # check the tweet is not a RT, and contains 'pay $'
if 'RT' not in tweet['text'] and 'pay $' in tweet['text']:
money_text = money_re.findall(tweet['text'])
+ # try to extract the money amount, but discard
+ # any failures, since some tweets contain
+ # statements like 'I would pay $$$$'
try:
+ # we read the first money amount in the
+ # tweet; there should only be one.
money = float(money_text[0])
- if money < 100:
+
+ # disregard any money amounts >= $50
+ if money < 50:
+ # a dictionary is used to store the tweets
+ # with the tweet_id as a key, avoiding any
+ # duplicates in the search/loading process
analysis_dict[tweet['id']] = money
+
except ValueError:
continue
-print 'original points:'
-print len(shelf['tweets'])
print 'Data points:'
print len(analysis_dict)
print 'Average:'
View
5 load_tweets.py
@@ -2,6 +2,11 @@
import shelve
import requests
+'''
+Loads the most recent 1500 tweets matching the search result
+#takemymoneyHBO and stores them in a python shelf.
+'''
+
shelf = shelve.open('hbotweets.dat')
shelf['tweets'] = []

0 comments on commit da721f4

Please sign in to comment.
Something went wrong with that request. Please try again.