forked from ptwobrussell/Mining-the-Social-Web
/
facebook__tag_cloud.py
87 lines (62 loc) · 2.49 KB
/
facebook__tag_cloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
import os
import sys
import urllib2
import json
import webbrowser
import nltk
from cgi import escape
from facebook__login import login
try:
ACCESS_TOKEN = open('out/facebook.access_token').read()
except IOError, e:
try:
# If you pass in the access token from the Facebook app as a command line
# parameter, be sure to wrap it in single quotes so that the shell
# doesn't interpret any characters in it. You may also need to escape the # character
ACCESS_TOKEN = sys.argv[1]
except IndexError, e:
print >> sys.stderr, \
"Could not either find access token in 'facebook.access_token' or parse args. Logging in..."
ACCESS_TOKEN = login()
BASE_URL = 'https://graph.facebook.com/me/home?access_token='
HTML_TEMPLATE = '../web_code/wp_cumulus/tagcloud_template.html'
OUT_FILE = 'out/facebook.tag_cloud.html'
NUM_PAGES = 5
MIN_FREQUENCY = 3
MIN_FONT_SIZE = 3
MAX_FONT_SIZE = 20
# Loop through the pages of connection data and build up messages
url = BASE_URL + ACCESS_TOKEN
messages = []
current_page = 0
while current_page < NUM_PAGES:
data = json.loads(urllib2.urlopen(url).read())
messages += [d['message'] for d in data['data'] if d.get('message')]
current_page += 1
url = data['paging']['next']
# Compute frequency distribution for the terms
fdist = nltk.FreqDist([term for m in messages for term in m.split()])
# Customize a list of stop words as needed
stop_words = nltk.corpus.stopwords.words('english')
stop_words += ['&', '.', '?', '!']
# Create output for the WP-Cumulus tag cloud and sort terms by freq along the way
raw_output = sorted([[escape(term), '', freq] for (term, freq) in fdist.items()
if freq > MIN_FREQUENCY and term not in stop_words],
key=lambda x: x[2])
# Implementation adapted from
# http://help.com/post/383276-anyone-knows-the-formula-for-font-s
min_freq = raw_output[0][2]
max_freq = raw_output[-1][2]
def weightTermByFreq(f):
return (f - min_freq) * (MAX_FONT_SIZE - MIN_FONT_SIZE) / (max_freq
- min_freq) + MIN_FONT_SIZE
weighted_output = [[i[0], i[1], weightTermByFreq(i[2])] for i in raw_output]
# Substitute the JSON data structure into the template
html_page = open(HTML_TEMPLATE).read() % (json.dumps(weighted_output), )
f = open(OUT_FILE, 'w')
f.write(html_page)
f.close()
print 'Date file written to: %s' % f.name
# Open up the web page in your browser
webbrowser.open('file://' + os.path.join(os.getcwd(), OUT_FILE))