占领华尔街事件中的twitter传播特点

甄子锐

数据清洗

In [2]:
bigfile = open('/Users/abner/github/data/ows-raw.txt', 'rb')
chunkSize = 10000000
chunk = bigfile.readlines(chunkSize)
block = ''
with open("/Users/abner/github/data/ows-raw_clean.txt", 'w') as f:
    while chunk:
        blocks = []
        for line in chunk:
            block += line.replace('\n', '')
            block_length = len(block.split(','))
            if block_length >= 14:
                blocks.append(block)
                block = ''
        for i in blocks:
            f.write(i + '\n')
        chunk = bigfile.readlines(chunkSize)
In [3]:
from collections import defaultdict
import csv

data_dict = defaultdict(list)

error_num = 0
line_num = 0
total_num = 0

bigfile = open('/Users/abner/github/data/ows-raw_clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00','') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num +=1
        try:
            date = i[3]
            if len(date) == 10:#正确的行被计入
                data_dict[date].append(i[8])
            else:
                error_num+=1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)
print line_num, total_num, error_num
6602141 6602186 21
In [4]:
" , ".join(data_dict.keys())
Out[4]:
'2012-01-23 , 2011-10-25 , 2011-10-24 , 2011-12-10 , 2011-12-11 , 2011-12-12 , 2011-12-13 , 2011-12-14 , 2011-12-15 , 2011-12-16 , 2011-12-17 , 2011-12-18 , 2011-12-19 , 2012-02-16 , 2012-02-17 , 2012-02-14 , 2012-02-15 , 2012-02-12 , 2012-02-13 , 2012-02-10 , 2012-02-11 , 2012-02-18 , 2011-12-07 , 2011-12-06 , 2011-12-05 , 2011-12-04 , 2011-12-03 , 2011-12-02 , 2011-12-01 , 2011-11-20 , 2011-11-21 , 2011-11-22 , 2011-11-23 , 2011-11-24 , 2011-11-25 , 2011-12-09 , 2011-12-08 , 2012-02-05 , 2012-02-04 , 2012-02-07 , 2012-02-06 , 2012-02-01 , 2012-02-03 , 2012-02-02 , 2012-02-09 , 2012-02-08 , 2011-11-11 , 2011-10-31 , 2011-11-28 , 2012-01-28 , 2011-11-29 , 2012-01-26 , 2012-01-27 , 2012-01-24 , 2012-01-25 , 2012-01-22 , 2011-10-29 , 2012-01-20 , 2012-01-21 , 2011-10-28 , 2011-11-30 , 2011-10-23 , 2011-10-22 , 2011-10-21 , 2011-10-20 , 2011-10-27 , 2012-01-31 , 2012-01-30 , 2011-10-26 , 2011-11-26 , 2011-11-27 , 2012-01-01 , 2012-01-02 , 2012-01-03 , 2012-01-04 , 2012-01-05 , 2012-01-06 , 2012-01-07 , 2012-01-08 , 2012-01-09 , 2011-11-01 , 2011-10-12 , 2011-10-13 , 2011-10-10 , 2011-10-11 , 2011-10-16 , 2011-10-17 , 2011-10-14 , 2011-10-15 , 2011-10-18 , 2011-10-19 , 2012-01-17 , 2012-01-16 , 2012-01-15 , 2012-01-14 , 2012-01-13 , 2012-01-12 , 2012-01-11 , 2012-01-10 , 2012-01-19 , 2012-01-18 , 2011-11-06 , 2011-11-07 , 2011-11-04 , 2011-11-05 , 2011-11-02 , 2011-11-03 , 2011-10-07 , 2011-10-06 , 2011-10-09 , 2011-10-08 , 2011-11-08 , 2011-11-09 , 2011-12-30 , 2011-12-31 , 2011-11-15 , 2011-11-14 , 2011-11-17 , 2011-11-16 , 2011-10-30 , 2011-11-10 , 2011-11-13 , 2011-11-12 , 2011-11-19 , 2011-11-18 , 2012-01-29 , 2011-12-29 , 2011-12-28 , 2011-12-25 , 2011-12-24 , 2011-12-27 , 2011-12-26 , 2011-12-21 , 2011-12-20 , 2011-12-23 , 2011-12-22'

每天teets的数量

In [6]:
import pandas as pd

data = [[i, len(data_dict[i]), len(set(data_dict[i]))] for i in data_dict]
dat = pd.DataFrame(data, columns = ['date', 'tweets', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort(['date'])
C:\Users\abner\Anaconda2\lib\site-packages\ipykernel\__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
In [7]:
import numpy as np
np.sum(dat.tweets)
Out[7]:
6602120L
In [8]:
dat.describe()
Out[8]:
tweets users
count 136.000000 136.000000
mean 48545.000000 16800.360294
std 51250.621113 14859.205183
min 4859.000000 2012.000000
25% 15803.750000 5945.500000
50% 37204.500000 13153.000000
75% 64630.250000 23297.250000
max 409075.000000 98934.000000
In [9]:
%matplotlib inline
import matplotlib.cm as cm
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(15, 4),facecolor='white')

plt.plot(dat.date, dat.tweets, 'r-o', label = "Tweets")
plt.plot(dat.date, dat.users, 'g-^', label = "Users")
plt.legend(loc=2,fontsize=8)
plt.yscale('log')
plt.show()
In [10]:
dat
Out[10]:
date tweets users
108 2011-10-06 49638 18487
107 2011-10-07 65238 23460
110 2011-10-08 65949 23243
109 2011-10-09 65097 22576
83 2011-10-10 78619 27499
84 2011-10-11 67596 22955
81 2011-10-12 67753 25535
82 2011-10-13 98954 39200
87 2011-10-14 148062 47666
88 2011-10-15 161802 48951
85 2011-10-16 165381 51602
86 2011-10-17 113628 41525
89 2011-10-18 103337 37394
90 2011-10-19 88874 34990
64 2011-10-20 73159 29254
63 2011-10-21 75256 27029
62 2011-10-22 70368 24213
61 2011-10-23 64808 22378
2 2011-10-24 64571 24268
1 2011-10-25 74641 27084
68 2011-10-26 141729 46424
65 2011-10-27 137172 43368
59 2011-10-28 87216 31215
56 2011-10-29 75741 25319
119 2011-10-30 77813 24979
47 2011-10-31 64256 25911
80 2011-11-01 62286 23962
105 2011-11-02 74554 26559
106 2011-11-03 92698 32405
103 2011-11-04 63529 24288
... ... ... ...
57 2012-01-20 19840 8133
58 2012-01-21 15459 5514
55 2012-01-22 11948 4502
0 2012-01-23 14896 5989
53 2012-01-24 14290 5360
54 2012-01-25 18812 8440
51 2012-01-26 15159 5938
52 2012-01-27 11992 4758
49 2012-01-28 11530 4243
125 2012-01-29 27219 8856
67 2012-01-30 27198 9038
66 2012-01-31 18682 7120
41 2012-02-01 16578 5958
43 2012-02-02 14995 5452
42 2012-02-03 14371 5087
38 2012-02-04 16591 5443
37 2012-02-05 13656 4603
40 2012-02-06 14446 5201
39 2012-02-07 14940 5244
45 2012-02-08 13451 5099
44 2012-02-09 13944 5156
19 2012-02-10 14083 4843
20 2012-02-11 12920 4616
17 2012-02-12 11178 4131
18 2012-02-13 13600 4799
15 2012-02-14 13580 4669
16 2012-02-15 13248 4895
13 2012-02-16 12837 4428
14 2012-02-17 12468 4299
21 2012-02-18 4859 2012

136 rows × 3 columns

提取某日数据集

ows_data=[] date='2011-10-13' for line in chunk[1:]: div_line = line.replace('\n', ' ') lines = div_line.split(',') if lines[3] == date: ows_data.append(line)

In [13]:
len(ows_data)
Out[13]:
0
In [14]:
with open("/Users/abner/github/data/ows_20111013.txt", 'w') as f:
    for i in ows_data:
        f.write(i + '\n')
In [15]:
#情感分析
In [16]:
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
In [17]:
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      all_words.extend(words)
    return all_words
In [18]:
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features
In [19]:
def read_tweets(fname, t_type):
    tweets = []
    f = open(fname, 'r')
    line = f.readline()
    while line != '':
        tweets.append([line, t_type])
        line = f.readline()
    f.close()
    return tweets
In [20]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features
In [21]:
def classify_tweet(tweet):
    return classifier.classify(extract_features(nltk.word_tokenize(tweet)))
In [22]:
# read in postive and negative training tweets
pos_tweets = read_tweets('/Users/abner/github/data/happy.txt', 'positive')
neg_tweets = read_tweets('/Users/abner/github/data/sad.txt', 'negative')
In [23]:
# 过滤词
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))
In [24]:
# extract the word features out from the training data
word_features = get_word_features(get_words_in_tweets(tweets))
In [25]:
# get the training set and train the Naive Bayes Classifier
training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)
In [26]:
# to add your own test tweets, add them in the respective files
test_tweets = read_tweets('/Users/abner/github/data/happy_test.txt', 'positive')
test_tweets.extend(read_tweets('/Users/abner/github/data/sad_test.txt', 'negative'))
total = accuracy = float(len(test_tweets))
for i in test_tweets[:5]:
    print i[0]
I'm happy for him...really, I am. She's an amazing girl, and they deserve each other. He's happy & thats all that matters...right?.....

Feel so happy with no reason... Just happy... Hey my brain, am I missing something? :))

We finished our first season of @TheBEATDance & I am so happy & proud & thankful & overwhelmed & lots of other good stuff! So Amazing #2013

am i allowed to be happy about something, or do yo wanna distroy the little i have left?

I am so happy right now I can't even focus on anything else

In [27]:
for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1
In [28]:
print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))
Total accuracy: 90.000000% (18/20).
In [29]:
tweet_emotion=[]
In [30]:
for tweet in blocks:
    tweet_emotion.append(classify_tweet(unicode(tweet, "utf8")))
print tweet_emotion[:5]
print len(tweet_emotion)
['positive', 'positive', 'negative', 'negative', 'positive']
8919
In [31]:
count1 = {}#定义字典,存放时间和每天的数量
for r in tweet_emotion:
    count1[r] = count1.get(r, 0) + 1
print(count1)
{'positive': 3068, 'negative': 5851}
In [32]:
import matplotlib.pyplot as plt
labels = 'positive', 'negative'
sizes = [3068, 5851]
colors = ['red', 'lightgreen']


plt.pie(sizes, labels=labels, colors=colors,shadow=True, startangle=90)
plt.show()