OWS tweets

Computational communication research is an important branch of computational social science. It focuses on the computational foundations of human communication behavior, using communication network analysis, communication text mining, and data science as its main analytical tools to collect and analyze human communication behavior data at scale (in a non-intrusive way), to uncover the patterns and regularities behind human communication behavior, and to analyze the generative mechanisms and basic principles behind those patterns. It can be widely applied in scenarios such as data journalism and computational advertising, and it emphasizes training in programming, mathematical modeling, and computational thinking.

Python script

import sys

def flushPrint(s):
    # rewrite the current console line in place, to report progress without flooding the output
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
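
flushPrint is used throughout this page to report progress inside long loops. A minimal usage sketch (the loop bound and reporting interval are arbitrary):

for line_num in range(100000):
    if line_num % 10000 == 0:
        flushPrint(line_num)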

Data cleaning

Count each term's daily number of occurrences

import re
import twitter_text #pip install twitter-text-py
import csv
import sys
 
# flushPrint is defined in the script above
     
def extract_rt_user(tweet):
    # match "RT @user" or "via @user" and return the retweeted user name
    rt_patterns = re.compile(r"(RT|via)((?:\b\W*@\w+)+)", re.IGNORECASE)
    try:
        rt_user_name = rt_patterns.findall(tweet)[0][1].strip(' @')
        return rt_user_name
    except IndexError:
        pass

def cleanTweet(tweet):
    """Strip mentions, URLs, and symbols; return a list of lowercased words."""
    tweet = tweet.decode('utf-8').strip()
    rt_name = extract_rt_user(tweet)
    ex = twitter_text.Extractor(tweet)
    at_names = ex.extract_mentioned_screen_names()
    urls = ex.extract_urls()
    hashtags = ex.extract_hashtags()
    for ia in at_names:
        tweet = tweet.replace(ia, '')
    for j in urls:
        tweet = tweet.replace(j, '')
    tweet = tweet.replace('RT @', '').replace('@', '').replace('"', '').replace('#', '')
    seps = ['(', ')', '!', ':', '.', '?', ',', '=', u'\xa0', '/', '\\', '\n', '-', '|', ';', u'&amp', '*', "'", '+']
    for s in seps:
        tweet = tweet.replace(s, ' ')
    tweet = tweet.split(' ')
    tweet = [t.lower() for t in tweet if t != '']
    return tweet
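
A quick sanity check of cleanTweet on an invented tweet (the example text and the expected output are illustrative, assuming twitter-text-py extracts the mention and the URL as shown):

print cleanTweet('RT @OccupyWallSt: We are the 99% http://t.co/abc123 #ows')
# expected: ['we', 'are', 'the', '99%', 'ows']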

from collections import defaultdict

data_dict = defaultdict(lambda: defaultdict(int))
error_num = 0
line_num = 0
total_num = 0

bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    # strip NUL bytes so csv.reader does not choke on malformed lines
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            tweet = i[1]
            if len(date) == 10:
                tweet = cleanTweet(tweet)
                for tt in tweet:
                    data_dict[tt][date] += 1
            else:
                error_num += 1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)
print line_num, total_num, error_num

import json
with open('/Users/chengjun/百度云同步盘/Writing/OWS/term_vectors.json', 'w') as f:
    json.dump(data_dict, f)
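
To verify the dump, the JSON file can be reloaded and the daily counts of a single term inspected. A minimal sketch ('ows' is an illustrative key; any term in data_dict works):

import json

with open('/Users/chengjun/百度云同步盘/Writing/OWS/term_vectors.json') as f:
    term_vectors = json.load(f)
# daily counts for one term, sorted by date
for date, num in sorted(term_vectors['ows'].items()):
    print date, num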

Extract the daily occurrence counts of @-mentions, retweets, URLs, hashtags, and users

import re
import twitter_text #pip install twitter-text-py
import csv
import sys

# flushPrint and extract_rt_user are defined in the scripts above

from collections import defaultdict
import csv

bigfile = open('/Users/chengjun/百度云同步盘/Writing/OWS/ows-clean.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)

# format: data[date][attribute] = int
at_dict = defaultdict(lambda: defaultdict(int))
rt_dict = defaultdict(lambda: defaultdict(int))
url_dict = defaultdict(lambda: defaultdict(int))
tag_dict = defaultdict(lambda: defaultdict(int))
user_dict = defaultdict(lambda: defaultdict(int))
error_num = 0
line_num = 0
while chunk:
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 10000 == 0:
            flushPrint(line_num)
        try:
            date = i[3]
            user = i[8]  # from_user_id
            tweet = i[1]
            ex = twitter_text.Extractor(tweet)
            at_names = ex.extract_mentioned_screen_names()
            urls = ex.extract_urls()
            hashtags = ex.extract_hashtags()
            rt_user = extract_rt_user(tweet)
            if len(date) == 10:
                if at_names:
                    for at_name in at_names:
                        at_dict[date][at_name] += 1
                if rt_user:
                    rt_dict[date][rt_user] += 1
                if urls:
                    for url in urls:
                        url_dict[date][url] += 1
                if hashtags:
                    for tag in hashtags:
                        tag_dict[date][tag] += 1
                user_dict[date][user] += 1
            else:
                error_num += 1
        except Exception, e:
            print e
            pass
    chunk = bigfile.readlines(chunkSize)
print line_num, error_num

import json
with open('/Users/chengjun/百度云同步盘/Writing/OWS/at_dict.json', 'w') as f:
    json.dump(at_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/rt_dict.json', 'w') as f:
    json.dump(rt_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/url_dict.json', 'w') as f:
    json.dump(url_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/tag_dict.json', 'w') as f:
    json.dump(tag_dict, f)
with open('/Users/chengjun/百度云同步盘/Writing/OWS/user_dict.json', 'w') as f:
    json.dump(user_dict, f)
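
As a quick check, the saved dictionaries can be reloaded to rank, for example, the most-used hashtags on a given day. A minimal sketch (2011-10-06 is one of the dates in the table further down this page):

import json

with open('/Users/chengjun/百度云同步盘/Writing/OWS/tag_dict.json') as f:
    tag_dict = json.load(f)
# ten most frequent hashtags on 2011-10-06
day = tag_dict['2011-10-06']
for tag, num in sorted(day.items(), key=lambda x: x[1], reverse=True)[:10]:
    print tag, num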


Count the daily numbers of tweets and users

Locate the date field in each tweet, read out all dates and users, and keep track of the number of lines read, the total number of lines, and the number of bad lines.

from collections import defaultdict
import csv
 
data_dict = defaultdict(list)
 
error_num = 0
line_num = 0
total_num = 0
 
bigfile = open('D:/Data/ows/ows-raw.txt', 'rb')
chunkSize = 100000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num +=1
        try:
            date = i[3]
            if len(date) == 10:
                data_dict[date].append(i[8])
            else:
                error_num+=1
        except:
            pass
    chunk = bigfile.readlines(chunkSize)
print line_num, total_num, error_num

Output in "date-tweets-users" format

import pandas as pd
 
data = [[i, len(data_dict[i]), len(set(data_dict[i]))] for i in data_dict]
dat = pd.DataFrame(data, columns = ['date', 'tweets', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort_values(['date', 'tweets', 'users'])
print dat

Sample output:

           date  tweets  users
108  2011-10-06   49638  18487
107  2011-10-07   65238  23460
110  2011-10-08   65949  23243
..          ...     ...    ...
13   2012-02-16   12837   4428
14   2012-02-17   12468   4299
21   2012-02-18    4859   2012

[136 rows x 3 columns]

Plot the daily numbers of tweets and users as a line chart

Fitting the relationship between the daily numbers of users and tweets shows that it follows a power law.
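
A minimal sketch of such a fit, regressing log(tweets) on log(users) with OLS in the same way as the urls-users fit further down this page (assuming the dat data frame built above):

import numpy as np
import statsmodels.api as sm

x = np.log(dat.users)
y = np.log(dat.tweets)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y, xx).fit()
constant, beta = res.params
print beta, res.rsquared  # beta is the estimated power-law exponent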

Extract the URLs from the tweets

Extract the URL part from the tweets file, and count the number of lines read, the total number of lines in the file, and the number of bad lines.

from collections import defaultdict
import csv
import re

# flushPrint is defined in the script above

data_dict = defaultdict(list)
error_num = 0
line_num = 0
total_num = 0

# compile the URL pattern once, outside the loop
url_patterns = re.compile(r"http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+", re.IGNORECASE)

bigfile = open('D:/Data/ows-raw.txt', 'rb')
chunkSize = 10000000
chunk = bigfile.readlines(chunkSize)
while chunk:
    total_num += len(chunk)
    lines = csv.reader((line.replace('\x00', '') for line in chunk), delimiter=',', quotechar='"')
    for i in lines:
        line_num += 1
        if line_num % 1000000 == 0:
            flushPrint(line_num)
        try:
            urls = url_patterns.findall(i[1])
            date = i[3]
            for ui in urls:
                if len(date) == 10:
                    data_dict[date].append(ui)
        except Exception, e:
            print e
            error_num += 1
            pass
    chunk = bigfile.readlines(chunkSize)
print line_num, total_num, error_num

Build a data frame in date-urls-users format

import pandas as pd

data = [[d, len(data_dict[d]), len(set(data_dict[d]))] for d in data_dict]  # per day: all urls, distinct urls
dat = pd.DataFrame(data, columns = ['date', 'urls', 'users'])
dat.date = pd.to_datetime(dat.date)
dat = dat.sort_values(['date', 'urls', 'users'])
print dat

Plot line charts of the daily frequencies of users and urls with date on the x-axis; the fluctuations of the two series are positively correlated.

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

fig = plt.figure(figsize=(15, 4), facecolor='white')
plt.plot(dat.date, dat.urls, 'r-o', label = "urls")
plt.plot(dat.date, dat.users, 'g-o', label = "users")
plt.legend(loc=2, fontsize=8)
# plt.yscale('log')
plt.show()

Fit the daily counts of urls against users to check for a power law; the result shows that the two follow a power-law relationship.

import statsmodels.api as sm
 
x = np.log(dat.users)
y = np.log(dat.urls)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y,xx).fit()
constant,beta = res.params
r2 = res.rsquared
fig = plt.figure(figsize=(8, 4),facecolor='white')
plt.plot(dat.users, dat.urls, 'rs', label= 'Data')
plt.plot(np.exp(x), np.exp(constant + x*beta),"-", label = 'Fit')
plt.yscale('log');plt.xscale('log')
plt.xlabel(r'$Users$')
plt.ylabel(r'$Urls$')
plt.text(max(dat.users)/4,max(dat.urls)/20,
         r'$\beta$ = ' + str(round(beta,2)) +'\t' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2,fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()

BSTS

BSTS code for Occupy Central

setwd("/Users/chengjun/bigdata/")
oc = read.csv("./occupycentral_wordfreq.csv", sep = ",",
              header = F, stringsAsFactors = F,  encoding= "utf-8")

oc15 = read.csv("./occupycentral_wordfreq_tfidf.csv", sep = ",",
              header = F, stringsAsFactors = F )

query = read.csv("./occupycentralgoogletrends.csv", sep = ",",
              header = T, stringsAsFactors = F,  encoding= "utf-8")

query = query[1:27,]
names(query)
data = data.frame( t(oc[,2:28]) )
data$queryf = log(query$fanti+1)
data$queryj = log(query$jianti+1)
data$querye = log(query$occupy.central+1)
queryb = c(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 67.1245791245791, 2.097643097643072,
                54.538720538720554, 2.097643097643072, 0.0, 0.0, 0.0,
                10.488215488215474, 0.0, 31.464646464646478, 0.0,
                10.488215488215417, 4.195286195286144, 31.464646464646421,
                199.27609427609428, 125.85858585858585, 12.585858585858489,
                2.097643097643072, 0.0, 0.0, 0.0)
data$queryb = log(queryb +1)

data$y = c(1.        ,   8.33333333,  31.74193548,  34.72413793,
           41.70833333,  50.06666667,  52.35483871,  33.80645161,
           43.46666667,  36.53333333,  29.36666667,  30.03225806,
           34.29032258,  31.21428571,  33.93548387,  34.96666667,
           60.16129032,  22.62068966,  59.06451613,  27.51612903,
           16.8       ,  55.93548387,  32.53333333,  77.29032258,
           70.35483871,  38.78571429,  39.89285714)

name =oc[,1]
cat(name)

require(zoo)

date = c('2013-01-01', '2013-02-01','2013-03-01','2013-04-01',
        '2013-05-01','2013-06-01', '2013-07-01','2013-08-01',
        '2013-09-01','2013-10-01','2013-11-01','2013-12-01',
        '2014-01-01','2014-02-01', '2014-03-01','2014-04-01',
        '2014-05-01','2014-06-01','2014-07-01','2014-08-01',
        '2014-09-01','2014-10-01','2014-11-01','2014-12-01',
        '2015-01-01','2015-02-01','2015-03-01')

date = as.Date(strptime(date, "%Y-%m-%d"))


dt = zoo(data, date)

# par(mfrow=c(3, 2))
plot(dt[,121:125], main = '')
matplot(scale(dt[,121:125]), type = "l", lwd = 2, main = '', ylab = "value")
legend(10, 3, c('Google Fanti', 'Google Jianti', 'Google English', "Baidu Jianti", 'News'),
       col = 1:5, lty = 1:5, cex = 1, ncol = 1, lwd = 2)
cor(dt[,121:125])
# clustering
mydata = data.frame( oc15[,2:28] )
name = oc15[,1]
mydata <- scale(t(mydata))  # standardize variables
class(mydata)
# Determine number of clusters
wss <- (nrow(mydata)-1)*sum(apply(mydata, 2, var))
for (i in 2:30) wss[i] <- sum(kmeans(mydata, centers=i)$withinss)
plot(1:30, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares")
# Ward Hierarchical Clustering
d <- dist(mydata, method = "euclidean")  # distance matrix
fit <- hclust(d, method = "ward.D")
plot(fit)  # display dendrogram
groups <- cutree(fit, k = 5)  # cut tree into 5 clusters
# draw dendrogram with red borders around the 5 clusters
rect.hclust(fit, k = 5, border = "red")

printGroup = function(n){
    for (i in which(groups == n)){
        cat(name[i], ' , ')
    }
}
printGroup(1)
for (i in 1:15){
    cat(i, name[i], sep = "-->")
}
# Output of printGroup(1):
# 反对派 , 和平 , 警方 , 团体 , 行政长官 , 意见 , 学生 , 大学 , 运动 , 国家 ,
# 内地 , 违法 , 选举 , 委会 , 提名 , 政策 , 人民 , 批评 , 事件 , 中国 , 本港 ,
# 梁振英 , 游行 , 戴耀廷 , 法治 , 组织 , 利益 , 自由 , 我们 , 委员会 , 争取 ,
# 法律 , 记者 , 示威 , 个人 , 活动 , 发起人 , 特区政府 , 基本法 , 国际 , 经济 , 传媒 , 民意 ,

printGroup(2)
# 立法会 , 方案 , 港人 , 公民 , 会议 , 特首 , 普选 , 占领 , 行动 , 政治 , 民主 , 行政 ,
# 代表 , 政改 , 反对 , 中环 , 党 , 政府 , 主席 , 社会 , 中央 , 市民 , 议员 ,

printGroup(3)
# 中央政府 , 抗命 , 官员 , 激进 , 安全 , 港独 , 制度 , 年轻人 , 民主派 , 程序 ,
# 政党 , 台湾 , 美国 , 生活 , 商讨 , 公众 , 建制 , 教育 , 抗争 , 政制 , 共识 ,
# 繁荣 , 青年 , 投资 , 集会 , 诉求 , 建议 , 规定 , 地方 , 学者 , 理性 , 市场 ,
# 民主党 , 台独 , 暴力 , 言论 , 泛民 , 认同 , 全国人大常委会 , 北京 , 尊重 ,
# 质疑 , 英国 , 公司 , 一国两制 ,
# 历史 , 投票 , 示威者 , 候选人 , 爱港 , 对抗 , 合作 , 爱国 , 调查 , 公投 ,

printGroup(4)
# 香港 , 占 ,
# bsts of news
library(bsts)
# dt$poll = (lowess(dt$poll, f = .03))$y
plot(date, dt$y, 'l', col = 'red')
ss0 <- AddLocalLevel(list(), dt$y)
ss1 <- AddSeasonal(ss0, y = dt$y, nseasons = 9, season.duration = 3)  # 27
trend.model <- bsts(dt$y, ss0, niter = 1000, bma.method = c("ODA"), seed = 1)
trend.seasonal.model <- bsts(dt$y, ss1, niter = 1000, bma.method = c("ODA"), seed = 1)
model <- bsts(y ~ ., data = dt, niter = 1000, state.specification = ss0,
              expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model1 <- bsts(y ~ dt[,120], data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model2 <- bsts(y ~ dt[,120]+dt[,115], data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model3 <- bsts(y ~ dt[,120]+dt[,115]+dt[,113], data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model4 <- bsts(y ~ dt[,120]+dt[,115]+dt[,113]+dt[,110], data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model5 <- bsts(y ~ dt[,120]+dt[,115]+dt[,113]+dt[,110]+dt[,109], data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
model6 <- bsts(y ~ dt[,120]+dt[,115]+dt[,113]+dt[,110]+dt[,109]+dt[,107], data = dt, niter = 1000, state.specification = ss0,
               expected.model.size = 3, bma.method = c("ODA"), seed = 1)
# > print_names(var_names)
# 120, 民意, 115, 调查, 113, 合作, 110, 市民, 109, 基本法, 107, 特区政府, 103, 活动,
# 102, 示威者, 100, 示威, 99, 社会, 98, 记者, 94, 一国两制, 92, 法律, 91, 争取, 89, 英国,
# 87, 党, 86, 尊重, 82, 全国人大常委会, 80, 反对, 76, 组织, 74, 暴力, 71, 戴耀廷, 68, 行政,
# 64, 梁振英, 60, 占, 57, 事件, 52, 政策, 49, 诉求, 48, 集会, 45, 青年, 41, 政制, 39, 占领,
# 37, 普选, 35, 内地, 33, 建制, 29, 运动, 25, 特首, 24, 学生, 20, 行政长官, 18, 团体,
# 14, 年轻人, 13, 制度, 11, 警方, 10, 安全, 8, 港人, 6, 激进, 5, 官员, 2, 立法会, 1, 反对派, 65, 理性,

CompareBstsModels(list(trend = trend.model,
                       # "trend and seasonal" = trend.seasonal.model,
                       "model1" = model1, "model2" = model2, "model3" = model3,
                       "model4" = model4, "model5" = model5, "model6" = model6,
                       "all" = model),
                  xlab = "")

scope = c(min(dt$y), max(dt$y))
r.square = function(model) as.character(round(summary(model)$relative.gof, 3))
par(mfrow = c(4, 2))
par(mar = c(rep(3, 4)))
plot(trend.model, ylim = scope,
     main = paste('Trend (relative.gof = ', r.square(trend.model), ')'),
     xlab = "", ylab = "Poll")
# plot(trend.seasonal.model, ylim = scope, main = paste('add Seasonal (relative.gof = ', r.square(trend.seasonal.model), ')'), xlab = "", ylab = "Poll")
plot(model1, ylim = scope, main = paste('model1 (relative.gof = ', r.square(model1), ')'), xlab = "", ylab = "Poll")
plot(model2, ylim = scope, main = paste('model2 (relative.gof = ', r.square(model2), ')'), xlab = "", ylab = "Poll")
plot(model3, ylim = scope, main = paste('model3 (relative.gof = ', r.square(model3), ')'), xlab = "", ylab = "Poll")
plot(model4, ylim = scope, main = paste('model4 (relative.gof = ', r.square(model4), ')'), xlab = "", ylab = "Poll")
plot(model5, ylim = scope, main = paste('model5 (relative.gof = ', r.square(model5), ')'), xlab = "", ylab = "Poll")
plot(model6, ylim = scope, main = paste('model6 (relative.gof = ', r.square(model6), ')'), xlab = "", ylab = "Poll")
plot(model, ylim = scope, main = paste('all (relative.gof = ', r.square(model), ')'), xlab = "", ylab = "Poll")
# pred <- predict(model, horizon = 12, burn = 100)
# plot(pred)
model <- bsts(y ~ dt[,274] + dt[,272], data = dt, niter = 1000, state.specification = ss1,
              expected.model.size = 3, bma.method = c("ODA"), seed = 1)
summary(model)
par(mfrow = c(1, 1))
plot(model)
# abline(h = 0, col = 'red')
plot(model, "components")
plot(model, "coefficients")
plot(model, "size")
plot(model, "predictors")
plot(model1, 'state')
plot(trend.model, 'state')

print_names = function(var_names){
    for (i in var_names){
        i = strsplit(i, "X")[[1]][2]
        i = as.numeric(i)
        # cat(i, "", sep = ",")
        cat(i, name[i], " ", sep = "-->")
    }
}

var_names = rownames(summary(model)$coefficients)[1:50]
print_names(var_names)
# 120,115,113,110,109,107,103,102,100,99,98,94,92,91,89,87,86,
# 82,80,76,74,71,68,64,60,57,52,49,
# 48,45,41,39,37,35,33,29,25,24,20,18,14,13,11,10,8,6,5,2,1,65,

mydata = dt[,var_names]

# Output of print_names(var_names):
# 120-->民意--> 115-->调查--> 113-->合作--> 110-->市民--> 109-->基本法--> 107-->特区政府-->
# 103-->活动--> 102-->示威者--> 100-->示威--> 99-->社会--> 98-->记者--> 94-->一国两制-->
# 92-->法律--> 91-->争取--> 89-->英国--> 87-->党--> 86-->尊重--> 82-->全国人大常委会-->
# 80-->反对--> 76-->组织--> 74-->暴力--> 71-->戴耀廷--> 68-->行政--> 64-->梁振英--> 60-->占-->
# 57-->事件--> 52-->政策--> 49-->诉求--> 48-->集会--> 45-->青年--> 41-->政制--> 39-->占领-->
# 37-->普选--> 35-->内地--> 33-->建制--> 29-->运动--> 25-->特首--> 24-->学生--> 20-->行政长官-->
# 18-->团体--> 14-->年轻人--> 13-->制度--> 11-->警方--> 10-->安全--> 8-->港人--> 6-->激进-->
# 5-->官员--> 2-->立法会--> 1-->反对派--> 65-->理性-->

From Zipf's law to allometric scaling

Distinct hashtags over time

The left panel shows a subset of all users; the right panel shows a subset of the active users (those with more than 50 tweets). This largely confirms <math>S(t) \sim t^\mu</math>. See Human Mobility.
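
A minimal sketch of how the exponent mu can be estimated: regress log S(t) on log t. The data below are synthetic placeholders generated with mu = 0.6, standing in for one user's cumulative number of distinct hashtags over time:

import numpy as np
import statsmodels.api as sm

t = np.arange(1, 101)                                     # time steps (synthetic)
S = t ** 0.6 * np.exp(np.random.normal(0, 0.05, len(t)))  # synthetic S(t), true mu = 0.6
xx = sm.add_constant(np.log(t), prepend=True)
res = sm.OLS(np.log(S), xx).fit()
constant, mu = res.params
print mu  # slope of the log-log fit estimates the exponent mu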

References

GitHub repository: https://github.com/qinqiang2015/tweets/blob/master/users_urls_at_name_rt_name.ipynb

Wang Chengjun, Dang Minghui, and Gu Huijun (2015). Participants, Issues, and Action: News Coverage of the Hong Kong "Occupy Central" Movement (参与者、议题与行动:香港"占领中环"运动中的新闻报道). Manuscript, Computational Communication Collaboratory. Paper link: http://computational-communication.com/wiki/images/b/b7/20160113%E3%80%8A%E5%8F%82%E4%B8%8E%E8%80%85%E3%80%81%E8%AE%AE%E9%A2%98%E4%B8%8E%E8%A1%8C%E5%8A%A8_%E9%A6%99%E6%B8%AF_%E5%8D%A0%E9%A2%86%E4%B8%AD%E7%8E%AF_%E8%BF%90%E5%8A%A8%E4%B8%AD%E7%9A%84%E6%96%B0%E9%97%BB%E6%8A%A5%E9%81%93%E3%80%8B.pdf