python

Wang Cheng-Jun edited this page Dec 19, 2016 · 1 revision

计算传播学是计算社会科学的重要分支。它主要关注人类传播行为的可计算性基础,以传播网络分析、传播文本挖掘、数据科学等为主要分析工具,(以非介入地方式)大规模地收集并分析人类传播行为数据,挖掘人类传播行为背后的模式和法则,分析模式背后的生成机制与基本原理,可以被广泛地应用于数据新闻和计算广告等场景,注重编程训练、数学建模、可计算思维。

Clone this wiki locally

https://github.com/solarlee/Awesome-Python-Toolbox

Table of Contents

Statistics

random

有时为了可以重复,我们需要这样设置:

import random

# Fixing the seed makes "random" choices reproducible across runs.
l = [1,2,3,4,5]
r = random.Random(500) # seed number is arbitrary
r.choice(l), r.choice(l)
# A fresh Random(500) always reproduces the same sequence of choices.
random.Random(500).choice(l), random.Random(500).choice(l)

log bin

def log_binning(x, y, bin_count=50):
    """Aggregate (x, y) points into logarithmically spaced bins.

    Returns (bin_means_x, bin_means_y): the mean x and mean y of the
    points falling in each of the log-spaced bins.  Bins that receive
    no points come out as NaN (0/0), matching numpy's division
    semantics — callers should filter those if needed.
    """
    max_x = np.log10(max(x))
    max_y = np.log10(max(y))
    # Upper edge spans the larger of the two log-ranges (original behavior).
    max_base = max([max_x, max_y])
    # Only strictly positive x values have a finite log10.
    xx = [i for i in x if i > 0]
    min_x = np.log10(np.min(xx))
    bins = np.logspace(min_x, max_base, num=bin_count)
    # Based on: http://stackoverflow.com/questions/6163334/binning-data-in-python-with-scipy-numpy
    # Fix: the original recomputed the unweighted histogram twice; do it once.
    counts = np.histogram(x, bins)[0]
    bin_means_y = np.histogram(x, bins, weights=y)[0] / counts
    bin_means_x = np.histogram(x, bins, weights=x)[0] / counts
    return bin_means_x, bin_means_y

直方图

一般而言使用hist命令即可,当我们想要先对数据取log,然后使用hist命令的时候,需要如下调整。

# The most direct way is to just compute the log10 of the limits, compute linearly spaced bins, and then convert back by raising to the power of 10, as below:
# Reference: http://stackoverflow.com/questions/6855710/how-to-have-logarithmic-bins-in-a-python-histogram
import pylab as pl import numpy as np data = np.random.normal(size=10000) MIN, MAX = .01, 10.0 pl.figure() pl.hist(data, bins = 10 ** np.linspace(np.log10(MIN), np.log10(MAX), 50)) pl.gca().set_xscale("log") pl.show()

Pandas

import pandas as pd

# Toy dataset: predictor xi = 1..11 with an accelerating response y.
data = {
    'xi': range(1, 12),
    'y': [3, 5, 9, 12, 18, 22, 28, 35, 49, 60, 65],
}
df = pd.DataFrame(data)


Pandas中的GroupBy操作效率非常高,可以使用.get_group()方法获得分组后的每组数据的内容,例如从7000万条通话记录中寻找其中14万个用户的基站序列,可以先对用户进行GroupBy,即可获取他们的基站序列。

In [16]: df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})
In [17]: df3.groupby(['X']).get_group('A')
Out[17]: 
   X  Y
0  A  1
2  A  3
In [18]: df3.groupby(['X']).get_group('B')
Out[18]: 
   X  Y
1  B  4
3  B  2

获取每个用户的基站序列:
dfgroupbyobj = call_data_selected.groupby('calling_nbr') node_list = dfgroupbyobj.get_group(i)['calling_cell'].tolist()

sorted

 sort a dict
webt = sorted(web.iteritems(), key=lambda (k,v): (-v,k))

# Sort numeric data in descending order by negating the sort key.
sorted(data,key=lambda x:-x)

# Sort a dict's (key, value) pairs by value, ascending.
sorted(d.items(), key=lambda x: x[1])

# Descending sort, the direct way.
sorted(data, reverse = True )
 how to return index of a sorted list?
>>> s = [2, 3, 1, 4, 5]
>>> sorted(range(len(s)), key=lambda k: s[k])
[2, 0, 1, 3, 4]
>>> 

Matplotlib

缩略图

http://file.allitebooks.com/20160505/Matplotlib%20for%20Python%20Developers.pdf

Matplotlib for Python Developers

强烈推荐此书

400px200px

Networkx

参见:https://networkx.github.io/documentation/latest/reference/

自定参数绘制网络

import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10,10))
pos=nx.spring_layout(G)
nx.draw_networkx_nodes(G,pos,node_size=20, node_color='b')
nx.draw_networkx_labels(G,pos,fontsize=5)
nx.draw_networkx_edges(G,pos,edge_color='k')
edge_labels = nx.get_edge_attributes(G,'times')
nx.draw_networkx_edge_labels(G, pos, labels = edge_labels, label_pos=0.5)
plt.show()

如图

300px

Text Mining

from jieba import cut
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Four short Chinese example documents.
txt = ["我来到北京清华大学",
    "他来到了网易杭研大厦",
    "小明硕士毕业与中国科学院",
    "我爱北京天安门"]

# Segment each document with jieba and join tokens with spaces so that
# scikit-learn's whitespace tokenizer can consume them.
corpus = [' '.join(cut(i, cut_all=False)) for i in txt]
# CountVectorizer builds the term-frequency matrix: a[i][j] is the count of term j in document i.
vectorizer = CountVectorizer()
# TfidfTransformer converts raw term counts into tf-idf weights.
transformer = TfidfTransformer()
# Inner fit_transform builds the count matrix; outer one computes tf-idf.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
# All terms in the bag-of-words vocabulary.
word = vectorizer.get_feature_names()
weight = tfidf.toarray()

for i in word:
    # Fix: 'print i' is Python-2-only; print(i) behaves the same on 2 and 3.
    print(i)

weight

Web

WeRoBot 一个使用python编写的微信机器人框架

https://github.com/whtsky/WeRoBot

Network Science

Linear Fitting for Power Law

import powerlaw

def plotPowerlaw(data, ax, col, xlab):
    """Plot the empirical pdf of *data* and its fitted power law on *ax*.

    data: 1-d sequence of positive observations
    ax:   matplotlib axes to annotate
    col:  matplotlib color used for both curves
    xlab: x-axis label
    """
    # Empirical pdf, forcing xmin=1 so the full range is displayed.
    fit = powerlaw.Fit(data, xmin=1)
    fit.plot_pdf(color=col, linewidth=2)
    # Refit letting powerlaw estimate xmin, and overlay the fitted pdf.
    fit = powerlaw.Fit(data)
    fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax)
    a, x = fit.power_law.alpha, fit.power_law.xmin
    minx, maxx = ax.get_xlim()
    miny, maxy = ax.get_ylim()
    # Fix: alpha is a float (typically ~2-3); the original %d silently
    # truncated it, misreporting the exponent in the annotation.
    ax.text(minx * 5, maxy / 10,
            r"$\alpha = %.2f \:\:, x_{min} = %d$" % (a, x), fontsize=20)
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$P(k)$', fontsize=20)

def plotDegreeDistribution(G):
    """Plot the degree distribution P(k) of networkx graph G on log-log axes."""
    # Count how many nodes have each degree value.
    degs = defaultdict(int)
    # G.degree().values() is the networkx 1.x API;
    # on 2.x use dict(G.degree()).values() — confirm installed version.
    for i in G.degree().values():
        degs[i] += 1
    items = sorted(degs.items())
    # Fix: the original performed this unpacking twice; once is enough.
    x, y = np.array(items).T
    # Normalise counts to probabilities.
    y_sum = np.sum(y)
    y = y / float(y_sum)
    plt.plot(x, y, 'b-o')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$k$', fontsize = 20)
    plt.ylabel('$P(k)$', fontsize = 20)
# Fit a power law to degree_list with OLS regression in log-log space,
# then plot data and fitted line.  (Python 2 script — note the print
# statement at the end; requires matplotlib's plt already imported.)
import statsmodels.api as sm
import numpy as np
# Ranks 1..N in log space (starting at 1 avoids log(0)).
x = np.log(range(1,len(degree_list)+1))
# +1 on each degree guards against log(0) for zero-degree entries.
y = np.log([d+1 for d in degree_list])
# Prepend an intercept column for OLS.
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y,xx).fit()
constant,beta = res.params
r2 = res.rsquared
fig = plt.figure(figsize=(8, 4),facecolor='white')
plt.plot(range(len(degree_list)), degree_list, 'rs', label= 'Data')
# Transform the fitted line back from log space to data space.
plt.plot(np.exp(x), np.exp(constant + x*beta),"-", label = 'Fit')
plt.yscale('log');plt.xscale('log')
plt.xlabel(r'$Users$', fontsize = 20)
plt.ylabel(r'$Replies$', fontsize = 20)
# Annotate the slope (beta) and goodness of fit (R^2) inside the axes.
plt.text(max(range(len(degree_list)))/300,max(degree_list)/20,
         r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2,fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
print beta, r2

算得太慢怎么办

# 将实时结果记录到文件中
# Persist progress to a log file while a long computation runs.
# Fix: the original reopened the file with mode 'w+' inside the loop,
# truncating it on every write — only the most recent entry ever
# survived — and never closed the handle.  Open once, write, flush, close.
log_file = open('log_file.log', 'w')
for k, i in enumerate(user_list):
    # NOTE(review): node_list is presumably derived from user i — confirm with caller.
    node_entropy = entropy(node_list)
    entropy_list.append(node_entropy)
    if k % 100 == 0:
        # One tab-separated record: index, fraction done, latest entropy.
        log_file.write(str(k))
        log_file.write('\t')
        log_file.write(str(float(k)/nums))
        log_file.write('\t')
        log_file.write(str(node_entropy))
        log_file.write('\n')
        # Flush so progress is visible even if the run is interrupted.
        log_file.flush()
log_file.close()

将实时结果和进度输出到屏幕上:
def flushPrint(s): sys.stdout.write('\r') sys.stdout.write('%s' % s) sys.stdout.flush()