# Statistics

## random

import random

l = [1, 2, 3, 4, 5]
r = random.Random(500)  # the seed value is arbitrary; fixing it makes results reproducible
r.choice(l), r.choice(l)  # two draws from the same seeded generator
# two fresh generators with the same seed return the same first draw
random.Random(500).choice(l), random.Random(500).choice(l)


## log bin

import numpy as np

def log_binning(x, y, bin_count=50):
    # log-spaced bin edges from the smallest positive x up to the overall max of x and y
    max_x = np.log10(max(x))
    max_y = np.log10(max(y))
    max_base = max([max_x, max_y])
    xx = [i for i in x if i > 0]
    min_x = np.log10(np.min(xx))
    bins = np.logspace(min_x, max_base, num=bin_count)
    # Based on: http://stackoverflow.com/questions/6163334/binning-data-in-python-with-scipy-numpy
    # mean of y (and of x) within each bin; empty bins yield nan
    bin_means_y = np.histogram(x, bins, weights=y)[0] / np.histogram(x, bins)[0]
    bin_means_x = np.histogram(x, bins, weights=x)[0] / np.histogram(x, bins)[0]
    return bin_means_x, bin_means_y
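
A quick sanity check with synthetic heavy-tailed data (my own example; the variable names are arbitrary):

import numpy as np
import matplotlib.pyplot as plt

x = np.random.pareto(2.0, 10000) + 1   # heavy-tailed sample, all values >= 1
y = 1.0 / x                            # any per-point quantity to average within bins
bx, by = log_binning(x, y, bin_count=20)
plt.loglog(x, y, '.', alpha=0.1)
plt.loglog(bx, by, 'rs-')              # binned means stand out from the raw scatter
plt.show()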


## Histogram

# The most direct way is to compute log10 of the limits, build linearly spaced bins, and convert back by raising 10 to those values, as below.
# Based on: http://stackoverflow.com/questions/6855710/how-to-have-logarithmic-bins-in-a-python-histogram

import pylab as pl
import numpy as np

data = np.random.normal(size=10000)

MIN, MAX = .01, 10.0

pl.figure()
# only the positive part of the normal sample falls inside these log-spaced bins
pl.hist(data, bins=10 ** np.linspace(np.log10(MIN), np.log10(MAX), 50))
pl.gca().set_xscale("log")
pl.show()



# Pandas

import pandas as pd

data = {'xi': range(1, 12), 'y': [3, 5, 9, 12, 18, 22, 28, 35, 49, 60, 65]}
df = pd.DataFrame(data)



GroupBy in pandas is very efficient, and the .get_group() method pulls out the rows of any one group. For example, to extract the cell-tower sequences of 140,000 users from 70 million call records, first group by user; each user's tower sequence can then be fetched directly.

In [16]: df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]})

In [17]: df3.groupby(['X']).get_group('A')
Out[17]:
   X  Y
0  A  1
2  A  3

In [18]: df3.groupby(['X']).get_group('B')
Out[18]:
   X  Y
1  B  4
3  B  2

# applied to the call data: group once, then fetch each user's tower sequence
dfgroupbyobj = call_data_selected.groupby('calling_nbr')
node_list = dfgroupbyobj.get_group(i)['calling_cell'].tolist()  # i: one user's calling_nbr


## sorted

Sort a dict:

# Python 3: use .items() (iteritems and tuple-unpacking lambdas are Python 2 only);
# sort by value descending, breaking ties by key ascending
webt = sorted(web.items(), key=lambda kv: (-kv[1], kv[0]))

# numbers in descending order via a key function
sorted(data, key=lambda x: -x)

# dict items by value, ascending
sorted(d.items(), key=lambda x: x[1])

# simplest descending sort
sorted(data, reverse=True)

How to return the indices that would sort a list?

>>> s = [2, 3, 1, 4, 5]
>>> sorted(range(len(s)), key=lambda k: s[k])
[2, 0, 1, 3, 4]
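
NumPy's argsort returns the same indices directly:

>>> import numpy as np
>>> np.argsort(s)
array([2, 0, 1, 3, 4])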


# Matplotlib

Reading note: *Matplotlib for Python Developers*.

# Networkx

### Drawing a network with custom parameters
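
The snippet below assumes a graph G whose edges carry a `times` attribute; a minimal toy setup (edge data made up for illustration):

import networkx as nx

G = nx.Graph()
G.add_edge('a', 'b', times=3)
G.add_edge('b', 'c', times=1)
G.add_edge('a', 'c', times=2)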

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10, 10))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=20, node_color='b')
nx.draw_networkx_labels(G, pos, font_size=5)  # the keyword is font_size, not fontsize
nx.draw_networkx_edges(G, pos, edge_color='k')
edge_labels = nx.get_edge_attributes(G, 'times')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, label_pos=0.5)  # keyword is edge_labels, not labels
plt.show()


# Text Mining

from jieba import cut
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

txt = ["我来到北京清华大学",
       "他来到了网易杭研大厦",
       "小明硕士毕业与中国科学院",
       "我爱北京天安门"]

# segment each document with jieba and join the tokens with spaces
corpus = [' '.join(cut(i, cut_all=False)) for i in txt]

vectorizer = CountVectorizer()    # turns the texts into a term-frequency matrix: a[i][j] is the count of word j in document i
transformer = TfidfTransformer()  # computes the tf-idf weight of each word
# the inner fit_transform builds the term-frequency matrix; the outer one converts it to tf-idf
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names_out()  # all words in the vocabulary (get_feature_names() in scikit-learn < 1.0)
weight = tfidf.toarray()

for w in word:
    print(w)

weight
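
To see which terms weigh most in each document, a small follow-up sketch (my addition):

for i, row in enumerate(weight):
    top = sorted(zip(word, row), key=lambda t: t[1], reverse=True)[:3]
    print(i, top)  # the three highest-tf-idf words of document i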


# Network Science

### Linear Fitting for Power Law

import powerlaw

def plotPowerlaw(data, ax, col, xlab):
    # empirical pdf, plotted from xmin = 1
    fit = powerlaw.Fit(data, xmin=1)
    fit.plot_pdf(color=col, linewidth=2)
    # refit with xmin chosen automatically, and overlay the fitted power law
    fit = powerlaw.Fit(data)
    fit.power_law.plot_pdf(color=col, linestyle='dotted', ax=ax)
    a, x = fit.power_law.alpha, fit.power_law.xmin
    minx, maxx = ax.get_xlim()
    miny, maxy = ax.get_ylim()
    # %.2f for alpha: it is a float, %d would truncate it
    ax.text(minx * 5, maxy / 10, r"$\alpha = %.2f \:\:, x_{min} = %d$" % (a, x), fontsize=20)
    ax.set_xlabel(xlab, fontsize=20)
    ax.set_ylabel('$P(k)$', fontsize=20)
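
A hypothetical usage sketch on synthetic Zipf-distributed data:

import numpy as np
import matplotlib.pyplot as plt

data = np.random.zipf(2.5, 10000)
fig, ax = plt.subplots(figsize=(6, 4))
plotPowerlaw(data, ax, 'b', '$k$')
plt.show()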

from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt

def plotDegreeDistribution(G):
    # G is a networkx graph; count how many nodes have each degree
    degs = defaultdict(int)
    for _, d in G.degree():  # NetworkX 2.x: degree() iterates over (node, degree) pairs
        degs[d] += 1
    items = sorted(degs.items())
    x, y = np.array(items).T
    y = y / float(np.sum(y))  # normalize counts into probabilities
    plt.plot(x, y, 'b-o')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$k$', fontsize=20)
    plt.ylabel('$P(k)$', fontsize=20)
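
For example, on a Barabási-Albert graph (a usage sketch, not from the original notes):

import networkx as nx

G = nx.barabasi_albert_graph(1000, 3)
plotDegreeDistribution(G)
plt.show()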

import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

# degree_list: assumed to be the reply counts, sorted in descending order (rank-frequency plot)
x = np.log(range(1, len(degree_list) + 1))
y = np.log([d + 1 for d in degree_list])
X = sm.add_constant(x)  # the design matrix needs an intercept column
res = sm.OLS(y, X).fit()
constant, beta = res.params
r2 = res.rsquared

fig = plt.figure(figsize=(8, 4), facecolor='white')
plt.plot(range(len(degree_list)), degree_list, 'rs', label='Data')
plt.plot(np.exp(x), np.exp(constant + x * beta), "-", label='Fit')
plt.yscale('log'); plt.xscale('log')
plt.xlabel(r'$Users$', fontsize=20)
plt.ylabel(r'$Replies$', fontsize=20)
plt.text(max(range(len(degree_list))) / 300, max(degree_list) / 20,
         r'$\beta$ = ' + str(round(beta, 2)) + '\n' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2, fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
print(beta, r2)


# What to do when computation is too slow

# Write intermediate results to a log file as the job runs
# (user_list, entropy(), entropy_list, nums are defined elsewhere in the project)
log_file = open('log_file.log', 'w')  # open once; reopening with 'w+' inside the loop would truncate it every time
for k, i in enumerate(user_list):
    node_entropy = entropy(node_list)  # node_list: the sequence for user i, e.g. via get_group(i) as above
    entropy_list.append(node_entropy)
    if k % 100 == 0:
        log_file.write(str(k))
        log_file.write('\t')
        log_file.write(str(float(k) / nums))  # fraction of users processed
        log_file.write('\t')
        log_file.write(str(node_entropy))
        log_file.write('\n')
        log_file.flush()  # flush so progress is visible while the job runs
log_file.close()

import sys

def flushPrint(s):
    # overwrite the current terminal line (carriage return, no newline)
    sys.stdout.write('\r')
    sys.stdout.write('%s' % s)
    sys.stdout.flush()
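
Typical usage inside a long loop (a hypothetical sketch; `total` is made up):

total = 10 ** 6
for k in range(total):
    if k % 10000 == 0:
        flushPrint('%.1f%%' % (100.0 * k / total))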