%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# Load the thread list: a tab-separated text file without a header row.
df = pd.read_csv('/Users/xiha/github/cjc-gh-pages/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df[:2]
# Rename the positional columns to descriptive header names.
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:5]
# Load the author information: also tab-separated without a header row.
da = pd.read_csv('/Users/xiha/github/cjc-gh-pages/data/tianya_bbs_threads_author_info.txt', sep = "\t", header=None)
da[:2]
da=da.rename(columns = {0:'author_page', 1:'followed_num', 2:'fans_num',3:'post_num', 4:'comment_num'})
da[:5]
# Merge the two data sets column-wise.
# NOTE(review): axis=1 pairs rows by index, not by 'author_page', and both
# frames carry an 'author_page' column -- confirm the two files list their
# rows in the same order.
data = pd.concat([df,da], axis=1)
len(data)
# Check the type of the first 'time' value (string data at this point).
type(data.time[0])
# extract date from datetime
# map(function, iterable) applies the lambda to every value of data.time and
# keeps the first 10 characters of each one.
# NOTE(review): the original note recommends map + lambda as efficient for
# large data -- unverified. The result is passed straight to pd.to_datetime,
# which presumably relies on map returning a list (Python 2 behaviour).
date = map(lambda x: x[:10], data.time)
# Convert the date strings into datetime values.
data['date'] = pd.to_datetime(date)
# convert str to datetime format
data.time = pd.to_datetime(data.time)
# Split the parsed timestamp into calendar components.
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
# Check the type of the first 'time' value again, now after conversion.
type(data.time[0])
# Show the first three records.
data[:3]
data.describe()
import statsmodels.api as sm
# List every name exposed by sm.stats as one space-separated string.
' '.join(dir(sm.stats))
data.describe()
import numpy as np
# Mean, standard deviation and total of the click counts.
np.mean(data.click), np.std(data.click), np.sum(data.click)
# Unweighted description of the variable: every observation gets weight 1.
d1 = sm.stats.DescrStatsW(data.click, weights=[1 for i in data.click])
d1.mean, d1.var, d1.std, d1.sum
# Weighted description of the variable: clicks weighted by the reply counts.
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum
# Median of the click counts.
np.median(data.click)
# Histogram of the raw click counts.
plt.hist(data.click)
plt.show()
# Histogram of the raw reply counts.
plt.hist(data.reply, color = 'purple')
plt.show()
# Histograms of log(x + 1) for clicks (green) and replies (purple), drawn
# before a single show() call; adding 1 avoids taking log(0).
plt.hist(np.log(data.click+1), color='green')
plt.hist(np.log(data.reply+1), color='purple')
plt.show()
# Box plot of log(click + 1).
plt.boxplot([np.log(data.click+1)])
plt.show()
# Box plots of the raw click and reply counts in one figure.
plt.boxplot([data.click, data.reply])
plt.show()
def transformData(dat):
    """Convert the values of one variable to a list of ints.

    Values equal to the string 'na' are treated as missing and become 0;
    every other value is converted with int().

    Args:
        dat: iterable of values (for example one DataFrame column).

    Returns:
        A new list of ints, in the same order as ``dat``.

    Raises:
        ValueError, TypeError: propagated from int() when a value other
            than 'na' cannot be converted.
    """
    return [int(value) if value != 'na' else 0 for value in dat]
# Convert the author-level count columns to integers via transformData
# (entries equal to 'na' become 0).
data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num )
data.post_num = transformData(data.post_num )
data.comment_num = transformData(data.comment_num )
data.describe()
# Box plots of log(x + 1) for clicks, replies, fans and followed counts.
plt.boxplot([np.log(data.click+1), np.log(data.reply+1),
np.log(data.fans_num+1), np.log(data.followed_num + 1)],
labels = ['$Click$', '$Reply$', '$Fans$', '$Followed$'])
plt.show()
# Box plot produced by DataFrame.boxplot, shown on a logarithmic y axis.
fig = plt.figure(figsize=(12,4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()
# scatter_matrix lives in pandas.plotting on current pandas releases;
# pandas.tools.plotting is the legacy location and is kept only as a
# fallback for old installations.
try:
    from pandas import plotting
except ImportError:
    from pandas.tools import plotting
# Matrix of pairwise scatter plots for the four count variables.
plotting.scatter_matrix(data[['click', 'reply', 'post_num', 'comment_num']])
plt.show()
import seaborn # conda install seaborn
# Pairwise plots with a fitted regression line in each panel.
seaborn.pairplot(data, vars=['click', 'reply', 'post_num', 'comment_num'],
                 kind='reg')
# Same kind of plot for three variables, coloured by posting year.
seaborn.pairplot(data, vars=['click', 'reply', 'post_num'],
                 kind='reg', hue='year')
# Scatter plot of reply against click with a linear regression fit.
seaborn.lmplot(y='reply', x='click', data=data)
# Number of threads posted in each year.
data.year.value_counts()
d = data.year.value_counts()
dd = pd.DataFrame(d)
# Order the yearly counts by ascending year.
dd = dd.sort_index(axis=0, ascending=True)
dd
dd.index
# Build a 'YYYY-01-01' string for every year in the index.
dd_date_str = map(lambda x: str(x) +'-01-01', dd.index)
dd_date_str
# pandas provides a function for parsing dates.
dd_date = pd.to_datetime(dd_date_str)
dd_date
# Yearly thread counts over time.
# NOTE(review): dd.year assumes the value_counts column is named 'year' --
# confirm for the installed pandas version.
plt.plot(dd_date, dd.year, 'r-o')
plt.show()
# Running total of the yearly counts.
ds = dd.cumsum()
ds
# Recompute the yearly and cumulative counts with the same steps as above.
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()
def getDate(dat):
    """Return January 1st of every year in ``dat.index`` as pandas datetimes.

    Args:
        dat: object whose ``index`` iterates over year values, for example
            a DataFrame indexed by year.

    Returns:
        The result of ``pd.to_datetime`` applied to one 'YYYY-01-01' string
        per index entry, in index order.
    """
    # Build a real list, not a lazy map object, so pd.to_datetime receives
    # a list-like on both Python 2 and Python 3.
    dat_date_str = [str(year) + '-01-01' for year in dat.index]
    return pd.to_datetime(dat_date_str)
# Dates (January 1st of each year) for the cumulative and yearly counts.
ds_date = getDate(ds)
dd_date = getDate(dd)
# Cumulative (green squares) and yearly (red circles) thread counts.
plt.plot(ds_date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd_date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
# loc=2 places the legend in the upper-left corner.
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
# Per-year sums of the columns of data.
dg = data.groupby('year').sum()
dg
# Running totals of those per-year sums.
dgs = dg.cumsum()
dgs
def getDate(dat):
    """Return January 1st of every year in ``dat.index`` as pandas datetimes.

    Args:
        dat: object whose ``index`` iterates over year values, for example
            a DataFrame indexed by year.

    Returns:
        The result of ``pd.to_datetime`` applied to one 'YYYY-01-01' string
        per index entry, in index order.
    """
    # Build a real list, not a lazy map object, so pd.to_datetime receives
    # a list-like on both Python 2 and Python 3.
    dat_date_str = [str(year) + '-01-01' for year in dat.index]
    return pd.to_datetime(dat_date_str)
# Store the per-year dates on dg for use as the x axis below.
# NOTE(review): this is attribute assignment on a DataFrame; it may not
# create a 'date' column -- confirm this is intended.
dg.date = getDate(dg)
fig = plt.figure(figsize=(12,5))
# Yearly totals of clicks, replies and fans on a shared logarithmic y axis.
plt.plot(dg.date, dg.click, 'r-o', label = '$Yearly\:Number\:of\:Clicks$')
plt.plot(dg.date, dg.reply, 'g-s', label = '$Yearly\:Number\:of\:Replies$')
plt.plot(dg.date, dg.fans_num, 'b->', label = '$Yearly\:Number\:of\:Fans$')
plt.yscale('log')
# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4,numpoints=1,fontsize=13)
plt.show()
# Total and mean clicks per year.
data.groupby('year')['click'].sum()
data.groupby('year')['click'].mean()
# Flag reposted threads: 1 when the title contains the marker u'转载'
# ("reposted"), otherwise 0.
repost = []
for title in df.title:
    # Titles are byte strings on Python 2 and text on Python 3; decode only
    # when the value really is bytes so the check works on both.
    if isinstance(title, bytes):
        title = title.decode('utf8')
    repost.append(1 if u'转载' in title else 0)
df['repost'] = repost
df.groupby('repost').sum()
# Peek at the click counts of original threads and of reposts.
df['click'][df['repost']==0][:5]
df['click'][df['repost']==1][:5]
from scipy import stats
# Two-sample t-test on clicks of original threads versus reposted threads.
# The samples are the two click groups, not the click column and the 0/1
# indicator column.
stats.ttest_ind(df.click[df.repost == 0], df.click[df.repost == 1])
# Two-sample t-test between click counts and reply counts.
sm.stats.ttest_ind(data.click, data.reply)
from scipy.stats import chisquare
# Chi-square goodness-of-fit test of observed against expected frequencies.
# The resulting p-value is far above 0.05, so the null hypothesis cannot be
# rejected.
chisquare([16, 18, 16, 14, 12, 12], f_exp=[16, 16, 16, 16, 16, 8])
from scipy.stats import chi2
try:
    from scipy.stats import chisqprob
except ImportError:
    # chisqprob is absent from newer SciPy releases; the survival function
    # of the chi-square distribution computes the same probability.
    chisqprob = chi2.sf
# p_value = chi2.sf(chi_statistic, df)
# Both expressions give the upper-tail probability for a statistic of 3.94
# with 1 degree of freedom. Formatting through '%s %s' prints the two values
# separated by a space on Python 2 and Python 3 alike.
print('%s %s' % (chisqprob(3.94, 1), 1 - chi2.cdf(3.94, 1)))
import numpy as np
# Correlation matrix of clicks and replies; np.corrcoef works for any pair
# of variables.
print(np.corrcoef(data.click, data.reply))
# Same correlation after the log(x + 1) transform.
print(np.corrcoef(np.log(data.click+1), np.log(data.reply+1)))
# Pairwise correlations between the columns of data.
data.corr()
# Replies against clicks on the raw (non-logarithmic) scale.
plt.plot(df.click, df.reply, 'r-o')
plt.show()
# The same relationship on logarithmic axes.
plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize = 20)
plt.ylabel('$Replies$', fontsize = 20)
plt.xscale('log')
plt.yscale('log')
# Raw string keeps the mathtext thin-space command '\,' intact without
# relying on an unrecognised escape sequence.
plt.title(r'$Allometric\,Law$', fontsize = 20)
plt.show()
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
# The model studies how Lottery relates to Literacy and np.log(Pop1831).
# Inspect the results
print results.summary()
# Regress reply on click and followed_num using the forum data.
reg = smf.ols('reply ~ click + followed_num', data=data).fit()
print reg.summary()
import statsmodels.api as sm
from statsmodels.formula.api import ols
moore = sm.datasets.get_rdataset("Moore", "car",
cache=True) # load data
# NOTE: this rebinds `data`; from here on it refers to the Moore data set,
# not the forum DataFrame used earlier.
data = moore.data
data = data.rename(columns={"partner.status" :
"partner_status"}) # make name pythonic
data[:5]
# Linear model of conformity with both categorical factors and their
# interaction, each coded with the Sum contrast.
moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
data=data).fit()
# Grid of partial regression plots for the fitted model.
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(moore_lm, fig = fig)
plt.show()
table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 ANOVA DataFrame
print table