In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
In [2]:
import pandas as pd
In [5]:
df = pd.read_csv('/Users/sara/github/data/tianya_bbs_threads_list.txt',\
                 sep = "\t", names = ['title','link', \
                        'author','author_page',\
                        'click','reply','time'])
df[:2]
Out[5]:
title link author author_page click reply time
0 【民间语文第161期】宁波px启示:船进港湾人应上岸 /post-free-2849477-1.shtml 贾也 http://www.tianya.cn/50499450 194675 2703 2012-10-29 07:59
1 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) /post-free-2839539-1.shtml 无上卫士ABC http://www.tianya.cn/74341835 88244 1041 2012-10-24 12:41
In [6]:
# df=df.rename(columns = {0:'title', 1:'link', \
#                         2:'author',3:'author_page',\
#                         4:'click', 5:'reply', 6:'time'})
# df[:5]
In [8]:
da = pd.read_csv('/Users/sara/github/data/tianya_bbs_threads_author_info.txt', 
                 sep = "\t", names = ['author_page','followed_num',\
                        'fans_num','post_num', \
                        'comment_num'])
da[:2]
Out[8]:
author_page followed_num fans_num post_num comment_num
0 http://www.tianya.cn/50499450 152 27452 1020 1341
1 http://www.tianya.cn/74341835 0 1 2 5
In [9]:
# da=da.rename(columns = {0:'author_page', 1:'followed_num',\
#                         2:'fans_num',3:'post_num', \
#                         4:'comment_num'})
# # da[:5]
In [11]:
data = pd.concat([df,da], axis=1)
len(data)
Out[11]:
467
In [12]:
data[:3]
Out[12]:
title link author author_page click reply time author_page followed_num fans_num post_num comment_num
0 【民间语文第161期】宁波px启示:船进港湾人应上岸 /post-free-2849477-1.shtml 贾也 http://www.tianya.cn/50499450 194675 2703 2012-10-29 07:59 http://www.tianya.cn/50499450 152 27452 1020 1341
1 宁波镇海PX项目引发群体上访 当地政府发布说明(转载) /post-free-2839539-1.shtml 无上卫士ABC http://www.tianya.cn/74341835 88244 1041 2012-10-24 12:41 http://www.tianya.cn/74341835 0 1 2 5
2 宁波准备停止PX项目了,元芳,你怎么看? /post-free-2848797-1.shtml 牧阳光 http://www.tianya.cn/36535656 82779 625 2012-10-28 19:11 http://www.tianya.cn/36535656 19 28 816 1268
In [13]:
type(data.time[0])
Out[13]:
str
In [14]:
# extract date from datetime
date = map(lambda x: x[:10], data.time)
#date = [i[:10] for i in data.time]
data['date'] = pd.to_datetime(date)
In [15]:
# convert str to datetime format
data.time = pd.to_datetime(data.time)
data['month'] = data.time.dt.month
data['year'] = data.time.dt.year
data['day'] = data.time.dt.day
type(data.time[0])
Out[15]:
pandas.tslib.Timestamp
In [16]:
data.describe()
Out[16]:
click reply month year day
count 467.000000 467.000000 467.000000 467.000000 467.000000
mean 1534.957173 18.907923 7.432548 2012.620985 17.961456
std 11099.249834 144.869921 3.084860 1.795269 9.491730
min 11.000000 0.000000 1.000000 2006.000000 1.000000
25% 42.500000 0.000000 5.000000 2013.000000 8.000000
50% 84.000000 0.000000 6.000000 2013.000000 23.000000
75% 322.000000 4.000000 11.000000 2013.000000 25.000000
max 194675.000000 2703.000000 12.000000 2015.000000 31.000000
In [17]:
import statsmodels.api as sm
In [18]:
'   '.join(dir(sm))
Out[18]:
'GEE   GLM   GLS   GLSAR   Logit   MNLogit   MixedLM   NegativeBinomial   NominalGEE   OLS   OrdinalGEE   PHReg   Poisson   ProbPlot   Probit   QuantReg   RLM   WLS   __builtins__   __doc__   __file__   __name__   __package__   add_constant   categorical   cov_struct   datasets   distributions   emplike   families   formula   genmod   graphics   iolib   load   nonparametric   qqline   qqplot   qqplot_2samples   regression   robust   show_versions   stats   test   tools   tsa   version   webdoc'
In [19]:
'   '.join(dir(sm.stats))
Out[19]:
'CompareCox   CompareJ   CompareMeans   DescrStatsW   Describe   FTestAnovaPower   FTestPower   GofChisquarePower   HetGoldfeldQuandt   NormalIndPower   Runs   TTestIndPower   TTestPower   __builtins__   __doc__   __file__   __name__   __package__   acorr_breush_godfrey   acorr_ljungbox   anova_lm   binom_test   binom_test_reject_interval   binom_tost   binom_tost_reject_interval   breaks_cusumolsresid   breaks_hansen   chisquare_effectsize   cochrans_q   compare_cox   compare_j   corr_clipped   corr_nearest   cov_cluster   cov_cluster_2groups   cov_hac   cov_hc0   cov_hc1   cov_hc2   cov_hc3   cov_nearest   cov_nw_panel   cov_white_simple   diagnostic   durbin_watson   fdrcorrection   fdrcorrection_twostage   gof   gof_chisquare_discrete   het_arch   het_breushpagan   het_goldfeldquandt   het_white   jarque_bera   lillifors   linear_harvey_collier   linear_lm   linear_rainbow   mcnemar   moment_helpers   multicomp   multipletests   normal_ad   omni_normtest   power_binom_tost   power_ztost_prop   powerdiscrepancy   proportion_confint   proportion_effectsize   proportions_chisquare   proportions_chisquare_allpairs   proportions_chisquare_pairscontrol   proportions_ztest   proportions_ztost   recursive_olsresiduals   runstest_1samp   runstest_2samp   sandwich_covariance   se_cov   stattools   symmetry_bowker   tt_ind_solve_power   tt_solve_power   ttest_ind   ttost_ind   ttost_paired   tukeyhsd   unitroot_adf   zconfint   zt_ind_solve_power   ztest   ztost'
In [20]:
data.describe()
Out[20]:
click reply month year day
count 467.000000 467.000000 467.000000 467.000000 467.000000
mean 1534.957173 18.907923 7.432548 2012.620985 17.961456
std 11099.249834 144.869921 3.084860 1.795269 9.491730
min 11.000000 0.000000 1.000000 2006.000000 1.000000
25% 42.500000 0.000000 5.000000 2013.000000 8.000000
50% 84.000000 0.000000 6.000000 2013.000000 23.000000
75% 322.000000 4.000000 11.000000 2013.000000 25.000000
max 194675.000000 2703.000000 12.000000 2015.000000 31.000000
In [21]:
import numpy as np

np.mean(data.click), np.std(data.click), np.sum(data.click)
Out[21]:
(1534.9571734475376, 11087.35990002894, 716825)
In [22]:
# 不加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, \
                          weights=[1 for i in data.click])
d1.mean, d1.var, d1.std, d1.sum
Out[22]:
(1534.9571734475376, 122929549.55276974, 11087.35990002894, 716825.0)
In [23]:
# 加权的变量描述
d1 = sm.stats.DescrStatsW(data.click, weights=data.reply)
d1.mean, d1.var, d1.std, d1.sum
Out[23]:
(83335.963986409959, 6297145701.6868114, 79354.556905617035, 735856562.0)
In [24]:
np.median(data.click) # np.percentile
Out[24]:
84.0
In [25]:
plt.hist(data.click)
plt.show()
In [26]:
plt.hist(data.reply, color = 'green')
plt.show()
In [27]:
plt.hist(np.log(data.click+1), color='green')
plt.hist(np.log(data.reply+1), color='red')
plt.show()
In [28]:
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1)])
plt.show()
In [29]:
# Plot the height and weight to see
plt.boxplot([data.click, data.reply])
plt.show()
In [30]:
def transformData(dat):
    results = []
    for i in dat:
        if i != 'na':
            results.append( int(i))
        else:
            results.append(0)
    return results
In [32]:
data.fans_num = transformData(data.fans_num)
data.followed_num = transformData(data.followed_num )
data.post_num = transformData(data.post_num )
data.comment_num = transformData(data.comment_num )
data.describe()
Out[32]:
click reply followed_num fans_num post_num comment_num month year day
count 467.000000 467.000000 467.000000 467.000000 467.000000 467.000000 467.000000 467.000000 467.000000
mean 1534.957173 18.907923 15.713062 839.421842 146.336188 434.556745 7.432548 2012.620985 17.961456
std 11099.249834 144.869921 120.221465 7589.853870 577.441999 1989.458332 3.084860 1.795269 9.491730
min 11.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2006.000000 1.000000
25% 42.500000 0.000000 0.000000 0.000000 4.000000 0.000000 5.000000 2013.000000 8.000000
50% 84.000000 0.000000 0.000000 1.000000 16.000000 9.000000 6.000000 2013.000000 23.000000
75% 322.000000 4.000000 1.000000 4.000000 84.000000 88.000000 11.000000 2013.000000 25.000000
max 194675.000000 2703.000000 1817.000000 108449.000000 10684.000000 24848.000000 12.000000 2015.000000 31.000000
In [33]:
import numpy as np
# Plot the height and weight to see
plt.boxplot([np.log(data.click+1), np.log(data.reply+1), 
             np.log(data.fans_num+1),\
             np.log(data.followed_num + 1)], 
            labels = ['$Click$', '$Reply$', '$Fans$',\
                      '$Followed$'])
plt.show()
In [34]:
fig = plt.figure(figsize=(12,4))
data.boxplot(return_type='dict')
plt.yscale('log')
plt.show()
In [35]:
'   '.join(dir(data)[200:])
Out[35]:
'_unpickle_frame_compat   _unpickle_matrix_compat   _update_inplace   _validate_dtype   _values   _where   _xs   abs   add   add_prefix   add_suffix   align   all   any   append   apply   applymap   as_blocks   as_matrix   asfreq   asof   assign   astype   at   at_time   author   author_page   axes   between_time   bfill   blocks   bool   boxplot   click   clip   clip_lower   clip_upper   columns   combine   combineAdd   combineMult   combine_first   comment_num   compound   consolidate   convert_objects   copy   corr   corrwith   count   cov   cummax   cummin   cumprod   cumsum   date   day   describe   diff   div   divide   dot   drop   drop_duplicates   dropna   dtypes   duplicated   empty   eq   equals   eval   ewm   expanding   fans_num   ffill   fillna   filter   first   first_valid_index   floordiv   followed_num   from_csv   from_dict   from_items   from_records   ftypes   ge   get   get_dtype_counts   get_ftype_counts   get_value   get_values   groupby   gt   head   hist   iat   icol   idxmax   idxmin   iget_value   iloc   index   info   insert   interpolate   irow   is_copy   isin   isnull   iteritems   iterkv   iterrows   itertuples   ix   join   keys   kurt   kurtosis   last   last_valid_index   le   link   loc   lookup   lt   mad   mask   max   mean   median   memory_usage   merge   min   mod   mode   month   mul   multiply   ndim   ne   nlargest   notnull   nsmallest   pct_change   pipe   pivot   pivot_table   plot   pop   post_num   pow   prod   product   quantile   query   radd   rank   rdiv   reindex   reindex_axis   reindex_like   rename   rename_axis   reorder_levels   replace   reply   resample   reset_index   rfloordiv   rmod   rmul   rolling   round   rpow   rsub   rtruediv   sample   select   select_dtypes   sem   set_axis   set_index   set_value   shape   shift   size   skew   slice_shift   sort   sort_index   sort_values   sortlevel   squeeze   stack   std   style   sub   subtract   sum   swapaxes   swaplevel   tail   take   time   title   to_clipboard   to_csv   to_dense   to_dict   to_excel   to_gbq   to_hdf   to_html   to_json   to_latex   to_msgpack   to_panel   to_period   to_pickle   to_records   to_sparse   to_sql   to_stata   to_string   to_timestamp   to_xarray   transpose   truediv   truncate   tshift   tz_convert   tz_localize   unstack   update   values   var   where   xs   year'
In [36]:
from pandas.tools import plotting

# fig = plt.figure(figsize=(10, 10))
plotting.scatter_matrix(data[['click', 'reply',\
                              'post_num','comment_num']]) 
plt.show()
In [37]:
'  '.join(dir(plotting))
Out[37]:
'AbstractMethodError  Appender  AreaPlot  BarPlot  BarhPlot  BasePlotMethods  BoxPlot  FramePlotMethods  HexBinPlot  HistPlot  Index  KdePlot  LinePlot  LooseVersion  MPLPlot  MultiIndex  PandasObject  PeriodIndex  PiePlot  PlanePlot  ScatterPlot  Series  SeriesPlotMethods  _Options  __builtins__  __doc__  __file__  __name__  __package__  _all_kinds  _common_kinds  _dataframe_kinds  _flatten  _gca  _gcf  _get_all_lines  _get_layout  _get_marker_compat  _get_standard_colors  _get_standard_kind  _get_xlim  _grouped_plot  _grouped_plot_by_column  _handle_shared_axes  _klasses  _mpl_ge_1_3_1  _mpl_ge_1_4_0  _mpl_ge_1_5_0  _mpl_ge_2_0_0  _mpl_le_1_2_1  _plot  _plot_klass  _remove_labels_from_axis  _series_kinds  _set_ticks_props  _shared_doc_df_kwargs  _shared_doc_kwargs  _shared_doc_series_kwargs  _shared_docs  _subplots  _try_sort  andrews_curves  autocorrelation_plot  bootstrap_plot  boxplot  boxplot_frame_groupby  cache_readonly  ceil  colors  compat  contextmanager  conv  cycler  deprecate_kwarg  df_ax  df_coord  df_kind  df_note  df_unique  division  format_date_labels  grouped_hist  hist_frame  hist_series  is_hashable  is_integer  is_iterator  is_list_like  is_number  isnull  klass  lag_plot  lmap  lrange  map  mpl_stylesheet  namedtuple  notnull  np  parallel_coordinates  plot_frame  plot_params  plot_series  pprint_thing  radviz  range  re  remove_na  scatter_matrix  scatter_plot  series_ax  series_coord  series_kind  series_note  series_unique  string_types  table  warnings  zip'
In [38]:
import seaborn # conda install seaborn
seaborn.pairplot(data, vars=['click', 'reply', \
                             'post_num', 'comment_num'],
                  kind='reg')
Out[38]:
<seaborn.axisgrid.PairGrid at 0x11282a650>
In [42]:
seaborn.lmplot(y='reply', x='click', data=data,
               size = 5)  
plt.show()
In [43]:
data.year.value_counts()
Out[43]:
2013    304
2014     63
2007     34
2012     33
2015     20
2011      6
2009      6
2006      1
Name: year, dtype: int64
In [44]:
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
dd
Out[44]:
year
2006 1
2007 34
2009 6
2011 6
2012 33
2013 304
2014 63
2015 20
In [45]:
dd.index
Out[45]:
Int64Index([2006, 2007, 2009, 2011, 2012, 2013, 2014, 2015], dtype='int64')
In [46]:
dd_date_str = map(lambda x: str(x) +'-01-01', dd.index)
dd_date_str
Out[46]:
['2006-01-01',
 '2007-01-01',
 '2009-01-01',
 '2011-01-01',
 '2012-01-01',
 '2013-01-01',
 '2014-01-01',
 '2015-01-01']
In [47]:
dd_date = pd.to_datetime(dd_date_str)
dd_date
Out[47]:
DatetimeIndex(['2006-01-01', '2007-01-01', '2009-01-01', '2011-01-01',
               '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01'],
              dtype='datetime64[ns]', freq=None)
In [48]:
plt.plot(dd_date, dd.year, 'r-o')
plt.show()
In [49]:
ds = dd.cumsum()
ds
Out[49]:
year
2006 1
2007 35
2009 41
2011 47
2012 80
2013 384
2014 447
2015 467
In [84]:
d = data.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()

def getDate(dat):
    dat_date_str = map(lambda x: str(x) +'-01-01', dat.index)
    dat_date = pd.to_datetime(dat_date_str)
    return dat_date

ds.date = getDate(ds)
dd.date = getDate(dd)

plt.plot(ds_date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd_date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-84-319a44da2679> in <module>()
----> 1 d = data.year.value_counts()
      2 dd = pd.DataFrame(d)
      3 dd = dd.sort_index(axis=0, ascending=True)
      4 ds = dd.cumsum()
      5 

/Applications/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in __getattr__(self, name)
   2742             if name in self._info_axis:
   2743                 return self[name]
-> 2744             return object.__getattribute__(self, name)
   2745 
   2746     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'year'
In [51]:
dg = data.groupby('year').sum()
dg
Out[51]:
click reply followed_num fans_num post_num comment_num month day
year
2006 1214 24 0 2 278 291 8 24
2007 28290 514 22 137 8041 10344 281 512
2009 18644 186 17 12 531 571 39 78
2011 2889 28 84 28 332 661 50 72
2012 463720 5933 2779 59511 12315 32498 322 819
2013 63140 937 571 43265 24359 40362 2458 6111
2014 57764 772 2216 16664 11266 98025 233 579
2015 81164 436 1649 272391 11217 20186 80 193
In [54]:
dgs = dg.cumsum()
dgs
Out[54]:
click reply followed_num fans_num post_num comment_num month day
year
2006 1214 24 0 2 278 291 8 24
2007 29504 538 22 139 8319 10635 289 536
2009 48148 724 39 151 8850 11206 328 614
2011 51037 752 123 179 9182 11867 378 686
2012 514757 6685 2902 59690 21497 44365 700 1505
2013 577897 7622 3473 102955 45856 84727 3158 7616
2014 635661 8394 5689 119619 57122 182752 3391 8195
2015 716825 8830 7338 392010 68339 202938 3471 8388
In [55]:
def getDate(dat):
    dat_date_str = map(lambda x: str(x) +'-01-01', dat.index)
    dat_date = pd.to_datetime(dat_date_str)
    return dat_date

dg.date = getDate(dg)
In [56]:
fig = plt.figure(figsize=(12,5))
plt.plot(dg.date, dg.click, 'r-o', label = '$Yearly\:Number\:of\:Clicks$')
plt.plot(dg.date, dg.reply, 'g-s', label = '$Yearly\:Number\:of\:Replies$')
plt.plot(dg.date, dg.fans_num, 'b->', label = '$Yearly\:Number\:of\:Fans$')

plt.yscale('log')

plt.legend(loc=4,numpoints=1,fontsize=13)
plt.show()
In [57]:
data.groupby('year')['click'].sum()
Out[57]:
year
2006      1214
2007     28290
2009     18644
2011      2889
2012    463720
2013     63140
2014     57764
2015     81164
Name: click, dtype: int64
In [58]:
data.groupby('year')['click'].mean()
Out[58]:
year
2006     1214.000000
2007      832.058824
2009     3107.333333
2011      481.500000
2012    14052.121212
2013      207.697368
2014      916.888889
2015     4058.200000
Name: click, dtype: float64
In [59]:
repost = []
for i in df.title:
    if u'转载' in i.decode('utf8'):
        repost.append(1)
    else:
        repost.append(0)
In [60]:
df['repost'] = repost
In [61]:
df.groupby('repost').mean()
Out[61]:
click reply
repost
0 2884.381720 33.376344
1 641.743772 9.330961
In [62]:
df['click'][df['repost']==0][:5]
Out[62]:
0    194675
2     82779
3     45304
5     27026
6     24026
Name: click, dtype: int64
In [63]:
df['click'][df['repost']==1][:5]
Out[63]:
1     88244
4     38132
13     4990
16     3720
18     3421
Name: click, dtype: int64
In [64]:
from scipy import stats
stats.ttest_ind(data.click, data.repost)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-64-e42eb2d66fc0> in <module>()
      1 from scipy import stats
----> 2 stats.ttest_ind(data.click, data.repost)

/Applications/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in __getattr__(self, name)
   2742             if name in self._info_axis:
   2743                 return self[name]
-> 2744             return object.__getattribute__(self, name)
   2745 
   2746     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'repost'
In [65]:
sm.stats.ttest_ind(data.click, data.reply)
# test statistic, pvalue and degrees of freedom
Out[65]:
(2.9514887561591618, 0.0032417014839700789, 932.0)
In [66]:
from scipy.stats import chisquare
chisquare([16, 18, 16, 14, 12, 12], \
          f_exp=[16, 16, 16, 16, 16, 8])
Out[66]:
Power_divergenceResult(statistic=3.5, pvalue=0.62338762774958223)
In [67]:
from scipy.stats import chisqprob, chi2
# p_value = chi2.sf(chi_statistic, df)
print chisqprob(3.94,1), 1 - chi2.cdf(3.94,1)
0.0471507774946 0.0471507774946
In [68]:
print np.corrcoef(data.click, data.reply)

print np.corrcoef(np.log(data.click+1), \
                  np.log(data.reply+1))
[[ 1.          0.96396571]
 [ 0.96396571  1.        ]]
[[ 1.          0.77721397]
 [ 0.77721397  1.        ]]
In [69]:
data.corr()
Out[69]:
click reply followed_num fans_num post_num comment_num month year day
click 1.000000 0.963966 0.143595 0.158116 0.097502 0.085615 0.038788 -0.024827 0.048361
reply 0.963966 1.000000 0.199270 0.159387 0.090342 0.123341 0.040165 -0.041208 0.058738
followed_num 0.143595 0.199270 1.000000 0.407656 0.211677 0.499612 -0.036037 0.051187 -0.020604
fans_num 0.158116 0.159387 0.407656 1.000000 0.341724 0.145387 -0.084243 0.102301 -0.045883
post_num 0.097502 0.090342 0.211677 0.341724 1.000000 0.514695 -0.070024 -0.011786 -0.033254
comment_num 0.085615 0.123341 0.499612 0.145387 0.514695 1.000000 -0.118703 0.069160 -0.119840
month 0.038788 0.040165 -0.036037 -0.084243 -0.070024 -0.118703 1.000000 -0.236920 0.535354
year -0.024827 -0.041208 0.051187 0.102301 -0.011786 0.069160 -0.236920 1.000000 -0.046699
day 0.048361 0.058738 -0.020604 -0.045883 -0.033254 -0.119840 0.535354 -0.046699 1.000000
In [70]:
plt.plot(df.click, df.reply, 'r-o')
plt.show()
In [71]:
plt.plot(df.click, df.reply, 'gs')
plt.xlabel('$Clicks$', fontsize = 20)
plt.ylabel('$Replies$', fontsize = 20)
plt.xscale('log')
plt.yscale('log')
plt.title('$Allowmetric\,Law$', fontsize = 20)
plt.show()
In [72]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [73]:
# Load data
dat = sm.datasets.get_rdataset("Guerry", "HistData").data
# Fit regression model (using the natural log of one of the regressors)
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', \
                  data=dat).fit()
In [74]:
# Inspect the results
print results.summary()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                Lottery   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     22.20
Date:                Sun, 14 May 2017   Prob (F-statistic):           1.90e-08
Time:                        17:28:36   Log-Likelihood:                -379.82
No. Observations:                  86   AIC:                             765.6
Df Residuals:                      83   BIC:                             773.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
Intercept         246.4341     35.233      6.995      0.000       176.358   316.510
Literacy           -0.4889      0.128     -3.832      0.000        -0.743    -0.235
np.log(Pop1831)   -31.3114      5.977     -5.239      0.000       -43.199   -19.424
==============================================================================
Omnibus:                        3.713   Durbin-Watson:                   2.019
Prob(Omnibus):                  0.156   Jarque-Bera (JB):                3.394
Skew:                          -0.487   Prob(JB):                        0.183
Kurtosis:                       3.003   Cond. No.                         702.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [75]:
reg = smf.ols('reply ~ click + followed_num', \
              data=data).fit()
In [76]:
print reg.summary()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  reply   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.933
Method:                 Least Squares   F-statistic:                     3231.
Date:                Sun, 14 May 2017   Prob (F-statistic):          4.30e-273
Time:                        17:29:00   Log-Likelihood:                -2354.7
No. Observations:                 467   AIC:                             4715.
Df Residuals:                     464   BIC:                             4728.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept       -1.4024      1.766     -0.794      0.428        -4.873     2.068
click            0.0125      0.000     78.660      0.000         0.012     0.013
followed_num     0.0749      0.015      5.117      0.000         0.046     0.104
==============================================================================
Omnibus:                      374.515   Durbin-Watson:                   1.938
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            97373.297
Skew:                          -2.416   Prob(JB):                         0.00
Kurtosis:                      73.575   Cond. No.                     1.14e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.14e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [77]:
reg1 = smf.ols('np.log(reply+1) ~ np.log(click+1) \
+np.log(followed_num+1)+month', data=data).fit()
print reg1.summary()
                            OLS Regression Results                            
==============================================================================
Dep. Variable:      np.log(reply + 1)   R-squared:                       0.606
Model:                            OLS   Adj. R-squared:                  0.603
Method:                 Least Squares   F-statistic:                     236.9
Date:                Sun, 14 May 2017   Prob (F-statistic):           4.03e-93
Time:                        17:29:07   Log-Likelihood:                -596.73
No. Observations:                 467   AIC:                             1201.
Df Residuals:                     463   BIC:                             1218.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
============================================================================================
                               coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------
Intercept                   -2.6009      0.189    -13.778      0.000        -2.972    -2.230
np.log(click + 1)            0.6872      0.029     24.083      0.000         0.631     0.743
np.log(followed_num + 1)     0.0118      0.034      0.347      0.729        -0.055     0.079
month                        0.0172      0.013      1.275      0.203        -0.009     0.044
==============================================================================
Omnibus:                       26.408   Durbin-Watson:                   1.904
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               44.572
Skew:                          -0.389   Prob(JB):                     2.10e-10
Kurtosis:                       4.299   Cond. No.                         44.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [78]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(reg1, fig = fig)
plt.show()
In [79]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

moore = sm.datasets.get_rdataset("Moore", "car",
                                 cache=True) # load data
data = moore.data
data = data.rename(columns={"partner.status" :
                             "partner_status"}) # make name pythonic
In [80]:
data[:5]
Out[80]:
partner_status conformity fcategory fscore
0 low 8 low 37
1 low 4 high 57
2 low 8 high 65
3 low 7 low 20
4 low 10 low 36
In [81]:
moore_lm = ols('conformity ~ C(fcategory, Sum)*C(partner_status, Sum)',
                 data=data).fit()
In [82]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_partregress_grid(moore_lm, fig = fig)
plt.show()
In [83]:
table = sm.stats.anova_lm(moore_lm, typ=2) # Type 2 ANOVA DataFrame
print table
                                              sum_sq    df          F  \
C(fcategory, Sum)                          11.614700   2.0   0.276958   
C(partner_status, Sum)                    212.213778   1.0  10.120692   
C(fcategory, Sum):C(partner_status, Sum)  175.488928   2.0   4.184623   
Residual                                  817.763961  39.0        NaN   

                                            PR(>F)  
C(fcategory, Sum)                         0.759564  
C(partner_status, Sum)                    0.002874  
C(fcategory, Sum):C(partner_status, Sum)  0.022572  
Residual                                       NaN