Notebook purpose

- Determine appropriate model specifications

In [1]:
import os
import sys

import linearmodels as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Make the local `entropy` project checkout importable. The location can be
# overridden with the ENTROPY_PROJECT_DIR environment variable so the notebook
# also runs on other machines; the default is the original hard-coded path.
sys.path.append(os.environ.get("ENTROPY_PROJECT_DIR", "/Users/fgu/dev/projects/entropy"))
import entropy.data.cleaners as cl
import entropy.data.creators as cr
import entropy.figures.figures as figs
import entropy.helpers.aws as ha
import entropy.helpers.data as hd
import entropy.helpers.helpers as hh
from entropy import config

# Plot styling and pandas display limits for this notebook.
sns.set_style("whitegrid")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# None disables truncation of long cell contents.
pd.set_option("max_colwidth", None)
# High-resolution inline figures.
%config InlineBackend.figure_format = 'retina'
# Pick up edits to imported project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

# S3 filesystem handle using the named profile "3di".
# NOTE(review): `fs` is not referenced in the cells visible here — confirm it is used further down.
fs = s3fs.S3FileSystem(profile="3di")

In [None]:
# Read the samples; the helper's return value is unpacked into `dfs` and `df`.
# NOTE(review): presumably "777" and "XX7" are sample identifiers — confirm
# against entropy.helpers.data.read_samples.
dfs, df = hd.read_samples(["777", "XX7"])
# Summarise the loaded frame via the project's inspect helper.
hd.inspect(df)

Time for read_sample    : 2.13 seconds


## Entropy - within vs between variation

In [201]:
def user_month_data(df):
    """Return one row per user and calendar month, indexed by (user_id, date).

    Only the columns `user_id`, `date` and `entropy_sptac` are kept. Rows are
    grouped by user and by the month of `date`, and the first non-null
    `entropy_sptac` value of each group is retained. In the result, `date`
    is the month-end label of the group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `user_id`, a datetime-like `date` column and
        `entropy_sptac`.

    Returns
    -------
    pandas.DataFrame
        Frame with a (`user_id`, `date`) MultiIndex. Both keys are also kept
        as regular columns (`drop=False`).
    """
    columns = ["user_id", "date", "entropy_sptac"]
    # Use the offset object rather than the "M" string alias: the alias is
    # deprecated in recent pandas releases (renamed to "ME"), whereas
    # MonthEnd() means the same thing in every version.
    month = pd.Grouper(key="date", freq=pd.offsets.MonthEnd())
    return (
        df[columns]
        .groupby(["user_id", month])
        .first()
        .reset_index()
        .set_index(["user_id", "date"], drop=False)
    )

# Build the user-month panel from `dfs` and preview its first two rows.
data = user_month_data(dfs)
data.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,date,entropy_sptac
user_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
777,2012-01-31,777,2012-01-31,2.871019
777,2012-02-29,777,2012-02-29,2.838295


Variation in entropy between users accounts for about 54 percent of total variation: regressing entropy on user dummies alone gives an R-squared of 0.54.

In [200]:
# Regress entropy on user dummies only; the R-squared is then the share of
# total variation explained by differences between users.
# The model is fitted on `data`, the (user_id, date)-indexed user-month frame
# built above. The previously referenced `idata` is not defined anywhere in
# this notebook, so the cell raised a NameError on a fresh kernel.
resPanelOLS = lm.PanelOLS.from_formula("entropy_sptac ~ C(user_id)", data=data).fit()
print(lm.panel.compare([resPanelOLS], stars=True))

                   Model Comparison                  
                              Model 0         Model 1
-----------------------------------------------------
Dep. Variable           entropy_sptac   entropy_sptac
Estimator                   PooledOLS        PanelOLS
No. Observations                  788             788
Cov. Est.                  Unadjusted      Unadjusted
R-squared                      0.5425          0.5425
R-Squared (Within)             0.0000          0.0000
R-Squared (Between)            1.0000          1.0000
R-Squared (Overall)            0.5425          0.5425
F-statistic                    36.139          36.139
P-value (F-stat)               0.0000          0.0000
C(user_id)[777]             2.7645***       2.7645***
                             (165.49)        (165.49)
C(user_id)[58777]           2.7704***       2.7704***
                             (69.666)        (69.666)
C(user_id)[76777]           2.6736***       2.6736***
                            