In [None]:
# Use case:
# When we are curious about how certain conditions (word lists) affect engagement metrics on social posts,
# we use a negative binomial model to measure the impact of each condition we designed.

In [None]:
import os

import numpy as np
import pandas as pd
import statsmodels as sm

In [None]:
# Read the pickled content frame that the rest of the notebook analyzes.
content_path = 'social_content/tw/Skittles_tw.pkl'
df = pd.read_pickle(content_path)

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Map the raw column names onto the generic names ('text', 'engagement')
# that the feature and modelling cells refer to.
column_aliases = {
    'full_text': 'text',
    'favorite_count': 'engagement',
}
df = df.rename(columns=column_aliases)
df.columns

In [None]:
# feature 1: whether a caption involves recipe (case-sensitive substring match)
# `na=False` marks captions with missing text as False directly and yields a
# boolean column, instead of patching NaNs afterwards with fillna on an
# object-dtype Series.
df['feature_recipe_post'] = df['text'].str.contains('recipe', na=False)

In [None]:
# feature 2: whether a caption involves a question
# Match the question mark literally (regex=False) rather than through the
# pattern '\?', which is an invalid escape sequence in a non-raw string literal.
# `na=False` marks captions with missing text as False.
df['feature_question'] = df['text'].str.contains('?', regex=False, na=False)

In [None]:
# feature 3: whether a caption contains emoji
import emoji
import re

# Emoji lookup table keyed by the emoji string itself. `emoji.UNICODE_EMOJI`
# was removed in emoji 2.0 in favour of `emoji.EMOJI_DATA`, so prefer the new
# name when it exists and only fall back to the legacy one otherwise.
# NOTE(review): emoji 1.x appears to key UNICODE_EMOJI by language code rather
# than by emoji — confirm against the installed version.
_EMOJI_CHARS = getattr(emoji, 'EMOJI_DATA', None) or emoji.UNICODE_EMOJI

# One alternation over every known emoji. The lookarounds require that the
# emoji is not directly adjacent to a word character, so an emoji glued to a
# word (e.g. "yum" immediately followed by an emoji) does not count as a match.
emoji_re = re.compile(
    r"(?<!\w)(?:{})(?!\w)".format('|'.join(re.escape(x) for x in _EMOJI_CHARS))
)


def extract_emojis(str):
    """Return the characters of `str` that are themselves emoji, concatenated.

    The check is per character, so only emoji that consist of a single
    character are picked up. The parameter name shadows the builtin `str`;
    it is kept unchanged so existing keyword callers keep working.
    """
    return ''.join(c for c in str if c in _EMOJI_CHARS)


def has_emoji(caption):
    """Return a boolean Series: True where the caption matches `emoji_re`.

    `caption` is a pandas Series of strings; entries with missing text are
    reported as False (`na=False`).
    """
    return caption.str.contains(emoji_re, na=False)


df['feature_emoji'] = has_emoji(df['text'])

In [None]:
# feature 4: whether a caption contains words related to certain topics
# Each file in the wordlist directory holds one comma-separated wordlist; the
# part of the filename after 'skittles_' becomes the key for that list.
wordlist_dir = 'skittle_wordlist/v1/'
words_dict = {}
# Sorted so the resulting feature columns come out in a reproducible order.
for filename in sorted(os.listdir(wordlist_dir)):
    # Skip stray files (e.g. hidden OS files) that do not follow the naming
    # scheme instead of failing with an IndexError on the split below.
    if 'skittles_' not in filename:
        continue
    keyname = filename.split('skittles_')[1]
    # Use a context manager so the file handle is closed deterministically.
    with open(wordlist_dir + filename) as wordlist_file:
        raw_words = wordlist_file.read().split(',')
    # Strip surrounding whitespace/newlines and drop empty entries: when the
    # words are later joined with '|' into a regex, an empty word would become
    # an empty alternative that matches every caption.
    words_dict[keyname] = [word.strip() for word in raw_words if word.strip()]

In [None]:
# Collapse each wordlist into a single alternation pattern ("w1|w2|...").
for key, words in words_dict.items():
    # Guard against re-running this cell: a value that is already a joined
    # string would otherwise be split into single characters and re-joined,
    # producing a pattern that matches almost any caption.
    if not isinstance(words, str):
        words_dict[key] = '|'.join(words)
# NOTE(review): the words are used as raw regex fragments (not escaped), so a
# word containing regex metacharacters changes the pattern — confirm this is intended.
# still need to finish this loop

In [None]:
# One boolean feature column per wordlist: True when the caption matches the
# wordlist's pattern. `na=False` marks captions with missing text as False
# directly instead of patching NaNs afterwards with fillna.
for wordlist, wordlist_regex in words_dict.items():
    feature_name = 'feature_' + wordlist
    df[feature_name] = df['text'].str.contains(wordlist_regex, na=False)

In [None]:
# Summarize how many posts fall under each feature category.
feature_columns = [column for column in df.columns if 'feature_' in column]
df[feature_columns].eq(True).sum()

In [None]:
# Number of rows in the frame.
len(df)

In [None]:
# Model the features that have significant volume, to evaluate which of them have a meaningful impact on engagement.

from statsmodels.discrete.discrete_model import NegativeBinomial
from statsmodels.tools.tools import add_constant

# Design matrix: the selected indicator features cast to 0/1 integers, with a
# constant column added by add_constant. `astype(int)` replaces the deprecated
# element-wise `applymap(int)`.
model_features = [
    'feature_question',
    'feature_moments_words',
    'feature_marketing_and_shopping_words',
]
X = add_constant(df[model_features].astype(int))

# Response variable: the engagement column.
y = df['engagement']

In [None]:
# Negative binomial (NB2) regression of engagement on the selected features.
model = NegativeBinomial(
    endog=y,
    exog=X,
    loglike_method='nb2',
    missing='drop',  # observations containing NaNs are dropped
)

# Fit with the BFGS optimizer.
fit = model.fit(method='bfgs')

fit.summary()

In [None]:
# Exponentiate the confidence-interval bounds of the fitted parameters.
# Uses numpy directly: the `pd.np` alias no longer exists in current pandas,
# and `applymap` is deprecated.
# NOTE(review): if the table includes the dispersion parameter (`alpha`), its
# exponentiated bounds are presumably not meaningful — confirm before reporting.
np.exp(fit.conf_int())

In [None]:
# Persist the fitted results object so it can be reloaded later and summarized
# again. Save the results themselves rather than `fit.summary`, which is a
# bound method (it was never called), not a rendered summary.
fit.save('skittles_tw_modelresult.pkl')