In [1]:
%matplotlib inline

from IPython.display import display, HTML
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from tqdm import tqdm_notebook as tqdm

from collections import Counter
import operator
import sys

In [2]:
def normalize(series):
    return sklearn.preprocessing.quantile_transform(
            series.values.reshape(-1, 1), copy=True)

In [3]:
authors_df = pd.read_csv("../data/processed/author-label-state.csv.bz2", index_col=0)

authors_df = authors_df.merge(pd.read_csv("../data/processed/states-extra.csv"), how='left')

authors_df.columns = [c.replace(' ', '') for c in authors_df.columns]
len(authors_df)

121046

In [4]:
Counter(authors_df.label)

Counter({'R': 110806, 'D': 10240})

In [5]:
author2label = dict(authors_df[['author', 'label']].values)

In [6]:
graph_df = pd.read_csv(
    "../data/processed/parent_child_sentiment_edges_politics.csv.bz2", index_col=0)
graph_df.head()

Unnamed: 0,parent,child,sentiment
0,fae83c5c68bc706c,441290880bbb7490,0.0
1,ffe92102186d8670,e3f88b3430d253df,0.2732
2,d4f08edfcca59643,35514c4898239466,0.0
3,be5f3b8eb7b8f85c,370cf738e4f3b228,0.7436
4,be5f3b8eb7b8f85c,7b5fc1d8ece515e5,0.3182


In [7]:
graph_df['lparent'] = graph_df.parent.apply(author2label.get)
graph_df['lchild'] = graph_df.child.apply(author2label.get)

In [8]:
authors = set(graph_df.parent) | set(graph_df.child)
len(authors), len(authors) ** 2

(31218, 974563524)

In [9]:
edges = list(map(tuple, graph_df[['parent', 'child']].values))
len(edges)

716765

In [10]:
Counter([author2label[c] for p, c in edges])

Counter({'R': 469713, 'D': 247052})

In [11]:
#edge-based sampling

np.random.seed(123)
heads, tails = np.array(edges).T
edges_set = set(edges)
non_edges = list()
num_non_edges = len(edges)
pbar = tqdm(total=num_non_edges)
while len(non_edges) < num_non_edges:
    sample_size = min(2 ** 17, num_non_edges - len(non_edges))
    head_samples = np.random.choice(heads, sample_size)
    tail_samples = np.random.choice(tails, sample_size)
    sample = map(tuple, np.array([head_samples, tail_samples]).T)
    new_non_edges = [e for e in sample if e not in edges_set]
    non_edges += new_non_edges
    pbar.update(len(new_non_edges))
pbar.close()
del edges_set
len(non_edges)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/716765 [00:00<?, ?it/s]

716765

In [12]:
Counter([author2label[c] for p, c in non_edges])

Counter({'R': 476584, 'D': 240181})

In [13]:
matrix = np.array([
    (
        parent,
        child,
        int(author2label[child] == 'D'),
        int(author2label[parent] != author2label[child]),
        int(y)
    )
    for y, examples in [(True, edges), (False, non_edges)]
    for parent, child in tqdm(examples, desc=f"Evaluating {y} examples")
])

np.random.seed(123)
balanced_examples = pd.DataFrame(np.random.permutation(matrix), columns=[
    "parent", "child", "dem_author", "diff_label", "is_link"
])

balanced_examples['dem_author'] = balanced_examples.dem_author.astype(np.int64, copy=False)
balanced_examples['diff_label'] = balanced_examples.diff_label.astype(np.int64, copy=False)
balanced_examples['is_link'] = balanced_examples.is_link.astype(np.int64, copy=False)

examples = balanced_examples


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


Evaluating True examples:   0%|          | 0/716765 [00:00<?, ?it/s]

Evaluating False examples:   0%|          | 0/716765 [00:00<?, ?it/s]

In [14]:
examples[['dem_author', 'diff_label', 'is_link']].mean()

dem_author    0.339883
diff_label    0.470527
is_link       0.500000
dtype: float64

In [15]:
authors_df['clintonShare'] = authors_df.clintonVotes / authors_df.totalVotes
authors_df['trumpShare'] = authors_df.trumpVotes / authors_df.totalVotes
authors_df['nonvote'] = 1. - (authors_df.trumpVotes + authors_df.clintonVotes) / authors_df.totalVotes

In [16]:
authors_df['GiniCoefficient_q'] = normalize(authors_df['GiniCoefficient'])
authors_df['income_q'] = normalize(authors_df['income'])
authors_df['clintonShare_q'] = normalize(authors_df['clintonShare'])
authors_df['trumpShare_q'] = normalize(authors_df['trumpShare'])
authors_df['nonvote_q'] = normalize(authors_df['nonvote'])
authors_df['with_diploma_q'] = normalize(authors_df['with_diploma'])
authors_df['unemployment2016_q'] = normalize(authors_df['unemployment2016'])

In [17]:
scores = pd.read_csv(
        "../data/processed/scores-of-authors-in-politics-from-author-label-state.csv.bz2")

scores['frac_pos'] = scores.num_comm_pos / scores.num_comm

scores['num_comm_q'] = normalize(scores['num_comm'])
scores['avg_score_q'] = normalize(scores['avg_score'])
scores['frac_pos_q'] = normalize(scores['frac_pos'])

In [18]:
dataset = (examples
           .merge(authors_df, left_on='child', right_on='author', how='left')
           .drop(columns=['author'])
           .merge(authors_df[['author', 'state']].rename(columns={'state': 'state_parent'}),
                  left_on='parent', right_on='author', how='left')
           .drop(columns=['author'])
           .merge(scores[['author', 'num_comm_q', 'avg_score_q', 'frac_pos_q']],
               left_on='child', right_on='author', how='left')
           .merge(scores[['author', 'num_comm_q', 'avg_score_q', 'frac_pos_q']],
               left_on='parent', right_on='author', how='left', suffixes=['_c', '_p'])
           .drop(columns=['author_c', 'author_p', 'parent', 'child', 'label'])
)

dataset['same_state'] = (dataset.state == dataset.state_parent).astype(np.float64)
dataset = dataset.drop(columns=['state', 'state_parent'])

In [19]:
dataset['diff_avg_score_q'] = np.abs(dataset['avg_score_q_c'] - dataset['avg_score_q_p'])
dataset['diff_frac_pos_q'] = np.abs(dataset['frac_pos_q_c'] - dataset['frac_pos_q_p'])
dataset['diff_num_comm_q'] = np.abs(dataset['num_comm_q_c'] - dataset['num_comm_q_p'])

In [20]:

pvalue2asterisk = lambda x: '***' if x < 0.001 else '*' if x < 0.05 else ''
    
def add_results_to_table(res, df=None):
#     main_stat = pd.Series('{:.5f}'.format(res.prsquared), index=['Pseudo-$R^2$'])
    main_stat = pd.Series([res.aic], index=['AIC'])
    logits = np.exp(res.params)
#     logits = res.tvalues # Which actually is Z values
    r = pd.concat([res.pvalues, logits], axis=1)
    new_row = r.transpose().apply(lambda x: '{:.3f}'.format(x[1]) + pvalue2asterisk(x[0]))
    new_row = pd.concat([main_stat, new_row])
    if df is None:
        return pd.DataFrame([new_row])
    else:
        return df.append(new_row, ignore_index=True)

In [21]:
base_formula = "is_link ~ dem_author + diff_label + dem_author*diff_label"
res = smf.logit(formula=base_formula,
               data=dataset.astype(np.float64)).fit()

df = add_results_to_table(res)

Optimization terminated successfully.
         Current function value: 0.691976
         Iterations 3


In [22]:
coef_int, coef_dem, coef_diff, coef_dem_diff = res.params.values
matrix_index = pd.MultiIndex.from_tuples(
    [('C', 'C'), ('C', 'T'), ('T', 'C'), ('T', 'T')],
    names=['child', 'parent'])
base_matrix = np.exp(pd.DataFrame(index=matrix_index, columns=['Odds ratio'], data=[
    coef_int + coef_dem,
    coef_int + coef_dem + coef_diff + coef_dem_diff,
    coef_int + coef_diff,
    coef_int
]))
base_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,Odds ratio
child,parent,Unnamed: 2_level_1
C,C,0.871095
C,T,1.107465
T,C,1.105292
T,T,0.924984


In [23]:
with open("../logit-matrix.tex", "w") as f:
    f.write(
r"""\bordermatrix{ & T & C \cr
      T & %.3f & %.3f \cr
      C & %.3f & %.3f  } 
""" % (base_matrix.loc[('T', 'T')], base_matrix.loc[('T', 'C')],
       base_matrix.loc[('C', 'T')], base_matrix.loc[('C', 'C')])    
)

In [24]:
other_reddit_variables = ["avg_score_q", "frac_pos_q"]
for variable in tqdm(other_reddit_variables):
    res = smf.logit(formula=base_formula + f" + {variable}_c + {variable}_p + diff_{variable}",
                   data=dataset.astype(np.float64)).fit()
#     display(res.summary())
    df = add_results_to_table(res, df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/2 [00:00<?, ?it/s]

Optimization terminated successfully.
         Current function value: 0.691734
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691213
         Iterations 4


In [25]:
full_reddit_variables = "".join([f" + {variable}_c + {variable}_p + diff_{variable}"
                           for variable in other_reddit_variables])

In [26]:
res = smf.logit(formula=base_formula + full_reddit_variables,
               data=dataset.astype(np.float64)).fit()
# display(res.summary())
df = add_results_to_table(res, df)

Optimization terminated successfully.
         Current function value: 0.690939
         Iterations 4


In [27]:
variable_rename = {
    'dem_author': 'Clinton sup.',
    'diff_label': 'Cross-group',
    'avg_score_q': 'Avg. score',
    'frac_pos_q': 'Frac. positive',
    'num_comm_q': 'No. comments',
    'same_state': 'Same state',
    'swing': 'Swing state',
    'clintonShare_q': 'Clinton share',
    'trumpShare_q': 'Trump share',
    'nonvote_q': 'Non-vote share',
    'unemployment2016_q': 'Unemployment',
    'GiniCoefficient_q': 'Gini Coefficient',
    'income_q': 'Median Income',
    'with_diploma_q': 'High school'
}

def rename(x):
    if ':' in x:
        return ', '.join(map(rename, x.split(':')))
    if x in variable_rename:
        return variable_rename[x]
    if x[:-2] in variable_rename:
        return variable_rename[x[:-2]] + (" (author)" if x[-2:] == '_c' else " (target)")
    if x[:5] == 'diff_':
        return "Diff. " + rename(x[5:]).lower()
    return x


In [28]:
base_AIC = df.AIC[0]

In [29]:
df['$\Delta$ AIC'] = base_AIC - df.AIC

In [30]:
table1 = df[df.columns[2:]].rename(columns={k: rename(k) for k in df.columns}).transpose().fillna('')
table1

Unnamed: 0,0,1,2,3
Clinton sup.,0.942***,0.918***,0.936***,0.911***
Cross-group,1.195***,1.172***,1.191***,1.165***
"Clinton sup., Cross-group",1.064***,1.091***,1.070***,1.102***
Avg. score (author),,1.166***,,1.174***
Avg. score (target),,1.151***,,1.167***
Diff. avg. score,,1.213***,,1.228***
Diff. frac. positive,,,0.497***,0.498***
Frac. positive (author),,,1.260***,1.247***
Frac. positive (target),,,1.221***,1.195***
$\Delta$ AIC,0,686.799,2179.77,2960.21


In [31]:
with open("../logit-table-reddit.tex", "w") as f:
    latex = table1.to_latex(header=False, escape=False).replace(r"\toprule", r"""
\toprule
Variable name &  \multicolumn{%d}{c}{Odds ratios. * $p < 0.05$, *** $p < 0.001$} \\
\midrule
    """ % len(table1.columns)).replace("$\Delta$", r"\midrule $\Delta$")
    f.write(latex)

In [32]:
real_variables = [
     'same_state',
     'swing',
     'clintonShare_q',
     'trumpShare_q',
     'nonvote_q',
     'unemployment2016_q',
     'GiniCoefficient_q',
     'income_q',
     'with_diploma_q',
]

df = None
for variable in tqdm(real_variables):
    res = smf.logit(formula=(base_formula + full_reddit_variables +
                    f" + {variable} + {variable}*diff_label"),
               data=dataset.astype(np.float64)).fit()
    if variable == 'same_state':
        df = add_results_to_table(res, df)
    else:
        res = smf.logit(formula=(base_formula + full_reddit_variables +
                    f" + same_state + same_state*diff_label + {variable} + {variable}*diff_label"),
               data=dataset.astype(np.float64)).fit()
        df = add_results_to_table(res, df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/9 [00:00<?, ?it/s]

Optimization terminated successfully.
         Current function value: 0.690843
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690869
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690737
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690867
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690738
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690866
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690737
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690866
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690734
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690869
  

In [33]:
table2 = df[df.columns[2:]].rename(columns={k: rename(k) for k in df.columns}).transpose().fillna('')

In [34]:
with open("../logit-table-realworld.tex", "w") as f:
    latex = table2.to_latex(header=False, escape=False).replace(r"\toprule", r"""
\toprule
Variable name &  \multicolumn{%d}{c}{Odds ratios. * $p < 0.05$, *** $p < 0.001$} \\
\midrule
    """ % len(table2.columns)).replace("$\Delta$", r"\midrule $\Delta$")
    f.write(latex)

In [35]:
table2

Unnamed: 0,0,1,2,3,4,5,6,7,8
Clinton sup.,0.909***,0.884***,0.883***,0.883***,0.883***,0.884***,0.883***,0.883***,0.884***
Cross-group,1.172***,1.156***,1.134***,1.166***,1.117***,1.162***,1.163***,1.122***,1.145***
"Clinton sup., Cross-group",1.104***,1.162***,1.163***,1.162***,1.165***,1.164***,1.165***,1.162***,1.164***
Avg. score (author),1.176***,1.174***,1.175***,1.175***,1.175***,1.174***,1.174***,1.174***,1.173***
Avg. score (target),1.169***,1.151***,1.151***,1.151***,1.151***,1.151***,1.151***,1.151***,1.151***
Diff. avg. score,1.232***,1.208***,1.209***,1.209***,1.210***,1.209***,1.209***,1.209***,1.209***
Frac. positive (author),1.243***,1.208***,1.211***,1.211***,1.207***,1.209***,1.207***,1.212***,1.208***
Frac. positive (target),1.191***,1.178***,1.179***,1.179***,1.178***,1.179***,1.178***,1.180***,1.178***
Diff. frac. positive,0.501***,0.488***,0.488***,0.488***,0.488***,0.488***,0.488***,0.488***,0.488***
Same state,1.245***,1.243***,1.240***,1.241***,1.242***,1.239***,1.240***,1.241***,1.242***
