In [28]:
import os
import sys
import pandas as pd
import yaml 
from matplotlib import pyplot as plt
from matplotlib import ticker as mticker
from matplotlib import colors as mcolors
from matplotlib import patches as mpatches
import statsmodels.api as sm
import numpy as np
from itertools import product
import subprocess
import networkx as nx

# Load machine-specific settings from the local YAML config. The path is
# relative, so it is resolved against the kernel's current working directory.
with open("../../config.yaml.local", "r") as f:
    LOCAL_CONFIG = yaml.safe_load(f)
# The shared (non-local) config is currently disabled; nothing in this cell
# reads a CONFIG object.
#with open("../../config.yaml", "r") as f:
#    CONFIG = yaml.safe_load(f)
# Extend the import search path before the project imports below.
# NOTE(review): presumably ../python is where `globals` and `data_tools`
# live — confirm. This line must stay ahead of those imports.
sys.path.append("../python")

import globals
import data_tools as dt

# Paths taken from the local config; a missing key raises KeyError here.
LOCAL_PATH = LOCAL_CONFIG["LOCAL_PATH"]
RAW_DATA_PATH = LOCAL_CONFIG["RAW_DATA_PATH"]
DATA_PATH = LOCAL_CONFIG["DATA_PATH"]
R_PATH = LOCAL_CONFIG["R_PATH"]

# Notebook-wide switches, both off by default.
# NOTE(review): neither flag is read in the cells visible here — confirm where
# they are consumed (the parquet export below is not guarded by OVERWRITE).
RUN_R_SCRIPTS = False
OVERWRITE = False


In [29]:
# Fetch the posts table and the internal directed graph from the project
# data helpers.
posts = dt.get_posts()
DG = dt.get_internal_digraph()

# Tabulate every graph node with its in- and out-degree. Node ids are cast to
# int before the join below.
# NOTE(review): assumes posts['itemId'] is an integer column — confirm.
graph_nodes = list(DG.nodes)
node_df = pd.DataFrame({
    'itemId': graph_nodes,
    'in_degree': [DG.in_degree(node) for node in graph_nodes],
    'out_degree': [DG.out_degree(node) for node in graph_nodes],
}).astype({'itemId': int})

# Left join keeps every post; posts with no matching graph node come back
# with NaN degrees, which are replaced by 0 and cast back to int.
posts = posts.merge(node_df, on='itemId', how='left')
for degree_col in ('in_degree', 'out_degree'):
    posts[degree_col] = posts[degree_col].fillna(0).astype(int)

In [30]:
# Row filter: keep posts whose invoice state is not 'FAILED', that are not
# flagged bio / freebie / saloon, and that are outside the excluded subs.
# NOTE(review): `~` assumes bio, freebie and saloon are boolean columns
# without missing values — confirm.
excluded_subs = ['jobs', 'ama']
mask = (
    (posts['invoiceActionState'] != 'FAILED')
    & ~posts['bio']
    & ~posts['freebie']
    & ~posts['saloon']
    & ~posts['subName'].isin(excluded_subs)
)
posts = posts.loc[mask].reset_index(drop=True)

# Derived per-post metrics. Missing text becomes '' first so the string
# operations below see a str in every row.
posts['text'] = posts['text'].fillna('')
posts['num_img_or_links'] = posts['text'].apply(dt.count_image_or_links)
posts['num_words'] = posts['text'].apply(lambda body: len(body.split()))

# A link post has a non-null, non-empty url; it is "link only" when its text
# is empty after stripping whitespace.
posts['is_link_post'] = posts['url'].notna() & posts['url'].ne('')
posts['link_only'] = posts['is_link_post'] & posts['text'].str.strip().eq('')

# Columns retained for the exported dataset, in output order.
keep_cols = [
    'itemId',
    'userId',
    'subName',
    'created_at',
    'title',
    'text',
    'url',
    'cost',
    'sats48',
    'comments48',
    'downsats48',
    'downzappers48',
    'num_img_or_links',
    'num_words',
    'is_link_post',
    'link_only',
    'in_degree',
    'out_degree',
]

posts = posts[keep_cols]

In [31]:
# Write the filtered posts table to parquet under DATA_PATH, without the
# DataFrame index.
output_path = os.path.join(DATA_PATH, "objective_quality_analysis.parquet")
posts.to_parquet(output_path, index=False)