In [1]:
import os
import sys
import pandas as pd
import yaml 
from matplotlib import pyplot as plt
from matplotlib import ticker as mticker
from matplotlib import colors as mcolors
from matplotlib import patches as mpatches
import statsmodels.api as sm
import numpy as np
from itertools import product
import subprocess
import networkx as nx

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.linear_model import LinearRegression

from xgboost import XGBRegressor, XGBClassifier

import shap

# Machine-specific settings live in config.yaml.local; shared settings in config.yaml.
with open("../../config.yaml.local", "r") as local_cfg_file:
    LOCAL_CONFIG = yaml.safe_load(local_cfg_file)

with open("../../config.yaml", "r") as shared_cfg_file:
    CONFIG = yaml.safe_load(shared_cfg_file)

# Make the project's helper modules in ../python importable from this notebook.
sys.path.append("../python")

import globals
import data_tools as dt
import utils
import emb

# Filesystem locations, all read from the machine-local config.
LOCAL_PATH, RAW_DATA_PATH, DATA_PATH, R_PATH = (
    LOCAL_CONFIG[config_key]
    for config_key in ("LOCAL_PATH", "RAW_DATA_PATH", "DATA_PATH", "R_PATH")
)

# Notebook-level switches.
RUN_R_SCRIPTS = False
OVERWRITE = False


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the post-level analysis table from the project's data_tools helper.
df = dt.get_post_quality_analysis_data()

# Discard rows whose title is the 'deleted by author' placeholder and renumber the index.
keep_mask = df['title'].ne('deleted by author')
df = df[keep_mask].reset_index(drop=True)

print(df.shape[0])

  return x.dt.to_period('W-SAT').dt.start_time
  return x.dt.to_period('W-SAT').dt.start_time


191334


In [3]:
# One unit-length embedding per distinct territory name (subName).
subs = df['subName'].unique().tolist()
sub_embeddings = {}
for sub in subs:
    raw_vec = np.array(emb.get_embedding_robust(sub))
    # Scale to unit L2 norm.
    sub_embeddings[sub] = raw_vec / np.linalg.norm(raw_vec)

In [4]:
def _unit_embedding(content):
    """Fetch the embedding for `content` and scale it to unit L2 norm."""
    vec = np.array(emb.get_embedding_robust(content))
    return vec / np.linalg.norm(vec)


title_embeddings = []
text_embeddings = []

# For each post, in row order: embed the title first, then the text.
for post_title, post_text in zip(df['title'], df['text']):
    title_embeddings.append(_unit_embedding(post_title))
    text_embeddings.append(_unit_embedding(post_text))

# Stack the per-post vectors into 2-D arrays (one row per post).
title_embeddings = np.array(title_embeddings)
text_embeddings = np.array(text_embeddings)

RuntimeError: Query interrupted

In [None]:
#for idx, row in df.iterrows():
#    title = row['title']
#    text = row['text']
#    sub = row['subName']
#    
#    title_embedding = np.array(emb.get_embedding_robust(title))
#    title_embedding = title_embedding / np.linalg.norm(title_embedding)
#    
#    text_embedding = np.array(emb.get_embedding_robust(text))
#    text_embedding = text_embedding / np.linalg.norm(text_embedding)#
#
#    title_cos_dist = 1 - np.dot(title_embedding, sub_embeddings[sub])
#    text_cos_dist = 1 - np.dot(text_embedding, sub_embeddings[sub])
#
#    df.at[idx, 'title_cos_dist'] = title_cos_dist
#    df.at[idx, 'text_cos_dist'] = text_cos_dist

In [None]:
# All embedding lookups above are done; ask the emb helper module to close its
# connections (what exactly is closed is defined in emb, not shown here).
emb.close_connections()

In [None]:
# Scree plot for the title embeddings: fit an unrestricted PCA and plot the
# explained-variance ratio of the first 50 components.
title_pca = PCA().fit(title_embeddings)
explained_variance = title_pca.explained_variance_ratio_[:50]
component_numbers = range(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_numbers, explained_variance, marker='o')
ax.set_title('Title Embeddings: PCA Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.grid()
#filename = os.path.join(LOCAL_PATH, 'figures', 'fig_scree_plot.pdf')
#plt.savefig(filename, bbox_inches='tight')
plt.show()



In [None]:
# Scree plot for the text embeddings: fit an unrestricted PCA and plot the
# explained-variance ratio of the first 50 components.
text_pca = PCA().fit(text_embeddings)
explained_variance = text_pca.explained_variance_ratio_[:50]
component_numbers = range(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_numbers, explained_variance, marker='o')
ax.set_title('Text Embeddings: PCA Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Explained Variance Ratio')
ax.grid()
#filename = os.path.join(LOCAL_PATH, 'figures', 'fig_scree_plot.pdf')
#plt.savefig(filename, bbox_inches='tight')
plt.show()



In [None]:
# Number of principal components kept for each embedding type.
TITLE_PCA_K = 20
TEXT_PCA_K = 20

# Seed for the PCA solver. With the default svd_solver='auto', scikit-learn may
# choose a randomized SVD for large inputs, in which case the components depend
# on random_state. Fixing it keeps columns such as text_emb_5 comparable across
# kernel restarts. 21 is the seed already used for the split and the XGB model.
PCA_RANDOM_STATE = 21

# The embedding loop is long-running and can be interrupted, leaving partial
# results behind. Fail early with a clear message instead of fitting PCA on
# arrays that no longer line up with df.
assert len(title_embeddings) == len(df), (
    f"title_embeddings has {len(title_embeddings)} rows but df has {len(df)}; "
    "re-run the embedding cell to completion"
)
assert len(text_embeddings) == len(df), (
    f"text_embeddings has {len(text_embeddings)} rows but df has {len(df)}; "
    "re-run the embedding cell to completion"
)

# Project title embeddings onto their first TITLE_PCA_K principal components
# and store each component as a title_emb_<k> column on df.
title_pca = PCA(n_components=TITLE_PCA_K, random_state=PCA_RANDOM_STATE)
title_pca.fit(title_embeddings)
title_pca_embeddings = title_pca.transform(title_embeddings)

for k in range(TITLE_PCA_K):
    df[f'title_emb_{k}'] = title_pca_embeddings[:, k]

# Same for the text embeddings, stored as text_emb_<k>.
text_pca = PCA(n_components=TEXT_PCA_K, random_state=PCA_RANDOM_STATE)
text_pca.fit(text_embeddings)
text_pca_embeddings = text_pca.transform(text_embeddings)

for k in range(TEXT_PCA_K):
    df[f'text_emb_{k}'] = text_pca_embeddings[:, k]

In [None]:
# Column names of the PCA components added to df earlier.
title_emb_cols = list(map('title_emb_{}'.format, range(TITLE_PCA_K)))
text_emb_cols = list(map('text_emb_{}'.format, range(TEXT_PCA_K)))

# Features: three post-level covariates followed by the title and text components.
feature_cols = [
    'num_words',
    'num_img_or_links',
    'is_link_post',
    *title_emb_cols,
    *text_emb_cols,
]

# Target: log(1 + sats48).
df['log_sats48'] = np.log1p(df['sats48'])

X = df.loc[:, feature_cols]
Y = df['log_sats48']

# NOTE(review): in the cells visible here the model is later fit on (X, Y), so
# this split appears unused -- confirm whether a held-out evaluation was intended.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=21)

xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=21,
)
model = XGBRegressor(**xgb_params)

In [None]:
# Fit the XGB regressor on the full feature matrix and target (X, Y), not on
# the X_train/Y_train split created earlier.
# NOTE(review): metrics later computed from model.predict(X) are therefore
# in-sample -- confirm this is intended.
model.fit(X, Y)

In [None]:
# Predict on the same rows that are passed in as X and summarise the fit.
Y_pred = model.predict(X)

r2 = r2_score(Y, Y_pred)
rmse = np.sqrt(mean_squared_error(Y, Y_pred))

print(
    "XGB Model",
    f"RMSE: {rmse:.4f}",
    f"R2: {r2:.4f}",
    f"N: {len(Y)}",
    sep="\n",
)

In [None]:
# OLS model for comparison: linear regression with intercept on the same X and Y,
# scored on the same rows it was fit on.
ols = LinearRegression(fit_intercept=True).fit(X, Y)
Y_pred_ols = ols.predict(X)

r2_ols = r2_score(Y, Y_pred_ols)
rmse_ols = np.sqrt(mean_squared_error(Y, Y_pred_ols))

print(
    "OLS Model",
    f"RMSE: {rmse_ols:.4f}",
    f"R2: {r2_ols:.4f}",
    sep="\n",
)

In [None]:
# Per-feature importances reported by the fitted model, largest first.
feature_importance = pd.Series(
    model.feature_importances_,
    index=model.feature_names_in_,
).sort_values(ascending=False)

# Collapse the PCA components into two groups; every other feature forms its own group.
group_of = {
    **dict.fromkeys(title_emb_cols, 'title_embeddings'),
    **dict.fromkeys(text_emb_cols, 'text_embeddings'),
}

fi = feature_importance.rename('importance').reset_index()
fi['feature_group'] = fi['index'].map(lambda feature: group_of.get(feature, feature))

# Total importance per group.
fi = (
    fi.groupby('feature_group')
    .agg(importance=('importance', 'sum'))
    .reset_index()
)

fi


In [None]:
# Build a SHAP explainer around the fitted XGB model and evaluate it on every
# row of X (the same matrix the model was fit on).
explainer = shap.Explainer(model)
shap_values = explainer(X)

In [None]:
# SHAP summary plot restricted to 7 features (max_display=7), matching the
# "Top 7" wording in the title below.
plt.figure(figsize=(6,4))
# show=False is passed so the title/xlabel calls below run before plt.show()
# (presumably shap draws onto the current pyplot figure -- TODO confirm).
shap.summary_plot(shap_values, X, max_display=7, show=False)
plt.title("Top 7 Features for XGB Model Predicting Zaps in First 48 Hours")
plt.xlabel("SHAP value (impact on model prediction)")
plt.show()

In [None]:
# Three randomly sampled posts whose text_emb_5 score lies above the 0.9 quantile.
# The sample is unseeded, so different posts are shown on each run.
print("Examples with high values of text_emb_5")
print("")

component = df['text_emb_5']
mydf = df[component > np.quantile(component, 0.9)].reset_index(drop=True)

for _, post in mydf.sample(3).iterrows():
    print("-------------")
    print(f"Title: {post['title']}")
    print(f"Territory: {post['subName']}")
    print(f"URL: {post['url']}")
    print("")
    print(post['text'])
    print("")

In [None]:
# Number of posts whose sats48 value is strictly positive.
(df['sats48']>0).sum()

In [None]:
# Three randomly sampled posts whose text_emb_5 score lies below the 0.1 quantile.
# The sample is unseeded, so different posts are shown on each run.
print("Examples with low values of text_emb_5")
print("")

component = df['text_emb_5']
mydf = df[component < np.quantile(component, 0.1)].reset_index(drop=True)

for _, post in mydf.sample(3).iterrows():
    print("-------------")
    print(f"Title: {post['title']}")
    print(f"Territory: {post['subName']}")
    print(f"URL: {post['url']}")
    print("")
    print(post['text'])
    print("")

In [None]:
# Three randomly sampled posts whose text_emb_0 score lies above the 0.9 quantile.
# The sample is unseeded, so different posts are shown on each run.
print("Examples with high values of text_emb_0")
print("")

component = df['text_emb_0']
mydf = df[component > np.quantile(component, 0.9)].reset_index(drop=True)

for _, post in mydf.sample(3).iterrows():
    print("-------------")
    print(f"Title: {post['title']}")
    print(f"Territory: {post['subName']}")
    print(f"URL: {post['url']}")
    print("")
    print(post['text'])
    print("")

In [None]:
# Three randomly sampled posts from the low tail of text_emb_0.
# The cutoff is the 0.1 quantile, mirroring the other "low values" cells. The
# previous 0.9 cutoff kept every row below the 90th percentile (about 90% of
# posts), so the samples were not low-scoring examples.
print("Examples with low values of text_emb_0")
print("")

mydf = df.loc[ df['text_emb_0'] < np.quantile(df['text_emb_0'], 0.1) ].reset_index(drop=True)

# The sample is unseeded, so different posts are shown on each run.
for idx, row in mydf.sample(3).iterrows():
    print(f"-------------")
    print(f"Title: {row['title']}")
    print(f"Territory: {row['subName']}")
    print(f"URL: {row['url']}")
    print('')
    print(row['text'])
    print('')

In [None]:
# Three randomly sampled posts whose text_emb_7 score lies above the 0.9 quantile.
# The sample is unseeded, so different posts are shown on each run.
print("Examples with high values of text_emb_7")
print("")

component = df['text_emb_7']
mydf = df[component > np.quantile(component, 0.9)].reset_index(drop=True)

for _, post in mydf.sample(3).iterrows():
    print("-------------")
    print(f"Title: {post['title']}")
    print(f"Territory: {post['subName']}")
    print(f"URL: {post['url']}")
    print("")
    print(post['text'])
    print("")

In [None]:
# Three randomly sampled posts whose text_emb_7 score lies below the 0.1 quantile.
# The sample is unseeded, so different posts are shown on each run.
print("Examples with low values of text_emb_7")
print("")

component = df['text_emb_7']
mydf = df[component < np.quantile(component, 0.1)].reset_index(drop=True)

for _, post in mydf.sample(3).iterrows():
    print("-------------")
    print(f"Title: {post['title']}")
    print(f"Territory: {post['subName']}")
    print(f"URL: {post['url']}")
    print("")
    print(post['text'])
    print("")

In [None]:
# Three randomly sampled posts whose title_emb_1 score lies above the 0.9 quantile.
# Only title, territory and URL are printed here; the post text is omitted.
print("Examples with high values of title_emb_1")
print("")

component = df['title_emb_1']
mydf = df[component > np.quantile(component, 0.9)].reset_index(drop=True)

for _, post in mydf.sample(3).iterrows():
    print("-------------")
    print(f"Title: {post['title']}")
    print(f"Territory: {post['subName']}")
    print(f"URL: {post['url']}")
    print("")


In [None]:
# Three randomly sampled posts whose title_emb_1 score lies below the 0.1 quantile.
# Only title, territory and URL are printed here; the post text is omitted.
print("Examples with low values of title_emb_1")
print("")

component = df['title_emb_1']
mydf = df[component < np.quantile(component, 0.1)].reset_index(drop=True)

for _, post in mydf.sample(3).iterrows():
    print("-------------")
    print(f"Title: {post['title']}")
    print(f"Territory: {post['subName']}")
    print(f"URL: {post['url']}")
    print("")


In [None]:
print("Diamonds in the rough")
print("")

# residual = actual - predicted (both on the log1p scale). Sorting ascending
# puts first the posts whose prediction most exceeds what they actually received.
df['log_sats48_pred'] = Y_pred
df['residual'] = df['log_sats48'] - df['log_sats48_pred']

mydf = df.sort_values(by='residual', ascending=True).head(5).reset_index(drop=True)

# Emit the five posts as a Markdown table.
print("| Item | Title | Territory |")
print("| ---- | ----- | --------- |")
for _, post in mydf.iterrows():
    item_url = f"https://stacker.news/items/{post['itemId']}"
    print(f"| {item_url} | {post['title']} | {post['subName']} |")


In [None]:
# Output for regression in R
# The PCA component columns are dropped before export; every other column
# currently on df is written out.
# NOTE(review): the OVERWRITE flag from the setup cell is not consulted here, so
# the file is written unconditionally -- confirm that is intended.
embedding_cols = [*text_emb_cols, *title_emb_cols]
df_out = df.drop(columns=embedding_cols)
out_filename = os.path.join(DATA_PATH, 'objective_quality_analysis.parquet')
df_out.to_parquet(out_filename, index=False)