In [1]:
import os
import sys
import pandas as pd
import yaml 
from matplotlib import pyplot as plt
from matplotlib import ticker as mticker
from matplotlib import colors as mcolors
from matplotlib import patches as mpatches
import statsmodels.api as sm
import numpy as np
from itertools import product
import subprocess
import networkx as nx

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
from sklearn.linear_model import LinearRegression

from xgboost import XGBRegressor, XGBClassifier

import shap

# Load the machine-specific settings (config.yaml.local) and the shared project
# settings (config.yaml). Both paths are relative to the notebook's working
# directory, so the notebook must be run from its own folder.
with open("../../config.yaml.local", "r") as f:
    LOCAL_CONFIG = yaml.safe_load(f)
with open("../../config.yaml", "r") as f:
    CONFIG = yaml.safe_load(f)
# Extend the import path so the modules imported just below can be found
# (presumably they live in ../python -- confirm against the repo layout).
sys.path.append("../python")

# NOTE(review): `import globals` binds the name `globals` to this module and
# thereby shadows Python's builtin `globals()` for the rest of the notebook.
import globals
import data_tools as dt
import utils
import emb

# Filesystem locations taken from the local config; DATA_PATH is where the
# analysis dataset is written at the end of this notebook.
LOCAL_PATH = LOCAL_CONFIG["LOCAL_PATH"]
RAW_DATA_PATH = LOCAL_CONFIG["RAW_DATA_PATH"]
DATA_PATH = LOCAL_CONFIG["DATA_PATH"]
R_PATH = LOCAL_CONFIG["R_PATH"]

# Run-control flags. NOTE(review): neither flag is read by the cells visible in
# this part of the notebook -- confirm where (or whether) they are used.
RUN_R_SCRIPTS = False
OVERWRITE = False


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load every post, then keep only the rows that pass all eligibility checks.
posts = dt.get_posts()

sub_name = posts['subName']
post_age = globals.data_end - posts['created_at']

# One entry per exclusion rule; a post must satisfy all of them to be kept.
keep_conditions = [
    posts['invoiceActionState'] != 'FAILED',   # drop failed invoice actions
    ~posts['bio'],                             # drop rows flagged `bio`
    ~posts['freebie'],                         # drop rows flagged `freebie`
    ~posts['saloon'],                          # drop rows flagged `saloon`
    sub_name.notnull(),                        # must have a subName ...
    sub_name != '',                            # ... that is not empty
    ~sub_name.isin(['jobs', 'ama']),           # exclude the 'jobs' and 'ama' subs
    posts['title'] != 'deleted by author',     # drop posts deleted by their author
    post_age >= pd.Timedelta(hours=48),        # at least 48h old at globals.data_end
]

# Fold the conditions with `&` from left to right.
mask = keep_conditions[0]
for condition in keep_conditions[1:]:
    mask = mask & condition

# Apply the filter, renumber the rows and replace missing bodies by ''.
posts = (
    posts.loc[mask]
    .reset_index(drop=True)
    .assign(text=lambda frame: frame['text'].fillna(''))
)

print(len(posts))

191334


In [3]:
def _unit_embedding(text):
    """Return the embedding of `text` divided by its Euclidean norm.

    NOTE(review): if `emb.get_embedding_robust` can return an all-zero vector
    (e.g. for an empty string) the division yields NaNs -- confirm.
    """
    vector = np.array(emb.get_embedding_robust(text))
    return vector / np.linalg.norm(vector)


# Normalised embeddings of every post's title and body, keyed by itemId.
title_embeddings = {}
text_embeddings = {}

# Walk the three needed columns in lockstep; the title is embedded before the
# text for each post.
for itemId, title, text in zip(posts['itemId'], posts['title'], posts['text']):
    title_embeddings[itemId] = _unit_embedding(title)
    text_embeddings[itemId] = _unit_embedding(text)


In [4]:
# Order each user's posts chronologically so that a post's successors by the
# same author sit directly below it in the frame.
posts = posts.sort_values(by=['userId', 'created_at'], ascending=[True, True]).reset_index(drop=True)


def _future_text_dist(frame, embeddings, n_future=5):
    """Average cosine distance from each post to its author's next posts.

    For every row, the `n_future` posts by the same `userId` with a strictly
    later `created_at` are looked up and the mean cosine distance between the
    row's embedding and theirs is returned. Rows whose author has fewer than
    `n_future` later posts get NaN.

    Parameters
    ----------
    frame : DataFrame with columns `itemId`, `userId`, `created_at`. It MUST
        already be sorted by (`userId`, `created_at`) ascending; the function
        relies on a user's rows being contiguous and in time order.
    embeddings : mapping itemId -> 1-D numpy vector.
    n_future : number of later posts to average over.

    Returns
    -------
    numpy float array aligned with the rows of `frame`.

    Because the frame is sorted, the later posts are simply the rows below the
    current one, so a single pass replaces filtering and re-sorting the whole
    frame for every row. Results match that per-row filtering except when
    several posts of one user share an identical timestamp exactly at the edge
    of the window, where the choice among the tied posts is arbitrary anyway.
    """
    item_ids = frame['itemId'].to_numpy()
    user_ids = frame['userId'].to_numpy()
    # Plain datetime64 values (timezone-aware data is converted to UTC) so the
    # element-wise comparisons below are cheap.
    created = frame['created_at'].to_numpy(dtype='datetime64[ns]')
    n_rows = len(frame)
    result = np.full(n_rows, np.nan)

    for pos in range(n_rows):
        user = user_ids[pos]

        # "Later" means strictly later: skip same-user rows that carry the
        # same timestamp as the current post.
        first = pos + 1
        while first < n_rows and user_ids[first] == user and created[first] <= created[pos]:
            first += 1

        # Rows first..last are the candidates; they all belong to `user` iff
        # the last one does, because a user's rows are contiguous.
        last = first + n_future - 1
        if last >= n_rows or user_ids[last] != user:
            continue

        anchor = embeddings[item_ids[pos]]
        anchor_norm = np.linalg.norm(anchor)
        total = 0
        for other_pos in range(first, last + 1):
            other = embeddings[item_ids[other_pos]]
            # Norms are kept in the formula so the result does not depend on
            # the stored vectors being unit length.
            total += 1 - np.dot(anchor, other) / (anchor_norm * np.linalg.norm(other))
        result[pos] = total / n_future

    return result


# Assign the whole column at once: it always exists afterwards (NaN where not
# enough later posts are available) and re-running the cell cannot leave stale
# values behind.
posts['future_text_dist'] = _future_text_dist(posts, text_embeddings)


In [8]:
# Persist the analysis dataset (filtered posts plus `future_text_dist`) under
# DATA_PATH.
output_path = os.path.join(DATA_PATH, "v4v_analysis_data.parquet")
posts.to_parquet(output_path)

In [9]:
# Show the final frame as the cell result (pandas abbreviates long/wide frames
# with "..." when rendering).
posts

Unnamed: 0,itemId,created_at,updated_at,title,text,url,userId,parentId,path,pinId,...,root_is_saloon,n_uploads,hasImageOrLink,sats48,zappers48,downsats48,downzappers48,cost_modifier,comments48,future_text_dist
0,224722,2023-08-14 12:01:32.116000+00:00,2023-08-14 12:11:33.730,Why do many bitcoiners still trust fiat software?,We have all bought into using free and open so...,,9,,224722,,...,,0,True,846.0,15.0,0.0,0.0,0,49.0,0.116494
1,225751,2023-08-15 13:21:49.073000+00:00,2023-08-15 13:31:50.847,Why do many bitcoiners still trust fiat software?,We have all bought into using free and open so...,,9,,225751,,...,,0,True,1949.0,11.0,0.0,0.0,0,20.0,0.145863
2,226502,2023-08-16 13:11:16.003000+00:00,2023-08-16 13:21:17.642,How can I use open source software if I don't ...,There are a lot of great tools available for m...,,9,,226502,,...,,0,True,1473.0,15.0,1.0,1.0,0,43.0,0.000077
3,227213,2023-08-17 11:50:04.118000+00:00,2023-08-17 12:00:05.507,How can I use open source software if I don't ...,There are a lot of great tools available for m...,,9,,227213,,...,,0,True,0.0,0.0,0.0,0.0,0,21.0,0.000051
4,228034,2023-08-18 13:36:36.802000+00:00,2023-08-18 13:46:38.197,How can I use open source software if I don't ...,There are a lot of great tools available for m...,,9,,228034,,...,,0,True,0.0,0.0,1.0,1.0,0,7.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191329,1240929,2025-09-28 19:37:15.109000+00:00,2025-09-28 19:48:32.869,EU Chat Control Worsens,"Germany, Belgium, Latvia, and Italy UNDECIDED....",http://fightchatcontrol.eu,30704,,1240929,,...,,0,False,1339.0,10.0,0.0,0.0,0,26.0,
191330,1245197,2025-10-02 01:04:22.737000+00:00,2025-10-02 01:14:43.416,ArtofProof Launch: A Protocol for Verifiable A...,\n\n![](https://m.stacker.news/110427)\n\n\nA ...,,30726,,1245197,,...,,1,True,1.0,1.0,0.0,0.0,0,0.0,
191331,1245418,2025-10-02 06:31:51.636000+00:00,2025-10-03 18:10:44.556,How high can bitcoin go in October?,,https://cointelegraph.com/news/how-high-can-bi...,30729,,1245418,,...,,0,False,0.0,0.0,0.0,0.0,0,0.0,
191332,1247726,2025-10-03 18:03:38.606000+00:00,2025-10-03 18:19:25.497,A New Bitcoin All-Time High Could Come As Earl...,,https://cryptonews.net/31734271/?utm_source=Cr...,30729,,1247726,,...,,0,False,0.0,0.0,0.0,0.0,0,0.0,
