In [15]:
!pip install datasets pandas google-genai numpy tiktoken nltk




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset
import pandas as pd
# import dask.dataframe as pd
import numpy as np

from tqdm.auto import tqdm
import pickle

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import html
import nltk
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# df = pd.read_parquet('hf://datasets/OpenPipe/hacker-news')
ds = load_dataset("OpenPipe/hacker-news") # streaming=True
df = pd.DataFrame(ds['train'][-3_000_000:-1_000_000])

stories = df[df.type == 'story']
comments = df[df.type == 'comment']

df.tail(3)

Unnamed: 0,id,type,by,time,title,text,url,score,parent,top_level_parent,descendants,kids,deleted,dead
1999997,40813382,comment,pavon,2024-06-27 18:15:13,,proto2 allowed both required fields and option...,,,40812948.0,40798740,,"[40813554, 40816081]",,
1999998,40813383,comment,ju-st,2024-06-27 18:15:15,,Sorry I wasn&#x27;t talking about density but ...,,,40811561.0,40803783,,,,
1999999,40813384,story,belter,2024-06-27 18:15:16,"Astronauts take shelter in Starliner, other sp...",,https://www.space.com/iss-astronauts-shelter-r...,130.0,,40813384,130.0,"[40813633, 40815312, 40813760, 40813850, 40816...",,


## Pre-processing

In [5]:
bad_comment = ['[flagged]', '[dead]', 'Thanks!', 'Thank you!', 'Yes.', 'No.', 'Yes', 'No', 'Thanks', 'Thank you']
comments['low_quality'] = comments.text.map(lambda x: x in bad_comment) | comments.text.isna()
comments = comments[~comments.low_quality]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments['low_quality'] = comments.text.map(lambda x: x in bad_comment) | comments.text.isna()


In [6]:
comments['text'] = comments.text.map(lambda x: html.unescape(x))

In [7]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase, remove punctuation, remove stop words, and lemmatize text."""
    if not isinstance(text, str):
        return "" # Handle non-string values

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

In [8]:
comments['clean_text'] = comments['text'].apply(preprocess_text)

In [9]:
len(comments)

1690439

## Get Queries

In [10]:
from keywords import HnKeywords
keywords = HnKeywords.as_dict()
keywords.keys()

dict_keys(['startup_founder_issues', 'specific_tools_methodologies', 'critiques_of_market_research', 'jobs_to_be_done', 'market_research', 'overlapping_terms', 'customer_interviews', 'product_validation', 'books'])

In [11]:
search_terms = dict()
for label, phrases in keywords.items():
    clean = [preprocess_text(phrase) for phrase in phrases]
    search_terms[label] = list(set(clean))
search_terms

{'startup_founder_issues': ['getting first customer',
  'launching startup',
  'startup pain point',
  'startup mistake',
  'go market strategy',
  'idea validation',
  'building mvp'],
 'specific_tools_methodologies': ['qualitative analysis software',
  'nvivo',
  'google form',
  'typeform',
  'dovetail',
  'surveymonkey'],
 'critiques_of_market_research': ['useless customer interview',
  'expensive market research',
  'ineffective feedback',
  'flaw market analysis',
  'problem market research',
  'time consuming research',
  'market research bias',
  'outdated market research',
  'market research broken',
  'biased user feedback',
  'bad user research'],
 'jobs_to_be_done': ['jtbd implementation',
  'understanding job',
  'job done case study',
  'job done framework',
  'jtbd example',
  'job customer hire product'],
 'market_research': ['market gap',
  'market trend',
  'analyzing market',
  'understanding market size',
  'market research report',
  'focus group',
  'competitive l

## Search and save

In [12]:
def keyword_search(corpus, queries, pbar=None):
    """Returns a boolean mask for rows that contain any of the given queries."""
    if not queries:
        if pbar is not None:
            pbar.update(0)
        return pd.Series(False, index=corpus.index)
    
    pattern = r'\b(?:' + '|'.join(re.escape(q) for q in queries) + r')\b'
    compiled_pattern = re.compile(pattern, re.IGNORECASE)
    
    mask = corpus.str.contains(compiled_pattern, na=False)
    if pbar is not None:
        pbar.update(len(queries))
    return mask


In [13]:
total_query_count = sum(len(terms) for terms in search_terms.values())
pbar = tqdm(total=total_query_count, unit='queries')

comments['labels'] = [[] for _ in range(len(comments))]

# 3. For each category, find rows that match the category queries
for category, queries in search_terms.items():
    mask = keyword_search(comments['clean_text'], queries, pbar=pbar)
    # Append the category label to each matching row's 'labels' list
    comments.loc[mask, 'labels'] = comments.loc[mask, 'labels'].apply(
        lambda current_list: current_list + [category]
    )
pbar.close()

100%|██████████| 187/187 [02:48<00:00,  1.11queries/s]


In [14]:
# Pickles the comment IDs and their matches
timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
comments['labels_str'] = comments['labels'].apply(lambda labels: '|'.join(labels))
comments_with_labels = comments[comments['labels_str'] != '']
comments_with_labels[['id', 'labels_str']].to_csv(
    f'out/comments_with_labels_{timestamp}.csv', index=False
)
SAVED_DATA = True

## Load (optional)

In [4]:
import glob
import os

# Load the most recent CSV file
list_of_files = glob.glob('out/comments_with_labels_*.csv')
latest_file = max(list_of_files, key=os.path.getctime)
comments_with_labels = pd.read_csv(latest_file)


## EDA and post-processing

In [5]:
labeled_comments = pd.merge(comments,
                           comments_with_labels[['id', 'labels_str']],
                           on='id',
                           how='inner')
if 'labels' not in labeled_comments:
    labeled_comments['labels'] = labeled_comments['labels_str'].str.split('|')
labeled_comments.head(2)

Unnamed: 0,id,type,by,time,title,text,url,score,parent,top_level_parent,descendants,kids,deleted,dead,labels_str,labels
0,38813410,comment,willis936,2023-12-30 07:27:01,,Laziness is the enemy. I spend a lot of time ...,,,38812794.0,38812244,,,,,overlapping_terms,[overlapping_terms]
1,38813425,comment,kstrauser,2023-12-30 07:30:46,,"I just stuck it on a public server, behind a B...",,,38813361.0,38795735,,[38823447],,,overlapping_terms,[overlapping_terms]


In [6]:
# 1) Collect all unique labels
all_labels = sorted({lbl for row in labeled_comments['labels'] for lbl in row})

# 2) Create a column for each label: 1 if present in 'labels', else 0
for lbl in all_labels:
    labeled_comments[lbl] = labeled_comments['labels'].apply(lambda row_labels: 1 if lbl in row_labels else 0)

# 3) Build the co-occurrence matrix via dot-product
#    This creates an NxN matrix where N = number of unique labels
co_occ_matrix = labeled_comments[all_labels].T.dot(labeled_comments[all_labels])
co_occ_matrix

Unnamed: 0,books,critiques_of_market_research,customer_interviews,jobs_to_be_done,market_research,overlapping_terms,product_validation,specific_tools_methodologies,startup_founder_issues
books,180,0,2,0,1,2,2,0,1
critiques_of_market_research,0,2,0,0,1,1,0,0,0
customer_interviews,2,0,564,0,8,39,5,1,1
jobs_to_be_done,0,0,0,9,0,1,0,0,0
market_research,1,1,8,0,437,18,7,1,1
overlapping_terms,2,1,39,1,18,1477,6,0,2
product_validation,2,0,5,0,7,6,460,0,12
specific_tools_methodologies,0,0,1,0,1,0,0,161,0
startup_founder_issues,1,0,1,0,1,2,12,0,64


### Save relevant comments

In [8]:
timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
labels_file = f'out/full_labeled_comments_{timestamp}.parquet'
labeled_comments.to_parquet(labels_file)

## Write to HTML

In [1]:
def find_ancestors(df, row_id) -> list[int]:
    entry = df[df['id'] == int(row_id)]

    if len(entry) == 0:
        return []
    elif len(entry) > 1:
        raise ValueError(f"Multiple rows with id {row_id}.")
    entry = entry.iloc[0]
    if entry.parent > 0:
        return [entry.id] + find_ancestors(df, entry.parent)
    else:
        return [entry.id]

find_ancestors(df,41813270)


NameError: name 'df' is not defined

In [19]:
import htmlgen

htmlgen.create_html_from_comments(labeled_comments[['id', 'text', 'labels']], title=" Hacker News Comments")

HTML file saved to: out/comments_20250111_202454.html


In [None]:
labeled_comments.head()

Unnamed: 0,id,type,by,time,title,text,url,score,parent,top_level_parent,...,dead,labels_str,labels,customer_interviews_terms,jobs_to_be_done,market_research_terms,overlapping_terms,product_validation_terms,specific_tools_methodologies,startup_founder_issues
0,40814477,comment,freedomben,2024-06-27 20:02:27,,I&#x27;ve struggled philosophically with that ...,,,40814409.0,40812695,...,,overlapping_terms,[overlapping_terms],0,0,0,1,0,0,0
1,40815826,comment,whit537,2024-06-27 22:21:59,,"Yes! We aim to launch <a href=""https:&#x2F;&#x...",,,40815121.0,40810949,...,,overlapping_terms,[overlapping_terms],0,0,0,1,0,0,0
2,40816458,comment,al_borland,2024-06-27 23:53:51,,One I thought was kind of silly that I made wa...,,,40816400.0,40816400,...,,overlapping_terms,[overlapping_terms],0,0,0,1,0,0,0
3,40816541,comment,kragen,2024-06-28 00:10:10,,you ask what i mean about programmer productiv...,,,40812631.0,40804122,...,,overlapping_terms,[overlapping_terms],0,0,0,1,0,0,0
4,40818011,comment,canpan,2024-06-28 05:29:58,,I use it for similar reasons! But I do not hav...,,,40817724.0,40817199,...,,overlapping_terms,[overlapping_terms],0,0,0,1,0,0,0
