In [15]:
!pip install datasets pandas google-genai numpy tiktoken nltk




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset
import pandas as pd

from tqdm.auto import tqdm
import pickle

import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import html
import nltk
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    nltk.download('wordnet')



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("OpenPipe/hacker-news") # streaming=True
df = pd.DataFrame(ds['train'][-1_000_000:-1])

stories = df[df.type == 'story']
comments = df[df.type == 'comment']

df.tail(3)

Unnamed: 0,id,type,by,time,title,text,url,score,parent,top_level_parent,descendants,kids,deleted,dead
999996,41813381,comment,marcosdumay,2024-10-11 20:26:21,,Add forced sedentarism into that set.<p>This i...,,,41812891.0,41811263,,,,
999997,41813382,story,turkeynecks,2024-10-11 20:26:24,Mini DOOM-like FPS in BooBoo programming language,,https://www.indiedb.com/games/doomed1,1.0,,41813382,0.0,,,
999998,41813383,comment,davio,2024-10-11 20:26:28,,hims sells the generic version for a fraction ...,,,41813102.0,41811263,,,,


## Pre-processing

In [103]:
bad_comment = ['[flagged]', '[dead]', 'Thanks!', 'Thank you!', 'Yes.', 'No.', 'Yes', 'No', 'Thanks', 'Thank you']
comments['low_quality'] = comments.text.map(lambda x: x in bad_comment) | comments.text.isna()
comments = comments[~comments.low_quality]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comments['low_quality'] = comments.text.map(lambda x: x in bad_comment) | comments.text.isna()


In [104]:
comments['text'] = comments.text.map(lambda x: html.unescape(x))

In [105]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase, remove punctuation, remove stop words, and lemmatize text."""
    if not isinstance(text, str):
        return "" # Handle non-string values

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

In [108]:
comments['clean_text'] = comments['text'].apply(preprocess_text)

In [107]:
len(comments)

822993

## Get Queries

In [109]:
from keywords import HnKeywords
keywords = HnKeywords.as_dict()
keywords.keys()

dict_keys(['startup_founder_issues', 'specific_tools_methodologies', 'critiques_of_market_research', 'jobs_to_be_done', 'market_research_terms', 'overlapping_terms', 'customer_interviews_terms', 'product_validation_terms'])

In [110]:
search_terms = dict()
for label, phrases in keywords.items():
    clean = [preprocess_text(phrase) for phrase in phrases]
    search_terms[label] = list(set(clean))
search_terms

{'startup_founder_issues': ['launching startup',
  'idea validation',
  'getting first customer',
  'go market strategy',
  'startup pain point',
  'building mvp',
  'startup mistake'],
 'specific_tools_methodologies': ['google form',
  'typeform',
  'surveymonkey',
  'dovetail',
  'qualitative analysis software',
  'nvivo'],
 'critiques_of_market_research': ['useless customer interview',
  'bad user research',
  'outdated market research',
  'time consuming research',
  'market research bias',
  'biased user feedback',
  'flaw market analysis',
  'expensive market research',
  'market research broken',
  'ineffective feedback',
  'problem market research'],
 'jobs_to_be_done': ['job customer hire product',
  'job done case study',
  'understanding job',
  'job done framework',
  'jtbd implementation',
  'jtbd example'],
 'market_research_terms': ['market trend',
  'market segmentation strategy',
  'competitive benchmarking',
  'understanding market dynamic',
  'market research report'

## Search and save

In [126]:
def keyword_search(corpus, queries, pbar=None):
    """Returns a boolean mask for rows that contain any of the given queries."""
    if not queries:
        if pbar is not None:
            pbar.update(0)
        return pd.Series(False, index=corpus.index)
    
    pattern = r'\b(?:' + '|'.join(re.escape(q) for q in queries) + r')\b'
    compiled_pattern = re.compile(pattern, re.IGNORECASE)
    
    mask = corpus.str.contains(compiled_pattern, na=False)
    if pbar is not None:
        pbar.update(len(queries))
    return mask


In [127]:
total_query_count = sum(len(terms) for terms in search_terms.values())
pbar = tqdm(total=total_query_count, unit='queries')

comments['labels'] = [[] for _ in range(len(comments))]

# 3. For each category, find rows that match the category queries
for category, queries in search_terms.items():
    mask = keyword_search(comments['clean_text'], queries, pbar=pbar)
    # Append the category label to each matching row's 'labels' list
    comments.loc[mask, 'labels'] = comments.loc[mask, 'labels'].apply(
        lambda current_list: current_list + [category]
    )
pbar.close()

100%|██████████| 185/185 [01:22<00:00,  2.24queries/s]


In [None]:
# Pickles the comment IDs and their matches
timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
comments['labels_str'] = comments['labels'].apply(lambda labels: '|'.join(labels))
comments_with_labels = comments[comments['labels_str'] != '']
comments_with_labels[['id', 'labels_str']].to_csv(
    f'out/comments_with_labels_{timestamp}.csv', index=False
)
SAVED_DATA = True

## Load (optional)

In [None]:
import glob
import os

# Load the most recent CSV file
if not SAVED_DATA:
    list_of_files = glob.glob('out/comments_with_labels_*.csv')
    latest_file = max(list_of_files, key=os.path.getctime)
    comments_with_labels = pd.read_csv(latest_file)


# EDA and post-processing

In [33]:
labeled_comments = pd.merge(comments,
                           comments_with_labels[['id', 'labels_str']],
                           on='id',
                           how='inner')
labeled_comments['labels'] = labeled_comments['labels_str'].str.split('|')
labeled_comments

Unnamed: 0,id,type,by,time,title,text,url,score,parent,top_level_parent,descendants,kids,deleted,dead,labels_str,labels
0,40814477,comment,freedomben,2024-06-27 20:02:27,,I&#x27;ve struggled philosophically with that ...,,,40814409.0,40812695,,"[40814535, 40818971, 40814580]",,,overlapping_terms,[overlapping_terms]
1,40815826,comment,whit537,2024-06-27 22:21:59,,"Yes! We aim to launch <a href=""https:&#x2F;&#x...",,,40815121.0,40810949,,,,,overlapping_terms,[overlapping_terms]
2,40816458,comment,al_borland,2024-06-27 23:53:51,,One I thought was kind of silly that I made wa...,,,40816400.0,40816400,,,,,overlapping_terms,[overlapping_terms]
3,40816541,comment,kragen,2024-06-28 00:10:10,,you ask what i mean about programmer productiv...,,,40812631.0,40804122,,"[40817362, 40817247]",,,overlapping_terms,[overlapping_terms]
4,40818011,comment,canpan,2024-06-28 05:29:58,,I use it for similar reasons! But I do not hav...,,,40817724.0,40817199,,"[40819480, 40818194]",,,overlapping_terms,[overlapping_terms]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790,41809964,comment,tivert,2024-10-11 14:49:22,,&gt; &quot;We&#x27;ll know our disinformation ...,,,41809578.0,41807121,,,,,specific_tools_methodologies,[specific_tools_methodologies]
1791,41810229,comment,makowskid,2024-10-11 15:16:01,,"For the last couple of months, I&#x27;m workin...",,,41690087.0,41690087,,,,,overlapping_terms,[overlapping_terms]
1792,41810632,comment,ranger_danger,2024-10-11 15:56:36,,&gt; didn&#x27;t need H100s<p>I think we&#x27;...,,,41806368.0,41805446,,,,,overlapping_terms,[overlapping_terms]
1793,41811404,comment,jauntywundrkind,2024-10-11 17:22:37,,Still a shit poor pathetic excuse to screw ove...,,,41811226.0,41809698,,[41811540],,,customer_interviews_terms,[customer_interviews_terms]


In [None]:
# 1) Collect all unique labels
all_labels = sorted({lbl for row in labeled_comments['labels'] for lbl in row})

# 2) Create a column for each label: 1 if present in 'labels', else 0
for lbl in all_labels:
    labeled_comments[lbl] = labeled_comments['labels'].apply(lambda row_labels: 1 if lbl in row_labels else 0)

# 3) Build the co-occurrence matrix via dot-product
#    This creates an NxN matrix where N = number of unique labels
co_occ_matrix = labeled_comments[all_labels].T.dot(labeled_comments[all_labels])
co_occ_matrix

Unnamed: 0,customer_interviews_terms,jobs_to_be_done,market_research_terms,overlapping_terms,product_validation_terms,specific_tools_methodologies,startup_founder_issues
customer_interviews_terms,242,1,1,34,6,0,2
jobs_to_be_done,1,4,0,1,0,0,0
market_research_terms,1,0,242,5,1,1,0
overlapping_terms,34,1,5,1005,8,4,0
product_validation_terms,6,0,1,8,267,0,7
specific_tools_methodologies,0,0,1,4,0,76,0
startup_founder_issues,2,0,0,0,7,0,26


In [None]:
# todo: read comments with interesting overlap - e.g. issues and customer interviews
