In [2]:
import math
import string

import nltk
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Make sure the NLTK data used by the cells below is available locally.
# word_tokenize loads 'punkt_tab' on NLTK >= 3.9 and 'punkt' on older releases,
# so both are requested to keep a fresh "Restart & Run All" working on either.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

True

In [3]:
# Load the submissions and keep only rows whose title contains "Show HN"
# (rows with a missing title are dropped via na=False).
df = pd.read_csv('data.csv')
# .copy() detaches the filtered rows from the frame they were sliced from, so the
# column assignments in later cells write to an independent DataFrame instead of
# triggering pandas' SettingWithCopyWarning.
df = df[df['title'].str.contains('Show HN', na=False)].copy()

In [4]:
# English stop words, stored as a set so the per-token membership test is cheap.
stop_words = set(stopwords.words('english'))
# One shared lemmatizer instance, used by preprocess_and_lemmatize.
lemmatizer = WordNetLemmatizer()

def preprocess_and_lemmatize(text):
    """Normalise a title into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip every character in string.punctuation,
    tokenize with word_tokenize, drop tokens found in stop_words, lemmatize
    what is left.

    Args:
        text: The raw title (a str).

    Returns:
        The surviving lemmas joined with single spaces.
    """
    # NOTE(review): punctuation (including apostrophes) is removed before the
    # stop-word check, so any stop word that contains an apostrophe can no longer
    # match its stripped form here.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    kept_lemmas = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_lemmas)

# Clean every title once up front and store the result alongside the raw title.
processed_titles = df['title'].apply(preprocess_and_lemmatize)
df['processed_title'] = processed_titles

In [5]:
# Natural log of the largest score in df, computed once when this cell runs.
# Both helpers below share it, so they stay exact inverses of each other and
# no longer rescan the whole 'score' column on every single call.
LOG_MAX_SCORE = math.log(df['score'].max())


def transform_score(score):
    """Map a raw score onto the log-scaled target used for regression.

    Scores below 1 are clamped to 1 before taking the log. The log is divided
    by LOG_MAX_SCORE, doubled and shifted by -0.5, so a score of 1 (or less)
    maps to -0.5 and the largest score in df maps to 1.5.

    Args:
        score: Raw numeric score.

    Returns:
        The transformed score as a float.
    """
    return math.log(max(score, 1)) / LOG_MAX_SCORE * 2 - 0.5


def untransform_score(transformed_score):
    """Invert transform_score, turning a transformed value back into a raw score.

    Args:
        transformed_score: A value on the transform_score scale, e.g. a model
            prediction.

    Returns:
        The corresponding raw score as a float (always positive, since it is
        the result of math.exp).
    """
    return math.exp((transformed_score + 0.5) * LOG_MAX_SCORE / 2)


df['transformed_score'] = df['score'].apply(transform_score)

In [6]:
# Regression target: the log-scaled score.
y = df['transformed_score']

# Bag-of-words counts over the preprocessed titles.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed_title'])

# Split features, targets, raw titles and raw scores in one call so the four
# collections stay row-aligned; the training halves of the raw titles/scores
# are not needed and are discarded.
(
    X_train, X_test,
    y_train, y_test,
    _, title_test,
    _, score_test,
) = train_test_split(
    X, y, df['title'], df['score'],
    test_size=0.2,
    random_state=42,
)

In [38]:
# Exclusive upper edges of the score buckets used for the coarse accuracy metric:
# [.., 10), [10, 50), [50, 100) and [100, ..).
BUCKET_EDGES = (10, 50, 100)


def score_bucket(score):
    """Return the index (0-3) of the bucket that a raw score falls into."""
    for bucket_index, upper_edge in enumerate(BUCKET_EDGES):
        if score < upper_edge:
            return bucket_index
    return len(BUCKET_EDGES)


# SGDRegressor is imported explicitly rather than reached through the bare
# `sklearn` package, which does not load its submodules on its own.
# The fixed seed makes the shuffling done by SGD, and therefore the metrics
# printed below, reproducible from run to run.
model = SGDRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# A prediction counts as correct when, converted back to a raw score, it lands
# in the same bucket as the true score.
correct_count = sum(
    score_bucket(untransform_score(y_pred_val)) == score_bucket(score)
    for y_pred_val, score in zip(y_pred, score_test)
)

accuracy = correct_count / len(y_pred)
print(f"Accuracy: {accuracy}")

# Baseline for comparison: the share of test rows whose true score is below 10,
# i.e. the accuracy obtained by always answering with the lowest bucket.
print('Accuracy of always predicting 0-10:', len(score_test[score_test < 10]) / len(score_test))

Mean Squared Error: 0.11055383797847552
Accuracy: 0.7951248066229042
Accuracy of always predicting 0-10: 0.7957519755822219


In [30]:
def predict(title):
    """Return the model's prediction for a single raw title.

    The title is run through preprocess_and_lemmatize and the fitted vectorizer
    before being passed to the model. The returned value is on the transformed
    scale; use untransform_score to convert it back to a raw score.

    Args:
        title: The raw title text.

    Returns:
        The first (and only) element of the model's prediction for this title.
    """
    cleaned_title = preprocess_and_lemmatize(title)
    features = vectorizer.transform([cleaned_title])
    predictions = model.predict(features)
    return predictions[0]

# Spot-check individual test titles: print "title | predicted score" followed by
# the true score, but only where the prediction or the true score exceeds 10.
for title, gt_score in zip(title_test, score_test):
    score = untransform_score(predict(title))
    if not (score > 10 or gt_score > 10):
        # Both values are at or below 10: nothing interesting to show.
        continue
    print(f"{title} | {round(score)}", gt_score)


Show HN: Simpler access to your music from the web that looks nice | 5 86
Show HN: ARRIVE SDK – Know When Customers Are Arriving | 3 17
Show HN: Reactpack – one command to build your React front end | 5 108
Show HN: Classes for javascript that you'd actually use | 4 52
Show HN: Aura – Machine Learning-driven mindfulness program | 4 16
Show HN: CattlePi automated setup and updates for Raspberry Pi(s) | 4 60
Show HN: A Raspberry Pi Video Streaming Robot | 5 19
Show HN: I made an app for creating fancy app store screenshots | 4 17
Show HN: A map that shows you how far you can go for a given time or distance | 2 455
Show HN: Security Book Reviews - My side project for hackers and makers | 7 25
Show HN: Unicaps - a Python package for CAPTCHA solving | 4 17
Show HN: roundabout.io - Jobs for devs by devs (London) | 4 14
Show HN: Formie – an open source (WIP) form website made with pure HTML/JS/CSS | 7 17
Show HN: YC like button | 5 21
Show HN: Howm – A Vim-like tiling X11 window manager | 4 7