In [55]:
import math
import string

import nltk
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# NLTK data required below: the Punkt tokenizer models used by word_tokenize
# ('punkt' for older NLTK releases, 'punkt_tab' for newer ones), WordNet plus
# the Open Multilingual WordNet for the lemmatizer, and the stop-word lists.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    # quiet=True suppresses the downloader's progress output.
    nltk.download(resource, quiet=True)

True

In [11]:
# Load the dataset and keep only rows whose title contains 'Show HN'.
# Missing titles are treated as non-matching (na=False) and dropped.
df = pd.read_csv('data.csv').loc[
    lambda frame: frame['title'].str.contains('Show HN', na=False)
]

In [12]:
# Shared NLP helpers used by preprocess_and_lemmatize: the English stop-word
# set (a set so membership checks are cheap) and a single WordNet lemmatizer
# instance reused for every title.
stop_words = {word for word in stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess_and_lemmatize(text):
    """Normalise a title into a space-separated string of lemmas.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted (not replaced by a space), the result is tokenized with
    ``word_tokenize``, tokens found in ``stop_words`` are dropped, and the
    remaining tokens are lemmatized.

    Args:
        text: The raw title string.

    Returns:
        The surviving lemmas joined with single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    lemmas = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Store the normalised form of every title alongside the original.
df['processed_title'] = df['title'].map(preprocess_and_lemmatize)

In [35]:
def transform_score(score, max_score=None):
    """Map a raw score onto the log-scaled regression target.

    The score is floored at 1, log-transformed, divided by the log of the
    largest score, then stretched by 2 and shifted by -0.5. A score at or
    below 1 therefore maps to -0.5 and a score equal to ``max_score`` maps
    to 1.5.

    Args:
        score: Raw score; values below 1 are treated as 1.
        max_score: Largest score used for normalisation. When omitted it is
            read from ``df['score'].max()`` on every call, so pass it
            explicitly when transforming many values or when calling the
            function without the notebook's ``df`` in scope.

    Returns:
        The transformed score as a float.

    Raises:
        ValueError: If the maximum score is not greater than 1, because its
            logarithm would be zero or undefined and the normalisation
            divides by it.
    """
    if max_score is None:
        # Default path: same lookup on the notebook-global frame as before.
        max_score = df['score'].max()
    # `not (x > 1)` also rejects NaN, which `x <= 1` would let through.
    if not max_score > 1:
        raise ValueError(
            f"maximum score must be greater than 1 to normalise, got {max_score!r}"
        )
    log_max = math.log(max_score)
    return math.log(max(score, 1)) / log_max * 2 - 0.5

def untransform_score(transformed_score, max_score=None):
    """Convert a transformed score back to the raw score scale.

    Undoes the shift (-0.5), the stretch (x2) and the division by the log of
    the largest score, then exponentiates.

    Args:
        transformed_score: A value on the transformed scale, e.g. a model
            prediction.
        max_score: Largest score used for normalisation; it must be the same
            maximum that was used when the scores were transformed. When
            omitted it is read from ``df['score'].max()`` on every call.

    Returns:
        The score on the original scale as a float.
    """
    if max_score is None:
        # Default path: same lookup on the notebook-global frame as before.
        max_score = df['score'].max()
    log_max = math.log(max_score)
    return math.exp((transformed_score + 0.5) * log_max / 2)

# Regression target: the log-scaled, normalised score for every row.
# NOTE: when called with just a score, transform_score looks up
# df['score'].max() itself, so the column maximum is recomputed per row.
df['transformed_score'] = df['score'].map(transform_score)

In [48]:
# Bag-of-words features from the processed titles; the vocabulary is fitted
# on the full filtered frame before splitting.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['processed_title'])
y = df['transformed_score']

# Split features, targets, raw titles and raw scores with the same shuffle so
# the rows stay aligned. The training halves of the titles and raw scores are
# discarded; only their test halves are needed for the report further down.
(
    X_train, X_test,
    y_train, y_test,
    _, title_test,
    _, score_test,
) = train_test_split(
    X, y, df['title'], df['score'],
    test_size=0.2,
    random_state=42,
)

In [58]:
# Ridge regression with scikit-learn's default settings, fitted on the
# training split. Ridge is referenced by its own imported name because a bare
# `import sklearn` does not by itself guarantee that the `linear_model`
# submodule has been loaded.
model = Ridge()
model.fit(X_train, y_train)

In [59]:
# Evaluate on the held-out split. The targets are transformed scores, so the
# error is expressed on the transformed scale, not in raw score points.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.11860096483763959


In [60]:
def predict(title):
    """Predict the transformed score for a single raw title.

    The title goes through the same preprocessing and the already-fitted
    vectorizer as the training data before being passed to the model.

    Args:
        title: The raw title string.

    Returns:
        The model's prediction on the transformed-score scale; pass it to
        ``untransform_score`` to get a value on the raw score scale.
    """
    features = vectorizer.transform([preprocess_and_lemmatize(title)])
    return model.predict(features)[0]

# Print every held-out title with its predicted score (converted back to the
# raw scale) next to the actual score.
for title, gt_score in zip(title_test, score_test):
    predicted = predict(title)
    score = untransform_score(predicted)
    print(f"{title} - Predicted Score: {score}", ' - Actual Score:', gt_score)


Show HN: I made a simple tool to help you generate chord progressions - Predicted Score: 30.541829786942845  - Actual Score: 2
Show HN: FalsiScan – Make it look like a PDF has been hand signed and scanned - Predicted Score: 22.019850957531542  - Actual Score: 770
Show HN: Satoshipay, Seamless Micropayments for the Web - Predicted Score: 22.442620619850416  - Actual Score: 2
Show HN: I made a solar-powered, ePaper photo frame - Predicted Score: 157.60873591436868  - Actual Score: 27
Show HN: Revisions 2.0 Mac menu bar app to view Dropbox edits, revert if needed - Predicted Score: 53.845682940927524  - Actual Score: 4
Show HN: See personal details for all individuals black-listed by the US govt - Predicted Score: 31.660647449368675  - Actual Score: 2
Show HN: Wayland Explorer – Easily read Wayland protocol documentation online - Predicted Score: 42.907740220677496  - Actual Score: 44
Show HN: Science News - HN Clone Built With Drum (my first Django site) - Predicted Score: 22.22380613054