In [36]:
import numpy as np
import pandas as pd
import torch
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

In [37]:
# Custom transformer for converting text into embeddings using a BERT model.
class BertEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns documents into BERT pooled embeddings.

    The constructor only records hyper-parameters, as the scikit-learn
    estimator contract requires. The tokenizer and the model weights are
    loaded lazily on first use, so constructing or cloning the estimator
    (``ColumnTransformer`` clones its transformers when fitted) is cheap and
    prototype instances that are never used never load a model.

    Parameters
    ----------
    model_name : str, default='bert-base-uncased'
        Name or path handed to ``BertTokenizer.from_pretrained`` and
        ``BertModel.from_pretrained``.
    batch_size : int, default=1
        Number of documents encoded per forward pass. The default encodes one
        document at a time; larger values pad each batch to its longest
        document and are typically faster on a GPU.
    max_length : int, default=512
        Maximum number of tokens per document; longer documents are truncated.
    """

    def __init__(self, model_name='bert-base-uncased', batch_size=1, max_length=512):
        # Store parameters untouched and do no work here: sklearn's clone()
        # re-invokes __init__ with the values returned by get_params().
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length

    def _ensure_loaded(self):
        """Load tokenizer and model on first use, or again if model_name changed."""
        if getattr(self, "_loaded_model_name", None) == self.model_name:
            return
        self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self._model = BertModel.from_pretrained(self.model_name)
        self._model.to(self._device)
        self._loaded_model_name = self.model_name

    @property
    def device(self):
        """Torch device the model lives on (CUDA when available, else CPU)."""
        self._ensure_loaded()
        return self._device

    @property
    def tokenizer(self):
        """Tokenizer for ``model_name``, loaded on first access."""
        self._ensure_loaded()
        return self._tokenizer

    @property
    def model(self):
        """BERT model for ``model_name``, loaded on first access."""
        self._ensure_loaded()
        return self._model

    @staticmethod
    def _as_text(doc):
        """Return ``doc`` as a string, mapping missing values (None/NaN/NA) to ''."""
        if isinstance(doc, str):
            return doc
        if pd.api.types.is_scalar(doc) and pd.isna(doc):
            return ""
        return str(doc)

    def transform(self, X, **transform_params):
        """Embed every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Documents to embed (e.g. a pandas Series). Missing entries are
            embedded as the empty string.
        **transform_params
            Accepted for API compatibility; unused.

        Returns
        -------
        numpy.ndarray
            One row per document holding the model's ``pooler_output``. For
            empty input the result has shape ``(0, hidden_size)``.
        """
        self._ensure_loaded()
        self._model.eval()  # inference mode for dropout etc., re-applied on every call
        docs = [self._as_text(doc) for doc in X]
        step = max(1, int(self.batch_size))

        embeddings = []
        with torch.no_grad(), tqdm(total=len(docs), desc="Processing documents") as progress:
            for start in range(0, len(docs), step):
                batch = docs[start:start + step]
                encoded_input = self._tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors='pt',
                ).to(self._device)
                output = self._model(**encoded_input)
                embeddings.append(output.pooler_output.cpu().numpy())
                progress.update(len(batch))

        if not embeddings:
            # np.vstack rejects an empty list; return a correctly shaped empty matrix.
            return np.empty((0, self._model.config.hidden_size), dtype=np.float32)
        return np.vstack(embeddings)

    def fit(self, X, y=None):
        """No-op: the pretrained model is used as-is. Returns ``self``."""
        return self


In [38]:
# Load the data
data_path = '../askscience_data.csv'
data = pd.read_csv(data_path)

# Filling in missing data. Both text columns are filled: read_csv parses
# literal values such as "NA" or "null" as NaN, and a NaN title would
# otherwise break the length feature and the tokenizer.
data = data.fillna({'title': '', 'body': ''})

# Add relevant features (vectorized character count of the title)
data['title_length'] = data['title'].str.len()

# Selecting features and the target variable
features = ['upvote_ratio', 'title', 'body', 'title_length']
target = 'score'

# Text columns are embedded with BERT (a scalar column name hands the
# transformer a 1-D Series); numeric columns are standardized.
preprocessor = ColumnTransformer(
    transformers=[
        ('bert_title', BertEmbeddingTransformer(), 'title'),
        ('bert_body', BertEmbeddingTransformer(), 'body'),
        ('num', StandardScaler(), ['upvote_ratio', 'title_length'])
    ],
    remainder='drop'  # This drops the columns that we do not transform
)

# Create a pipeline that first preprocesses the data then applies a linear regression model
model = make_pipeline(preprocessor, LinearRegression())

# Splitting the data into training and testing sets
X = data[features]
y = data[target].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model.fit(X_train, y_train)

# Predicting and evaluating the model.
# Linear regression can predict negative scores, which the log-based metric
# rejects, so predictions are clamped at zero before scoring.
y_pred = np.clip(model.predict(X_test), 0, None)

# NOTE: `mse` holds the mean squared *log* error (MSLE), as the printed label says.
mse = mean_squared_log_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSLE: {mse:.2f}, R-squared: {r2:.2f}')


Processing documents: 100%|█████████████████| 3364/3364 [02:15<00:00, 24.91it/s]
Processing documents: 100%|█████████████████| 3364/3364 [05:29<00:00, 10.21it/s]
Processing documents: 100%|███████████████████| 841/841 [00:33<00:00, 24.80it/s]
Processing documents: 100%|███████████████████| 841/841 [00:47<00:00, 17.76it/s]

MSLE: 14.52, R-squared: 0.38



