In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Load the askscience dataset (path is relative to the notebook's working directory).
data_path = '../askscience_data.csv'
data = pd.read_csv(data_path)

# read_csv parses empty cells (and NA-like strings) as NaN. Both text columns
# must hold real strings: a NaN title would make the length feature below fail,
# and NaN in either text column is not a valid document for TF-IDF vectorization.
data = data.fillna({'title': '', 'body': ''})

# Engineered feature: number of characters in the title (vectorized string op).
data['title_length'] = data['title'].str.len()

# Columns made available to the model, and the column it should predict.
features = ['upvote_ratio', 'title', 'body', 'title_length']
target = 'score'


# Per-column preprocessing: each text column gets its own TF-IDF vocabulary and
# the numeric upvote ratio is standardized. Earlier experiments (no longer kept
# as commented-out code) also tried stop_words='english' on both vectorizers and
# added 'title_length' to the scaled numeric columns.
column_steps = [
    ('tfidf_title', TfidfVectorizer(), 'title'),
    ('tfidf_body', TfidfVectorizer(), 'body'),
    ('num', StandardScaler(), ['upvote_ratio']),
]

# remainder='drop' discards every input column that is not listed in a step above.
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

# Full model: preprocessing followed by an ordinary linear regression.
model = make_pipeline(preprocessor, LinearRegression())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
y = data[target].values
X = data[features]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline (preprocessing + regression) on the training rows only.
model.fit(X_train, y_train)

# Predict on the held-out rows. The log-based error metric does not accept
# negative values, so negative predictions are floored at zero before scoring.
y_pred = model.predict(X_test).clip(min=0)

# NOTE: `mse` holds the mean squared *log* error, not a plain MSE.
# Both metrics are computed on the floored predictions.
mse = mean_squared_log_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSLE: {mse:.2f}, R-squared: {r2:.2f}')



MSLE: 48.13, R-squared: -25.08
