In [37]:
import pandas as pd
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Parameters
    ----------
    path : str
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever ``pickle.load`` reconstructs from the file.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising,
    # so only point this at files that were produced by this project.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Previously saved column transformer (presumably written by an earlier
# preprocessing step -- verify where it comes from).
ct = _load_pickle('./Models/column_transformer.pkl')

# Persisted train/test data.
# NOTE(review): the cell that calls train_test_split also assigns X_train,
# X_test, y_train and y_test; confirm which version the models are meant to
# be fitted on, because whichever cell runs last wins.
X_train = _load_pickle('./Data/X_train.pkl')
X_test = _load_pickle('./Data/X_test.pkl')
y_train = _load_pickle('./Data/y_train.pkl')
y_test = _load_pickle('./Data/y_test.pkl')

In [16]:
# Read the posts CSV into a DataFrame; later cells take their feature and
# target columns from it.
posts_df = pd.read_csv('./Data/posts_with_date.csv')

In [19]:
# Feature is the 'title' column, target is the 'score' column.
X, y = posts_df['title'], posts_df['score']

# Hold out a test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=20,
)

#### Basic KNN Regressor

In [25]:
# Two-step pipeline: CountVectorizer turns the input text into token counts,
# which are then passed to a k-nearest-neighbours regressor (default settings).
pipe_knn = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('knn', KNeighborsRegressor()),
    ]
)

In [26]:
# Fit the vectorizer + KNN pipeline on the training split; fit() returns the
# fitted pipeline, which is what the cell displays.
pipe_knn.fit(X_train, y_train)

Pipeline(steps=[('cv', CountVectorizer()), ('knn', KNeighborsRegressor())])

In [27]:
# Training-set score; for a regressor, score() is the R^2 coefficient of determination.
pipe_knn.score(X_train, y_train)

0.18268023773034592

In [28]:
# Test-set R^2; a negative value means the predictions are worse than a
# constant prediction of the mean target.
pipe_knn.score(X_test, y_test)

-0.3652955606578576

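Both scores are far below baseline: the training R² is low, and the negative test R² means the model does worse on unseen data than always predicting the mean score.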

#### Basic Logistic Regression (Classifier)

In [34]:
# Two-step pipeline: token counts from CountVectorizer feed a logistic
# regression model (max_iter raised to 1000 to give the solver more iterations).
# NOTE(review): LogisticRegression is a classifier, so score() on this
# pipeline is mean accuracy, not R^2 -- confirm the comparison with the
# regressors in this notebook is intended.
pipe_lr = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('lr', LogisticRegression(max_iter=1000)),
    ]
)

In [35]:
# Fit on the training split, then display (train score, test score).
pipe_lr.fit(X_train, y_train)
(
    pipe_lr.score(X_train, y_train),
    pipe_lr.score(X_test, y_test),
)

(0.9710087159863946, 0.9696293343961738)

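The testing accuracy exceeds baseline, so LR seems like a promising model to tune. Note that `LogisticRegression` is a classifier: its `score` is mean accuracy rather than R², so these numbers are not directly comparable to the regressors' scores in this notebook.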

#### Basic Random Forest Regressor

In [39]:
# Two-step pipeline: token counts from CountVectorizer feed a random forest
# regressor.
# random_state is fixed (same seed value as the train/test split) so that the
# fitted forest and its scores are reproducible on a fresh kernel; without it
# every run builds a different forest.
pipe_rf = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('rf', RandomForestRegressor(random_state=20)),
])

In [40]:
# Fit on the training split, then display (train R^2, test R^2).
pipe_rf.fit(X_train, y_train)
(
    pipe_rf.score(X_train, y_train),
    pipe_rf.score(X_test, y_test),
)

(0.8463848118644273, -0.09813894018395763)

The training R² is high, but the test R² is negative, so the model is overfitting and still falls below baseline on unseen data.

#### Basic Extra Trees Regressor

In [45]:
# Two-step pipeline: token counts from CountVectorizer feed an extra-trees
# regressor; n_jobs=-1 uses all available cores.
# random_state is fixed (same seed value as the train/test split) so that the
# fitted ensemble and its scores are reproducible on a fresh kernel.
pipe_et = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('et', ExtraTreesRegressor(n_jobs=-1, random_state=20)),
])

In [47]:
# Fit on the training split, then display (train R^2, test R^2).
pipe_et.fit(X_train, y_train)
(
    pipe_et.score(X_train, y_train),
    pipe_et.score(X_test, y_test),
)

(0.9623147617826653, -0.09939177437719038)