In [1]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler,
)
from scipy.stats import loguniform

In [3]:
# Column roles for the training table.
target = "Rating"                   # column the model predicts
drop_features = ["Id", "Author"]    # columns removed before modelling

# Columns routed to the individual preprocessing steps.
text_feature = "Text"               # single column name (a string, not a list)
numeric_features = ["n_words"]
ordinal_features = ["sentiment"]

In [4]:
# Load the processed training split (path is relative to the notebook's
# working directory).
train_df = pd.read_csv("../Data/processed/train.csv")

# Features: everything except the target and the columns listed in
# `drop_features`. Target: the `target` column on its own.
X_train = train_df.drop(columns=[target, *drop_features])
y_train = train_df[target]

In [5]:
 # Per-column preprocessing:
 #   * text    -> bag-of-words counts (vocabulary capped at 20,000 terms,
 #                terms with document frequency above 0.6 ignored)
 #   * numeric -> standardised
 #   * ordinal -> integer codes following the explicit category order below
 preprocessor = ColumnTransformer(
     [
         # `text_feature` is passed as a bare string, not a list.
         ("text", CountVectorizer(max_df=0.6, max_features=20_000), text_feature),
         ("num", StandardScaler(), numeric_features),
         # NOTE(review): this order places "compound" between "neg" and "neu";
         # confirm that is the intended ranking of the sentiment labels.
         (
             "ord",
             OrdinalEncoder(categories=[["neg", "compound", "neu", "pos"]]),
             ordinal_features,
         ),
     ]
 )

 # Full model: preprocessing followed by a Ridge regressor. The step name
 # "Ridge" is what the "Ridge__alpha" key of the search grid refers to.
 ml_pipe = Pipeline([("prepro", preprocessor), ("Ridge", Ridge())])

 # Last expression of the cell: displays the pipeline.
 ml_pipe

Pipeline(steps=[('prepro',
                 ColumnTransformer(transformers=[('text',
                                                  CountVectorizer(max_df=0.6,
                                                                  max_features=20000),
                                                  'Text'),
                                                 ('num', StandardScaler(),
                                                  ['n_words']),
                                                 ('ord',
                                                  OrdinalEncoder(categories=[['neg',
                                                                              'compound',
                                                                              'neu',
                                                                              'pos']]),
                                                  ['sentiment'])])),
                ('Ridge', Ridge())])

In [6]:
 # Candidate regularisation strengths for the pipeline's "Ridge" step:
 # 10^-3 through 10^2, one value per decade (six candidates).
 # NOTE(review): the recorded search selected alpha=100.0, the largest value
 # in this range - consider extending the upper bound and re-running.
 param_grid = {"Ridge__alpha": np.power(10.0, np.arange(-3, 3))}

In [7]:
# Exhaustive search over `param_grid`, scored by R^2. No `cv` argument is
# given, so the library default splitter is used (the recorded run reports
# 5 folds). n_jobs=-1 runs the fits in parallel; verbose=1 prints progress.
hyper_parameters_search = GridSearchCV(
    estimator=ml_pipe,
    param_grid=param_grid,
    scoring="r2",
    n_jobs=-1,
    verbose=1,
)

In [8]:
# Run the grid search on the training data. Every candidate in the grid is
# cross-validated; the fitted search object is returned and shown as the
# cell's output.
hyper_parameters_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   35.8s finished


GridSearchCV(estimator=Pipeline(steps=[('prepro',
                                        ColumnTransformer(transformers=[('text',
                                                                         CountVectorizer(max_df=0.6,
                                                                                         max_features=20000),
                                                                         'Text'),
                                                                        ('num',
                                                                         StandardScaler(),
                                                                         ['n_words']),
                                                                        ('ord',
                                                                         OrdinalEncoder(categories=[['neg',
                                                                                                     'compound',
                 

In [9]:
# Display the pipeline chosen by the search (available once the search has
# been fitted); its Ridge step carries the selected alpha.
hyper_parameters_search.best_estimator_

Pipeline(steps=[('prepro',
                 ColumnTransformer(transformers=[('text',
                                                  CountVectorizer(max_df=0.6,
                                                                  max_features=20000),
                                                  'Text'),
                                                 ('num', StandardScaler(),
                                                  ['n_words']),
                                                 ('ord',
                                                  OrdinalEncoder(categories=[['neg',
                                                                              'compound',
                                                                              'neu',
                                                                              'pos']]),
                                                  ['sentiment'])])),
                ('Ridge', Ridge(alpha=100.0))])

In [11]:
# Cross-validate the tuned pipeline on the training data, keeping the
# training-fold scores alongside the validation-fold scores.
# NOTE(review): this reuses the same data the grid search used to pick
# alpha, so the validation score is likely optimistic - confirm a held-out
# set is evaluated elsewhere.
scores = cross_validate(
    hyper_parameters_search.best_estimator_,
    X_train,
    y_train,
    scoring="r2",
    return_train_score=True,
)
df = pd.DataFrame(scores)

# Summary table: one entry per model name, holding the mean of each
# cross-validation column (fit/score time, test score, train score).
# NOTE(review): despite its name, `results_df` is a plain dict; it is only
# turned into a DataFrame for display on the last line.
# NOTE(review): the recorded output shows train R^2 of about 0.97 against a
# validation R^2 of about 0.53, i.e. a large gap worth investigating.
results_df = {"Ridge": df.mean()}
pd.DataFrame(results_df).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Ridge,1.706095,0.410828,0.531897,0.968792
