In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import stats
import prince

In [2]:
df_raw = pd.read_csv('../data/raw/problem_merged_data.csv')

In [3]:
x_train, x_test, y_train, y_test = train_test_split(df_raw.drop('cpe', axis=1), df_raw.get('cpe'), test_size=0.25, random_state=0, shuffle=True)
df_train = pd.concat([x_train, y_train], axis=1)
df_test = pd.concat([x_test, y_train], axis=1)

In [4]:
def apply_boxcox_transformation(df: pd.DataFrame, col: str, lambda_: float = None) -> pd.DataFrame:

    if lambda_ is None:
        df[f'{col}_boxcox'], lambda_ = stats.boxcox(df[col])
        return df, lambda_
    else:
        df[f'{col}_boxcox'] = stats.boxcox(df[col], lambda_)
        return df

In [5]:
for col in ['active_days', 'bid', 'budget']:
    x_train_transformed, lambda_ = apply_boxcox_transformation(x_train, col)
    x_test_transformed = apply_boxcox_transformation(x_test, col, lambda_)

In [6]:
cols_to_transform = ['active_days_boxcox', 'bid_boxcox', 'budget_boxcox']

scaler = StandardScaler()

# Train data
x_train_numeric_std = pd.DataFrame(
    scaler.fit_transform(x_train_transformed[cols_to_transform]),
    index=x_train_transformed.index,
    columns=[f"{col}_std" for col in cols_to_transform]
    )

# Test data
x_test_numeric_std = pd.DataFrame(
    scaler.fit_transform(x_test_transformed[cols_to_transform]),
    index=x_test_transformed.index,
    columns=[f"{col}_std" for col in cols_to_transform]
    )

In [7]:
vectorizer = TfidfVectorizer(stop_words='english')

# Train data
x_train_transformed = x_train_transformed.assign(
    headline_storySummary=x_train_transformed['headline'] + '. ' + x_train_transformed['storySummary']
)
X = vectorizer.fit_transform(x_train_transformed.headline_storySummary)
x_train_transformed_text = pd.DataFrame(
    X.toarray(), index=x_train_transformed.index, columns=vectorizer.get_feature_names_out()
    )

# Test data
x_test_transformed = x_test_transformed.assign(
    headline_storySummary=x_test_transformed['headline'] + '. ' + x_test_transformed['storySummary']
)
X = vectorizer.transform(x_test_transformed.headline_storySummary)
x_test_transformed_text = pd.DataFrame(
    X.toarray(), index=x_test_transformed.index, columns=vectorizer.get_feature_names_out()
    )

In [8]:
famd = prince.FAMD(
    n_components=15,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
    handle_unknown="ignore"
)

dataset_for_famd_train = pd.concat([x_train[
    ['channel', 'targetAge', 'targetOs', 'targetGender', 'CATEGORY_1',]
    ], x_train_numeric_std], axis=1)

famd = famd.fit(dataset_for_famd_train)
reduced_dim_train = famd.transform(dataset_for_famd_train)

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

dataset_for_famd_test = pd.concat([x_test[
    ['channel', 'targetAge', 'targetOs', 'targetGender', 'CATEGORY_1',]
    ], x_test_numeric_std], axis=1)
reduced_dim_test = famd.transform(dataset_for_famd_test)


regressor = RandomForestRegressor()
regressor.fit(reduced_dim_train, y_train)
r2_train = r2_score(y_train, regressor.predict(reduced_dim_train))
r2_test = r2_score(y_test, regressor.predict(reduced_dim_test))
rmse_train = np.sqrt(mean_squared_error(y_train, regressor.predict(reduced_dim_train)))
rmse_test = np.sqrt(mean_squared_error(y_test, regressor.predict(reduced_dim_test)))

In [11]:
r2_train, r2_test, rmse_train, rmse_test

(0.8573375399089833,
 0.21262556998647497,
 0.31578362556141865,
 0.79236918862685)

In [12]:
rmse_test, regressor.predict(reduced_dim_test).std(), y_test.std()

(0.79236918862685, 0.5125948327126805, 0.894032545733008)