In [31]:
import dataclasses

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import lightgbm
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import RFECV

In [4]:
@dataclasses.dataclass
class Data:
    """Container that keeps the train and test DataFrames together under one name."""
    # DataFrame holding the training split.
    train: pd.DataFrame
    # DataFrame holding the test split.
    test: pd.DataFrame

In [5]:
# Parquet inputs in (train, test) order; the loading cell indexes this tuple
# positionally, so the order matters.
FILES = (
    'discounts_train.parq.gzip',
    'discounts_test.parq.gzip',
)

# Notebook-wide display settings: let DataFrame reprs show up to 100 columns
# and use seaborn's dark grid theme for plots.
pd.set_option('display.max_columns', 100)
sns.set_style("darkgrid")

In [8]:
# Read both parquet files and bundle them; FILES is ordered (train, test).
data = Data(
    train=pd.read_parquet(FILES[0]),
    test=pd.read_parquet(FILES[1]),
)

In [9]:
# Preview the first rows of the training frame to check the columns that were loaded.
data.train.head()

Unnamed: 0,spend_1m_baby,spend_2m_baby,spend_3m_baby,spend_4m_baby,spend_5m_baby,spend_6m_baby,spend_1m_clothes,spend_2m_clothes,spend_3m_clothes,spend_4m_clothes,spend_5m_clothes,spend_6m_clothes,spend_1m_health,spend_2m_health,spend_3m_health,spend_4m_health,spend_5m_health,spend_6m_health,spend_1m_pet,spend_2m_pet,spend_3m_pet,spend_4m_pet,spend_5m_pet,spend_6m_pet,spend_1m_groceries,spend_2m_groceries,spend_3m_groceries,spend_4m_groceries,spend_5m_groceries,spend_6m_groceries,spend_1m_eletronic,spend_2m_eletronic,spend_3m_eletronic,spend_4m_eletronic,spend_5m_eletronic,spend_6m_eletronic,sales,discount,profit,age,gender,cust_state,tenure,sales_prediction_bins,sales_prediction
0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,45,15,30.0,55.0,10.0,160.0,0,0,0,0.0,0.0,0.0,3368.64,125.0,40.06,40,1.0,MA,6,8,1351.024765
1,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,5,5,0.0,0.0,10.0,0.0,0,0,0,0.0,0.0,0.0,2133.1,75.0,29.52,36,0.0,MG,10,4,1035.580387
2,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,25,30,35.0,0.0,20.0,50.0,0,0,0,0.0,0.0,0.0,2001.62,50.0,48.08,34,0.0,RJ,7,3,992.401825
3,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,55,135,120.0,10.0,40.0,130.0,0,0,0,0.0,0.0,0.0,1461.96,10.0,61.64,31,0.0,BA,7,1,919.720735
4,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,25,0,10.0,95.0,25.0,20.0,0,0,0,0.0,0.0,0.0,2743.72,100.0,34.44,32,1.0,PB,6,6,1176.485681


In [28]:
# Columns that must not enter the feature matrix. `profit` is the target (see
# `y` below); the other four are excluded from the predictors as well.
unused = ['sales', 'discount', 'profit', 'sales_prediction_bins', 'sales_prediction']
keep = data.train.columns.drop(unused)

# Impute missing values with 0 in numeric columns only. A blanket `fillna(0)`
# would also write the integer 0 into string columns such as `cust_state`,
# leaving a mixed str/int column that OneHotEncoder cannot encode.
X = data.train[keep]
X = X.fillna(dict.fromkeys(X.select_dtypes(include='number').columns, 0))

# Regression target.
y = data.train['profit']

In [32]:
# One-hot encode the state column; every other column is passed through as-is.
# - handle_unknown='ignore': a state that was not present when fitting is
#   encoded as all zeros at transform time instead of raising an error.
# - sparse_threshold=0: always return a dense array. StandardScaler centres the
#   data by default and refuses to do so on sparse input, so the pipeline must
#   not depend on ColumnTransformer's density heuristic for its output type.
cat_processor = ColumnTransformer(
    [
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ['cust_state'])
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Encode -> standardise -> recursive feature elimination with cross-validation,
# using a plain linear regression as the estimator that ranks the features.
# NOTE(review): no dummy level is dropped, so the one-hot columns are linearly
# dependent; the regression coefficients RFECV ranks on may be unstable for
# them. Left unchanged here because dropping a level would alter the selection.
pl = Pipeline([
    ('preprocessor', cat_processor),
    ('scaler', StandardScaler()),
    ('feature_selection', RFECV(estimator=LinearRegression()))
])

pl.fit(X, y)

In [45]:
# Boolean mask over the transformed columns marking the features RFECV kept.
mask = pl.named_steps['feature_selection'].get_support()

# The scaler sits between the preprocessor and the selector but keeps the
# column layout, so the preprocessor's output names line up with the mask.
rfe_select = pl.named_steps['preprocessor'].get_feature_names_out()[mask]

In [47]:
# Show the names of the transformed features that the RFECV step retained.
rfe_select

array(['ohe__cust_state_AC', 'ohe__cust_state_AL', 'ohe__cust_state_AM',
       'ohe__cust_state_AP', 'ohe__cust_state_BA', 'ohe__cust_state_CE',
       'ohe__cust_state_DF', 'ohe__cust_state_ES', 'ohe__cust_state_GO',
       'ohe__cust_state_MA', 'ohe__cust_state_MG', 'ohe__cust_state_MS',
       'ohe__cust_state_MT', 'ohe__cust_state_PA', 'ohe__cust_state_PB',
       'ohe__cust_state_PE', 'ohe__cust_state_PI', 'ohe__cust_state_PR',
       'ohe__cust_state_RJ', 'ohe__cust_state_RN', 'ohe__cust_state_RO',
       'ohe__cust_state_RR', 'ohe__cust_state_RS', 'ohe__cust_state_SC',
       'ohe__cust_state_SE', 'ohe__cust_state_SP', 'ohe__cust_state_TO',
       'remainder__spend_1m_baby', 'remainder__spend_6m_baby',
       'remainder__spend_1m_clothes', 'remainder__spend_2m_clothes',
       'remainder__spend_3m_clothes', 'remainder__spend_4m_clothes',
       'remainder__spend_5m_clothes', 'remainder__spend_6m_clothes',
       'remainder__spend_1m_health', 'remainder__spend_2m_health',
      