In [17]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings("ignore")

In [18]:
# Read clean_data.csv into a DataFrame, then display one randomly drawn row
# as a quick look at the available columns.
data = pd.read_csv(filepath_or_buffer='clean_data.csv')
data.sample(n=1)

Unnamed: 0,index,food_category,food_department,food_family,store_sales,store_cost,unit_sales,promotion_name,sales_country,marital_status,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,media_type,cost
47110,47110,Candy,Snacks,Food,5.43,2.6607,3.0,High Roller Savings,USA,S,...,15337.0,5011.0,3340.0,1.0,1.0,1.0,1.0,1.0,Daily Paper,101.84


In [19]:
# Keep only the columns of `data` whose dtype is not numeric.
continuo = data.select_dtypes(exclude=[np.number])

In [20]:
# From the non-numeric subset, also drop the object-dtype columns, leaving
# only columns that are neither numeric nor object.
# The builtin `object` replaces `np.object`: that alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
continuo = continuo.select_dtypes(exclude=object)

In [21]:
# Columns that must never be used as predictors:
#   'Unnamed: 0' and 'index' only identify a row (in the preview they carry the
#   same value), and 'cost' is the regression target.
# Filtering by membership, rather than list.remove, also keeps this cell working
# if one of the identifier columns is already absent from the frame.
non_feature_columns = {'Unnamed: 0', 'index', 'cost'}

# Numeric predictors (identifiers and target excluded) and non-numeric predictors.
num = [
    column
    for column in data.select_dtypes(include=np.number).columns
    if column not in non_feature_columns
]
cat = data.select_dtypes(exclude=np.number).columns.to_list()

x = data[num + cat]
y = data['cost']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [22]:
# Numeric columns: fill missing entries with the column's most frequent value,
# then standardise with StandardScaler.
# NOTE(review): mode imputation is unusual for continuous columns - confirm it
# is intended rather than 'median' / 'mean'.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Non-numeric columns: fill missing entries with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform time
# instead of raising.
# The explicit `sparse=True` was dropped: sparse output is already the encoder's
# default, and the keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so passing it fails on current releases.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list to its matching sub-pipeline.
preprocessing_pipeline = ColumnTransformer([
    ('num_preprocessing', num_pipeline, num),
    ('cat_preprocessing', cat_pipeline, cat)
])

In [23]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so repeated fits give the same model: scikit-learn permutes the
# candidate features at every split, so ties between equally good splits can be
# broken differently from run to run unless random_state is fixed. The value
# matches the seed already used for train_test_split.
model = DecisionTreeRegressor(random_state=1)

# Chain preprocessing and the estimator so both are fitted on training data only.
complete_pipeline = Pipeline([
    ('preprocessor', preprocessing_pipeline),
    ('estimator', model),
])

In [25]:
# Fit preprocessing and estimator on the training split, then predict the
# held-out rows.
complete_pipeline.fit(x_train, y_train)
y_pred = complete_pipeline.predict(x_test)

# Report the pipeline's score (R^2 for a scikit-learn regressor) on both splits,
# training split first.
for label, features, target in (
    ('Train score: ', x_train, y_train),
    ('Test score: ', x_test, y_test),
):
    print(label, complete_pipeline.score(features, target))

Train score:  1.0
Test score:  0.9985605054790069
