In [33]:
# Import libraries
import pandas as pd
import numpy as np

# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Selection
from sklearn.model_selection import train_test_split

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Warnings
# Installs a blanket "ignore" filter with no category/module restriction, so
# every warning raised after this line (deprecation warnings included) is hidden.
import warnings
warnings.filterwarnings("ignore")

In [34]:
# Read the cleaned dataset (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='clean_data.csv')
# Preview one randomly drawn row
data.sample(n=1)

Unnamed: 0,index,food_category,food_department,food_family,store_sales,store_cost,unit_sales,promotion_name,sales_country,marital_status,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,media_type,cost
19606,19606,Snack Foods,Snack Foods,Food,5.24,1.834,4.0,Sales Galore,Mexico,M,...,20141.0,6393.0,4262.0,1.0,1.0,1.0,1.0,1.0,"Daily Paper, Radio",62.4


In [35]:
# Partition the columns by dtype: numeric ones vs. everything else
num = data.select_dtypes(include=np.number).columns.tolist()   # numeric columns
cat = data.select_dtypes(exclude=np.number).columns.tolist()   # non-numeric (categorical) columns

# Numeric columns kept out of the feature list: the 'index' column, the target
# 'cost', and five further columns dropped by name.
# list.remove raises ValueError if any of these names is not in `num`.
for dropped in (
    'index',
    'avg_cars_at_home1',
    'store_sqft',
    'grocery_sqft',
    'frozen_sqft',
    'meat_sqft',
    'cost',
):
    num.remove(dropped)

# Features are the remaining numeric columns followed by the categorical ones;
# the target is 'cost'
feature_columns = num + cat
x = data[feature_columns]
y = data['cost']

# Hold out 20% of the rows for testing; the seed fixes which rows are held out
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [36]:
# Numeric pipeline: fill missing values with each column's most frequent value,
# then standardise with StandardScaler.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: same imputation strategy, then one-hot encoding.
# The explicit `sparse=True` argument is intentionally omitted: the keyword was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, where
# passing it raises TypeError. Sparse output is the default under both names,
# so leaving it out behaves the same on old and new versions.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# ColumnTransformer: route the `num` columns through the numeric pipeline and
# the `cat` columns through the categorical pipeline
preprocessing_pipeline = ColumnTransformer([
    ('num_preprocessing', num_pipeline, num),
    ('cat_preprocessing', cat_pipeline, cat)
])

In [37]:
# Import models
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

# Alternatives that were tried (kept for reference):
# model = Lasso(alpha = 0.1)
# model = LinearRegression(fit_intercept=True)

# Best score of the candidates above.
# random_state is fixed because the tree permutes features at every split, so
# an unseeded fit can produce a different tree (and different scores) on each
# run. The value matches the seed already used for train_test_split.
model = DecisionTreeRegressor(random_state=1)

# Final pipeline: preprocessing followed by the estimator, so the same
# transformations are applied at fit and predict time
complete_pipeline = Pipeline([
    ('preprocessor', preprocessing_pipeline),
    ('estimator', model),
])

In [38]:
# Fit the full pipeline on the training split (fit returns the pipeline itself),
# then predict the held-out rows
y_pred = complete_pipeline.fit(x_train, y_train).predict(x_test)

# Report the pipeline's built-in score() on both splits, train first
for split_label, split_x, split_y in (
    ('Train score: ', x_train, y_train),
    ('Test score: ', x_test, y_test),
):
    print(split_label, complete_pipeline.score(split_x, split_y))

Train score:  1.0
Test score:  0.9979522740883948


In [None]:
# Export model.pkl
import pickle

# Serialise the fitted pipeline (preprocessing + estimator) to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() call never closed its handle.
# NOTE: pickle files execute code when loaded - only unpickle model.pkl from a
# trusted source.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(complete_pipeline, model_file)