In [None]:
#Importing the required libraries

import featuretools as ft
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('data_basket.csv') #Reading the merged and cleaned dataset 

In [None]:
df.columns #Display the dataset columns

In [None]:
#Defining the product entity

df_product = df[['product_id', 'manufacturer', 'department', 'brand','commodity_desc', 'sub_commodity_desc']].drop_duplicates()

In [None]:
df_product

In [None]:
#Defining the household entity

df_household = df[['household_key', 'age_desc', 'marital_status_code',
                   'income_desc', 'homeowner_desc', 'hh_comp_desc', 'household_size_desc',
                   'kid_category_desc']].drop_duplicates()

In [None]:
df_household

In [None]:
#Transaction entity: one row per row of df, restricted to the transaction-level columns
#(no de-duplication here, unlike the product and household tables)

df_transaction = df.loc[:, [
    'household_key', 'basket_id', 'date_time', 'product_id',
    'quantity', 'store_id', 'sales_value', 'discount',
    'basket_sales_value', 'basket_discount', 'week_no',
]]

In [None]:
df_transaction.reset_index(inplace = True, drop = False)

In [None]:
df_transaction

In [None]:
#Defining the entityset

es = ft.EntitySet(id = 'Testing')

In [None]:
#Defining the entities and its primary keys

es = es.add_dataframe(dataframe_name = 'product', dataframe = df_product, index = 'product_id')
es = es.add_dataframe(dataframe_name = 'household', dataframe = df_household, index = 'household_key')
es = es.add_dataframe(dataframe_name = 'transaction', dataframe = df_transaction, index = 'index')

In [None]:
#Link the tables: each transaction row points at one household and one product (parent -> child)

es = es.add_relationship(
    parent_dataframe_name = 'household',
    parent_column_name = 'household_key',
    child_dataframe_name = 'transaction',
    child_column_name = 'household_key',
)
es = es.add_relationship(
    parent_dataframe_name = 'product',
    parent_column_name = 'product_id',
    child_dataframe_name = 'transaction',
    child_column_name = 'product_id',
)

In [None]:
#Generating the features and put it all in one matrix or list

feature_matrix, feature_defs = ft.dfs(entityset = es,
                                     target_dataframe_name = 'transaction')

In [None]:
feature_matrix

In [None]:
feature_defs

In [None]:
#Build the model inputs from the DFS feature matrix.
#FIRST_FEATURE_POSITION: columns before this position are left out of X.
#NOTE(review): the original note says the skipped leading columns are the non-float ones; this is purely
#positional and depends on the column order ft.dfs produced -- confirm against feature_matrix.dtypes.
FIRST_FEATURE_POSITION = 25
TARGET_COLUMN = 'basket_sales_value'

#The target is dropped from X explicitly (a no-op when it is not in the slice) so that a shift in
#column order can never leak the value being predicted into the features
X = feature_matrix.iloc[:, FIRST_FEATURE_POSITION:].drop(columns = [TARGET_COLUMN], errors = 'ignore')
y = feature_matrix[TARGET_COLUMN] #Target variable

In [None]:
#Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 42,
    test_size = 0.2,
)

In [None]:
#Fit a Random Forest Regressor on the training split.
#Only n_estimators (10 trees) and random_state (fixed seed) are set explicitly; every other setting is left unspecified

rf_regressor = RandomForestRegressor(random_state = 42, n_estimators = 10)
rf_regressor.fit(X = X_train, y = y_train)

In [None]:
#Using the algorithm to get the feature importances

feature_importances = pd.Series(rf_regressor.feature_importances_, index=X.columns)

In [None]:
#Putting the importances of features into one dataframe for analysis convenience

feature_importances_pd = pd.DataFrame({"Features" : pd.DataFrame(X_train).columns, "Importances" : feature_importances})

In [None]:
feature_importances_pd.set_index("Importances")

In [None]:
feature_importances_pd = feature_importances_pd.sort_values("Importances")

In [None]:
feature_importances_pd

In [None]:
#Bar chart of the last 10 rows of the sorted table, i.e. the 10 highest importances in ascending order

feature_importances_pd.tail(10).plot(kind = 'bar', color = 'teal')

In [None]:
#Setting the threshold 0.05

sfm = SelectFromModel(rf_regressor, threshold=0.05) 
sfm.fit(X_train, y_train)

In [None]:
X_train_selected = sfm.transform(X_train)
X_test_selected = sfm.transform(X_test)

In [None]:
#Using the get_support() function to display the features that exceed the threshold value

selected_feature_names = X.columns[sfm.get_support()]

In [None]:
selected_feature_names #Outputting the selected features based on the Random Forest Regressor algorithm