<a href="https://colab.research.google.com/github/codemishka/Prediction_of_Product_Sales/blob/main/Untitled21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prediction of Product Sales
> Mishka Janghbahadur
>version 6.0 (Project Final Core)

## EDA Functions from Lessons

### Univariate EDA Functions

In [None]:
# Basic imports for functions
import matplotlib.pyplot as plt
import seaborn as sns

# UNIVARIATE PLOTTING FUNCTIONS FOR EDA
def explore_categorical(df, x, fillna = True, placeholder = 'MISSING',
                        figsize = (6,4), order = None):
  """Plot a seaborn countplot for one categorical column and print a summary.

  The summary reports the null count/percentage, the number of unique values,
  the most common value, and whether the column is constant or quasi-constant
  (most common value in more than 98% of rows).

  Parameters
  ----------
  df : DataFrame holding column `x`. It is copied, never modified.
  x : name of the column to explore.
  fillna : if truthy, nulls in `x` are replaced by `placeholder` before
      plotting, so they appear as their own bar and count as a value.
  placeholder : label used for nulls when `fillna` is truthy.
  figsize : (width, height) passed to plt.subplots.
  order : optional category order passed to sns.countplot.

  Returns
  -------
  (fig, ax) : the matplotlib Figure and Axes of the countplot.
  """
  # Work on a copy so the caller's dataframe is left untouched
  temp_df = df.copy()
  # Null statistics must be taken before the placeholder hides the nulls
  null_count = temp_df[x].isna().sum()
  null_perc = null_count/len(temp_df)* 100
  if fillna:
    temp_df[x] = temp_df[x].fillna(placeholder)

  fig, ax = plt.subplots(figsize=figsize)
  sns.countplot(data=temp_df, x=x, ax=ax, order=order)
  # Rotate the existing tick labels in place for long names; this avoids
  # re-assigning label text through Axes.set_xticklabels
  plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
  ax.set_title(f"Column: {x}", fontweight='bold')

  # Fix layout and show plot (before print statements)
  fig.tight_layout()
  plt.show()

  # Print null value info
  print(f"- NaN's Found: {null_count} ({round(null_perc,2)}%)")
  # Cardinality is measured after filling, so the placeholder is included
  nunique = temp_df[x].nunique()
  print(f"- Unique Values: {nunique}")

  # value_counts sorts by frequency, so the first entry is the most common value
  val_counts = temp_df[x].value_counts(dropna=False)
  most_common_val = val_counts.index[0]
  freq = val_counts.values[0]
  perc_most_common = freq / len(temp_df) * 100

  print(f"- Most common value: '{most_common_val}' occurs {freq} times ({round(perc_most_common,2)}%)")
  # Flag constant / quasi-constant columns (most common value in > 98% of rows)
  if perc_most_common > 98:
    print(f"\n- [!] Warning: '{x}' is a constant or quasi-constant feature and should be dropped.")
  else:
    print("- Not constant or quasi-constant.")
  return fig, ax


def explore_numeric(df, x, figsize=(6,5)):
  """Plot a histogram above a boxplot (shared x-axis) for one numeric column.

  After showing the figure, prints the null count/percentage, the number of
  unique values, the most common value, and whether the column is constant or
  quasi-constant (most common value in more than 98% of rows).

  Returns the tuple (fig, axes), where axes holds the histogram Axes followed
  by the boxplot Axes.
  """
  column = df[x]
  n_rows = len(df)

  # Null statistics for the printed summary
  missing = column.isna().sum()
  missing_pct = missing/n_rows* 100

  # Two stacked panels: the histogram takes 70% of the height, the boxplot 30%
  fig, axes = plt.subplots(nrows=2, figsize=figsize, sharex=True,
                           gridspec_kw={'height_ratios':[0.7,0.3]})
  hist_ax, box_ax = axes
  sns.histplot(data=df, x=x, ax=hist_ax)
  sns.boxplot(data=df, x=x, ax=box_ax)
  hist_ax.set_title(f"Column: {x}", fontweight='bold')

  # Tighten the layout and render the figure before any text is printed
  fig.tight_layout()
  plt.show()

  print(f"- NaN's Found: {missing} ({round(missing_pct,2)}%)")
  print(f"- Unique Values: {column.nunique()}")

  # value_counts sorts by frequency, so the first row is the most common value
  top = column.value_counts(dropna=False).head(1)
  top_value = top.index[0]
  top_count = top.values[0]
  top_pct = top_count / n_rows * 100

  print(f"- Most common value: '{top_value}' occurs {top_count} times ({round(top_pct,2)}%)")

  # Flag constant / quasi-constant columns (most common value in > 98% of rows)
  if top_pct > 98:
    print(f"\n- [!] Warning: '{x}' is a constant or quasi-constant feature and should be dropped.")
  else:
    print("- Not constant or quasi-constant.")
  return fig, axes


### Multivariate Feature vs. Target Functions.

In [None]:
"""MULTIVARIATE PLOTTING FUNCTIONS VS. NUMERIC TARGET"""

def plot_categorical_vs_target(df, x, y='charges',figsize=(6,4),
                            fillna = True, placeholder = 'MISSING',
                            order = None):
  """Plot the mean of a numeric target per category with the raw points behind it.

  A semi-transparent seaborn barplot shows the mean of `y` for each category of
  `x`; a stripplot drawn underneath (zorder=0) shows the spread of the rows.

  Parameters
  ----------
  df : DataFrame holding columns `x` and `y`. It is copied, never modified.
  x : name of the categorical column.
  y : name of the numeric target column. The default 'charges' is kept for
      backward compatibility; pass the target column explicitly otherwise.
  figsize : (width, height) passed to plt.subplots.
  fillna : if truthy, nulls in `x` are plotted as `placeholder`; otherwise
      rows with a null `x` are dropped so no 'nan' group is drawn.
  placeholder : label used for nulls when `fillna` is truthy.
  order : optional category order, applied to both plots.

  Returns
  -------
  (fig, ax) : the matplotlib Figure and Axes. plt.show() is not called here.
  """
  # Work on a copy so the caller's dataframe is left untouched
  temp_df = df.copy()
  if fillna:
    temp_df[x] = temp_df[x].fillna(placeholder)
  # or drop nulls to prevent an unwanted 'nan' group in the stripplot
  else:
    temp_df = temp_df.dropna(subset=[x])

  fig, ax = plt.subplots(figsize=figsize)

  # Barplot of the per-category mean (errorbar=None hides the error bars)
  sns.barplot(data=temp_df, x=x, y=y, ax=ax, order=order, alpha=0.6,
              linewidth=1, edgecolor='black', errorbar=None)

  # Stripplot of the individual rows, coloured per category, drawn behind the bars
  sns.stripplot(data=temp_df, x=x, y=y, hue=x, ax=ax,
                order=order, hue_order=order, legend=False,
                edgecolor='white', linewidth=0.5,
                size=3,zorder=0)
  # Rotate the existing tick labels in place; this avoids re-assigning label
  # text through Axes.set_xticklabels
  plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

  # Add a title
  ax.set_title(f"{x} vs. {y}", fontweight='bold')
  fig.tight_layout()
  return fig, ax


def plot_numeric_vs_target(df, x, y='charges',
                           figsize=(6,4)):
  """Draw a seaborn regplot of `x` against `y` and put the correlation in the title.

  The correlation is taken from DataFrame.corr() on the two columns, rounded
  to 2 decimals. The figure is shown with plt.show() before returning.

  Returns the tuple (fig, ax).
  """
  # Correlation between the feature and the target, rounded for the title
  r = df[[x,y]].corr().round(2).loc[x,y]

  # Scatter points get a white edge and slight transparency
  point_style = {'ec':'white','lw':1,'alpha':0.8}
  fig, ax = plt.subplots(figsize=figsize)
  sns.regplot(data=df, x=x, y=y, ax=ax, scatter_kws=point_style)
  ax.set_title(f"{x} vs. {y} (r = {r})", fontweight='bold')

  # Render the figure now so it appears before any output the caller prints
  plt.show()

  return fig, ax

## Import Libraries

In [None]:
#Importing the necessary libraries
import pandas as pd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split

In [None]:
## Typical Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Modeling & preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

## Load Data

In [None]:
 # Mount Google Drive at /content/drive so files stored there can be read
 from google.colab import drive
 drive.mount('/content/drive')

In [None]:
# Absolute path of the sales CSV under the mounted Google Drive
fname='/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/Data/sales_predictions_2023.csv'
# Load the CSV into `df`, the frame used by the inspection, cleaning and EDA cells
df = pd.read_csv(fname)

In [None]:
 # NOTE(review): an identical mount cell already runs before the CSV is read,
 # so this second mount looks redundant — confirm and remove.
 from google.colab import drive
 drive.mount('/content/drive')

## Data Inspection

In [None]:
# Print the column summary, then preview the first rows
df.info()
df.head()

In [None]:
# Unpack the (rows, columns) tuple returned by DataFrame.shape
num_rows, num_columns = df.shape

# Report both counts on separate lines
print(f"Number of rows: {num_rows}\nNumber of columns: {num_columns}")

In [None]:
# Drop duplicate rows from df in place, then count the duplicates that remain
df.drop_duplicates(inplace = True)
df.duplicated().sum()


In [None]:
# Show the dtype of every column
df.dtypes
# Observation: no columns are unnamed

In [None]:
# Group the column names of df by their dtype and print one line per dtype
data_types = df.dtypes

# Maps each dtype to the list of column names that have it
columns_by_type = {}

# Series.items() is used because Series.iteritems() was removed in pandas 2.0
for column_name, data_type in data_types.items():
    columns_by_type.setdefault(data_type, []).append(column_name)

# Print columns grouped by data type
print("Columns grouped by data type:")
for data_type, columns in columns_by_type.items():
    print(f"{data_type}: {', '.join(columns)}")

In [None]:
# Count the nulls in each column and express them as a percentage of all rows
null_sums = df.isna().sum()
null_percentage = null_sums/len(df) * 100

# Only the last expression of a cell is displayed, so both series are shown
# together in one table instead of leaving `null_sums` as a silent expression
pd.DataFrame({'null_count': null_sums, 'null_percent': null_percentage})

In [None]:
# Visualize the missing values of df with a missingno matrix plot
# (the trailing ';' suppresses the text repr of the returned object)
msno.matrix(df);

## Data Cleaning

### Ordinal and Categorical Features

In [None]:
# Names of the object-dtype (string) columns
string_cols = df.select_dtypes("object").columns

# Print the frequency of every distinct value, one column at a time
for col, values in df[string_cols].items():
  print(f"Value Counts for {col}")
  print(values.value_counts())
  # Blank lines between columns keep the output readable
  print('\n')


In [None]:
# Item_Identifier does not require further adjustments
# Item_Fat_Content - ordinal values should be either low or regular fat - edit required
# Item_Type does not require further adjustments
# Outlet_Identifier does not require further adjustments
# Outlet_Size is ordinal - "High" needs to be renamed to "Large" - edit required
# Outlet_Location_Type is ordinal but doesn't require editing
# Outlet_Type - ordinal values should be grouped by supermarket type - edit required

In [None]:
# Cleaning Item_Fat_Content: map each inconsistent spelling to its standard label
rename_dict = {"low fat": "Low Fat",
               "LF": "Low Fat",
               "reg": "Regular"}

# Replace whole values only. With regex=True the keys would be treated as
# patterns and any value merely containing one of them would be rewritten.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(rename_dict)
# Confirm the standardized categories
df['Item_Fat_Content'].value_counts()

In [None]:
# Rename the Outlet_Size value "High" to "Large".
# Assigning the result back updates df itself; calling replace(inplace=True)
# on the selected column is chained in-place mutation, which pandas is
# phasing out (it stops updating df under copy-on-write).
df["Outlet_Size"] = df["Outlet_Size"].replace({"High":"Large"})
df["Outlet_Size"].value_counts()

In [None]:
# Cleaning Outlet_Type: relabel "Grocery Store" as "Supermarket Type4".
# Assigning the result back updates df itself; calling replace(inplace=True)
# on the selected column is chained in-place mutation, which pandas is
# phasing out (it stops updating df under copy-on-write).
df["Outlet_Type"] = df["Outlet_Type"].replace({"Grocery Store":"Supermarket Type4"})
df["Outlet_Type"].value_counts()

### Numeric Features

In [None]:
# Summary statistics from DataFrame.describe(), rounded to 2 decimals
description = df.describe().round(2)
description

In [None]:
# Limit the summary to the min, 25%, 75% and max rows
description.loc[['min','25%','75%','max']]

In [None]:
# Item_Outlet_Sales: the max value is much higher than the 75th percentile,
# so list the rows with Item_Outlet_Sales >= 10000 for a closer look
filter_high_price = df['Item_Outlet_Sales'] >= 10000
df[filter_high_price]

In [None]:
# Show the 5 rows with the highest Item_Outlet_Sales to compare against the max
df.sort_values("Item_Outlet_Sales", ascending=False).head()

# Conclusion: the max value does not appear to be an outlier - no need to edit

## Data Visualisation

Using histograms and boxplots to visualize numeric data

In [None]:
# Copy of the dataframe used only for visualization and EDA, so that the
# placeholder fills in the next cells do not touch df
temp_df = df.copy()

In [None]:
x = 'Item_Weight'
placeholder = 'MISSING'
# In the temporary df, fill the missing values of Item_Weight with the placeholder.
# NOTE(review): Item_Weight is treated as a numeric feature elsewhere in this
# notebook, so a string placeholder mixes strings into a numeric column — confirm
# this is intended.
temp_df[x] = temp_df[x].fillna(placeholder)

In [None]:

x = 'Outlet_Size'
placeholder = 'MISSING'
# In the temporary df, fill the missing values of Outlet_Size with the placeholder
temp_df[x] = temp_df[x].fillna(placeholder)

Numeric Features

In [None]:
#Item_Weight
#Item_Visibility
#Item_MRP
#Outlet_Establishment_Year
#Item_Outlet_Sales

In [None]:

# Histograms to view the distribution of every numeric feature in df
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns

# Apply the style before the figure is created: a style set after plt.show()
# only affects later figures, which left this first figure in a different style
plt.style.use('dark_background')

# Size the grid from the number of features instead of assuming at most 6
n_cols = 3
n_rows = max(1, -(-len(numerical_features) // n_cols))  # ceiling division

# Plot histograms, one subplot per feature
plt.figure(figsize=(12, 8))
for i, feature in enumerate(numerical_features, 1):
    plt.subplot(n_rows, n_cols, i)
    df[feature].hist(bins=30, edgecolor='k')
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Numeric columns to plot, and one pastel colour per column
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
pastel_palette = sns.color_palette("pastel", len(numerical_features))

# One histogram (with KDE curve) per feature on a 2 x 3 grid
plt.figure(figsize=(12, 8))
for position, (feature, colour) in enumerate(zip(numerical_features, pastel_palette), start=1):
    plt.subplot(2, 3, position)
    sns.histplot(data=df, x=feature, bins=30, edgecolor='k', kde=True, color=colour)
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()
plt.style.use('dark_background')

In [None]:
# Numerical features to plot
numerical_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']

# One pastel colour per feature (the palette was previously built but never used)
pastel_palette = sns.color_palette("pastel", len(numerical_features))

# Apply the style once, before any figure is created
plt.style.use('dark_background')

# One boxplot figure per numerical feature
for feature, colour in zip(numerical_features, pastel_palette):
    fig, ax = plt.subplots(figsize=(6, 4))
    # An explicit `color` is used because passing `palette` without `hue`
    # is deprecated in recent seaborn releases
    sns.boxplot(data=df, y=feature, color=colour, ax=ax)
    ax.set_title(f'Boxplot of {feature}')
    ax.set_ylabel('Value')
    fig.tight_layout()
    plt.show()








In [None]:
# Use seaborn's pastel palette as the default colour cycle
sns.set_palette("pastel")

# Categorical features to plot
categorical_features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# One countplot figure per categorical feature
for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=feature, palette="pastel", ax=ax)
    ax.set_title(f'Countplot of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    # Rotate x-axis labels for better readability
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()
    plt.style.use('dark_background')



In [None]:
# Heatmap to view the pairwise correlation between the numeric features
correlation_matrix = df.corr(numeric_only = True)

# Set seaborn's default palette to pastel (the heatmap below is coloured by
# its own `cmap`, not by this palette)
sns.set_palette("pastel")

# Plot the annotated heatmap with the 'coolwarm' colormap, 2-decimal labels
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
plt.style.use('dark_background')

# **Explanatory Data Analysis**

In [None]:
# One "Set1" colour per distinct Outlet_Type value
palette = sns.color_palette("Set1", n_colors=len(df['Outlet_Type'].unique()))

# Scatter plot of Item_MRP vs. Item_Outlet_Sales with a regression line per Outlet_Type.
# NOTE(review): line_kws forces every regression line to blue, so the lines
# are not colour-matched to their Outlet_Type — confirm this is intended.
sns.lmplot(x="Item_MRP", y="Item_Outlet_Sales", hue="Outlet_Type", data=df,
           scatter_kws={"edgecolor": "white"}, line_kws={"color": "blue"},
           palette=palette, height=6, aspect=1.5)

# Show the plot
plt.show()
plt.style.use('dark_background')

In [None]:
# Item_MRP vs. Item_Outlet_Sales with one facet per Item_Type, wrapped 2 per row
g = sns.lmplot(df, y="Item_Outlet_Sales", x="Item_MRP", hue="Item_Type",
               scatter_kws={"edgecolor":"white"},aspect=1.5,
               col="Item_Type", col_wrap=2)
plt.style.use('dark_background')

In [None]:
# Count of rows per Item_Type
fig, ax = plt.subplots()
sns.histplot(df, x="Item_Type", hue="Item_Type", ax=ax)
ax.set_title("Types of Products", fontweight="bold")
# The y axis of this histogram holds counts, not sales, so it is formatted
# with a thousands separator only (no '$' prefix)
fmt = '{x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)
ax.yaxis.set_major_formatter(tick)
# Rotate the existing category labels in place
plt.setp(ax.get_xticklabels(), rotation=45, ha="right");
plt.style.use('dark_background')

In [None]:
# Item_MRP vs. Item_Outlet_Sales in a single panel, coloured by Item_Type
sns.lmplot(df, y="Item_Outlet_Sales", x="Item_MRP", hue="Item_Type", scatter_kws={"edgecolor":"black"});
# plt.title applies to the current axes
plt.title("Sales vs. Item MRP & Item Type", fontweight="bold");
plt.style.use('dark_background')

In [None]:
# Item_Outlet_Sales per Item_Type as a categorical scatter (seaborn FacetGrid)
g = sns.catplot(data=df, y="Item_Outlet_Sales", x="Item_Type", hue="Item_Type")
# Putting $ and , on the Sales axis
fmt = '${x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)
# Format the catplot's own Axes. The previous code used `ax`, a variable left
# over from an earlier cell, so the formatter was applied to a different figure.
g.ax.yaxis.set_major_formatter(tick)
# Rotate the category labels (the stray `label=` keyword was not needed)
g.set_xticklabels(rotation=45, ha="right");
g.tight_layout();
plt.style.use('dark_background')

In [None]:
# Barplot of Item_Outlet_Sales per Item_Type
ax= sns.barplot(data=df, y="Item_Outlet_Sales", x="Item_Type");
# Putting $ and , on the Sales axis
fmt = '${x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)
ax.yaxis.set_major_formatter(tick)
# Rotate the category labels so they do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right");
# Dashed grid lines
ax.grid(ls="--");
plt.style.use('dark_background')

In [None]:
# Distribution of Item_Outlet_Sales per Item_Type (sales on the x axis)
ax = sns.boxenplot(data=df, x="Item_Outlet_Sales", y="Item_Type")
# Putting $ and , on the Sales axis
fmt = '${x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)
ax.xaxis.set_major_formatter(tick)
# Rotate the labels without Axes.set_xticklabels: that call would swap the
# '$' formatter set above for a fixed list of label strings
ax.tick_params(axis="x", labelrotation=45)
plt.setp(ax.get_xticklabels(), ha="right");
plt.style.use('dark_background')

## Machine Learning Model

In [None]:
# Re-read the CSV from disk into a separate frame, df_copy, for modeling

fpath="/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/Data/sales_predictions_2023.csv"
df_copy = pd.read_csv(fpath)
df_copy.head()

In [None]:
# Features removed before modeling, chosen from the correlation heatmap
columns_to_drop = ["Item_Identifier", "Item_Weight",
                   "Outlet_Establishment_Year", "Item_Fat_Content"]
df_copy.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Drop rows whose Item_Outlet_Sales value is missing.
# These rows must be dropped because this column is our target.
# (With a single column in `subset`, how='all' selects the same rows as the default.)

df_copy = df_copy.dropna(subset = ['Item_Outlet_Sales'], how = 'all')

In [None]:
## Define X and y
target = 'Item_Outlet_Sales'

# NOTE(review): X and y are copies taken here, before the drop_duplicates and
# replace cells that follow, so those later changes to df_copy are not
# reflected in them. X and y are re-derived from df_copy in a later cell.
X = df_copy.drop(columns=target).copy()
y = df_copy[target].copy()
X.head()

In [None]:
# Drop duplicate rows from df_copy, then confirm none remain in that same frame
# (the check previously counted duplicates in `df`, the EDA frame)
df_copy.drop_duplicates(inplace = True)
df_copy.duplicated().sum()


In [None]:
# Rename the Outlet_Size value "High" to "Large" in df_copy.
# Assigning the result back updates df_copy itself; calling
# replace(inplace=True) on the selected column is chained in-place mutation,
# which pandas is phasing out (it stops updating the frame under copy-on-write).
df_copy["Outlet_Size"] = df_copy["Outlet_Size"].replace({"High":"Large"})
df_copy["Outlet_Size"].value_counts()

In [None]:
# Cleaning Outlet_Type in df_copy: relabel "Grocery Store" as "Supermarket Type4".
# Assigning the result back updates df_copy itself; calling
# replace(inplace=True) on the selected column is chained in-place mutation,
# which pandas is phasing out (it stops updating the frame under copy-on-write).
df_copy["Outlet_Type"] = df_copy["Outlet_Type"].replace({"Grocery Store":"Supermarket Type4"})
df_copy["Outlet_Type"].value_counts()

## Linear Regression

### Create Pipeline for Numericals

In [None]:
# Performing a train-test split; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
# Preview the first rows of the training features and the training target
display(X_train.head(), y_train.head())

In [None]:
# Selector for the object-dtype columns; calling it lists the matches in X_train
cat_selector = make_column_selector(dtype_include = 'object')
cat_selector(X_train)

In [None]:
# Categorical preprocessing: fill nulls with the most frequent value, then
# one-hot encode (categories unseen during fit are ignored at transform time)
#impute_cat = SimpleImputer(strategy='constant', fill_value = "Missing")
impute_cat = SimpleImputer(strategy='most_frequent')
# `sparse_output` replaces the `sparse` argument, which newer scikit-learn
# releases no longer accept; this matches the encoder built later in the notebook
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

cat_pipe = make_pipeline(impute_cat,encoder)
cat_pipe

In [None]:
# Fit the categorical pipeline on the selected training columns and show its output
cat_pipe.fit_transform(X_train[cat_selector(X_train)])

In [None]:
# Creating a numeric data selector; calling it lists the matches in X_train
num_selector = make_column_selector(dtype_include='number')
num_selector(X_train)

In [None]:
# Scaler applied to the numeric columns
scaler = StandardScaler()

In [None]:
# Fit the scaler on the selected numeric training columns and show its output
scaler.fit_transform(X_train[num_selector(X_train)])

In [None]:
# Combine both branches: cat_pipe on the object columns, scaler on the numeric columns
preprocessor = make_column_transformer((cat_pipe,cat_selector),
                                       (scaler,num_selector))
preprocessor

In [None]:
# Preview the modeling frame
df_copy.head()

In [None]:
# Names of the object-dtype (string) columns of df_copy
string_cols = df_copy.select_dtypes("object").columns
string_cols

In [None]:
# Print the value counts of every string column of df_copy
for col in string_cols:
  print(f"Value Counts for {col}")
  print(df_copy[col].value_counts())
  # Increasing readability by adding an empty line
  print('\n')

In [None]:
# Show the dtype of every column of df_copy
df_copy.dtypes

In [None]:
# Define features and target from the current state of df_copy
X = df_copy.drop(columns = 'Item_Outlet_Sales')
y = df_copy['Item_Outlet_Sales']
# Train test split; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)


### Making a Preprocessing Pipeline

In [None]:
# Selector for the object-dtype columns; calling it lists the matches in X_train
cat_selector = make_column_selector(dtype_include = 'object')
cat_selector(X_train)

In [None]:
# Categorical preprocessing: fill nulls with the constant "Missing", then
# one-hot encode (categories unseen during fit are ignored at transform time)
impute_cat =  SimpleImputer(strategy='constant', fill_value = "Missing")
# `sparse_output` replaces the `sparse` argument, which newer scikit-learn
# releases no longer accept; this matches the encoder built later in the notebook
encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)

cat_pipe = make_pipeline(impute_cat,encoder)
cat_pipe

In [None]:
# Fit the categorical pipeline on the selected training columns and show its output
cat_pipe.fit_transform(X_train[cat_selector(X_train)])

In [None]:
# Selector for the numeric columns; calling it lists the matches in X_train
num_selector = make_column_selector(dtype_include='number')
num_selector(X_train)

In [None]:
# Scaler applied to the numeric columns
scaler = StandardScaler()

In [None]:
# Fit the scaler on the selected numeric training columns and show its output
scaler.fit_transform(X_train[num_selector(X_train)])

In [None]:
# Combine both branches: cat_pipe on the object columns, scaler on the numeric columns
preprocessor = make_column_transformer((cat_pipe,cat_selector),
                                       (scaler,num_selector))
preprocessor

In [None]:
# Null count per column of df.
# NOTE(review): this inspects `df`, while the X_train used by the surrounding
# cells was built from `df_copy` — confirm which frame was meant.
df.isna().sum()

In [None]:
# Fit the full preprocessor on the training features and show the transformed output
preprocessor.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Define the features matrix and target from df_copy, the frame prepared for
# modeling. Using `df` here would silently bring back the columns dropped for
# modeling (including the Item_Identifier column).
X = df_copy.drop(columns ='Item_Outlet_Sales')
y = df_copy['Item_Outlet_Sales']
# Train test split; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
# Preview training data
X_train.head()

In [None]:
# Define the custom functions for regressoin evaluation
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  """Compute MAE, MSE, RMSE and R^2 for a set of predictions.

  Parameters
  ----------
  y_true : true target values.
  y_pred : predicted target values.
  label : text shown in the printed header and stored under 'Label'.
  verbose : if truthy, print the four metrics under a header.
  output_dict : if truthy, return the metrics as a dictionary.

  Returns
  -------
  dict with keys 'Label', 'MAE', 'MSE', 'RMSE', 'R^2' when `output_dict` is
  truthy; otherwise None.
  """
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # RMSE is the square root of the MSE computed above. Deriving it here avoids
  # a second pass over the data and the `squared=False` argument, which newer
  # scikit-learn releases deprecate and remove.
  # NOTE(review): assumes a single target column, as used in this notebook.
  rmse = mse ** 0.5
  r_squared = r2_score(y_true, y_pred)
  if verbose:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  """Report regression metrics of a fitted model on training and test data.

  Parameters
  ----------
  reg : fitted estimator or pipeline exposing `predict`.
  X_train, y_train : training features and target.
  X_test, y_test : test features and target.
  verbose : if truthy, print the metrics of both splits.
  output_frame : if truthy, return the metrics as a DataFrame.

  Returns
  -------
  DataFrame with one row per split ('Training Data', 'Test Data'), rounded to
  3 decimals, when `output_frame` is truthy; otherwise None.
  """
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  # Blank separator between the two reports; skipped when nothing is printed
  if verbose:
    print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  # Store results in a dataframe if output_frame is truthy
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to None to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [None]:
# Get list of numeric columns and instantiate a StandardScaler
num_cols = X_train.select_dtypes('number').columns
scaler = StandardScaler()
# Numeric nulls are filled with the column mean before scaling
num_imputer = SimpleImputer(strategy='mean')
# Construct the (name, transformer, columns) tuple for the column transformer
num_pipe = make_pipeline(num_imputer,scaler)
num_tuple = ('numeric',num_pipe, num_cols)
num_tuple



In [None]:
# Saving list of categorical (object-dtype) columns
cat_cols = X_train.select_dtypes('object').columns
# Constructing categorical preprocessing objects: fill nulls with the constant
# 'MISSING', then one-hot encode, ignoring categories unseen during fit
cat_imputer = SimpleImputer(strategy='constant', fill_value='MISSING')
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipe = make_pipeline(cat_imputer,ohe_encoder)
# (name, transformer, columns) tuple for the column transformer
cat_tuple = ('cat',cat_pipe, cat_cols)
cat_tuple


In [None]:
# Instantiate the preprocessor/ColumnTransformer from the numeric and
# categorical tuples; this rebinds the `preprocessor` name used earlier
preprocessor = ColumnTransformer([num_tuple, cat_tuple],
                                 verbose_feature_names_out=False)
preprocessor


In [None]:
# Fit the preprocessor on training data only
preprocessor.fit(X_train)
# Transform the training and test data with the fitted preprocessor
X_train_tf = preprocessor.transform(X_train)
X_test_tf = preprocessor.transform(X_test)
#X_train_tf.head()


In [None]:
# NOTE(review): LinearRegression is already imported in earlier cells;
# this import is a duplicate.
from sklearn.linear_model import LinearRegression
# Unfitted linear regression model
lin_reg = LinearRegression()
lin_reg


In [None]:
# Fit the model on the preprocessed training data
lin_reg.fit(X_train_tf, y_train)


In [None]:
# Get predictions for the preprocessed training data
y_predictions_train = lin_reg.predict(X_train_tf)
# Get predictions for the preprocessed testing data
y_predictions_test = lin_reg.predict(X_test_tf)
# NOTE(review): these two arrays do not appear to be used by the cells that
# follow (custom_evaluation recomputes the predictions) — confirm.


In [None]:
def custom_evaluation(lin_reg, X_train, y_train, X_test, y_test):
    """Return the tuple (training R^2, test R^2) for a fitted regressor.

    NOTE(review): the `X_train` and `X_test` arguments are never read. The
    predictions come from the notebook-level `X_train_tf` and `X_test_tf`
    arrays, so this function only works after those globals exist and always
    scores them, whatever the caller passes.
    """
    train_predictions = lin_reg.predict(X_train_tf)
    test_predictions = lin_reg.predict(X_test_tf)
    return r2_score(y_train, train_predictions), r2_score(y_test, test_predictions)



In [None]:
# lin_reg was fitted on the preprocessed arrays, so the preprocessed splits are
# the valid inputs to score it on (the raw X_train / X_test frames are not)
train_r2, test_r2 = custom_evaluation(lin_reg, X_train_tf, y_train, X_test_tf, y_test)

print("Training R-squared:", train_r2)
print("Test R-squared:", test_r2)

# Rule-of-thumb thresholds, adjust as needed. A strict ">" / "<" comparison
# labels every model as over- or underfitting, and a test score above the
# training score is not what underfitting means.
R2_GAP_TOLERANCE = 0.05   # train - test gap treated as overfitting
LOW_R2_THRESHOLD = 0.5    # both scores below this are treated as underfitting

if train_r2 - test_r2 > R2_GAP_TOLERANCE:
    print("The model might be overfitting.")
elif max(train_r2, test_r2) < LOW_R2_THRESHOLD:
    print("The model might be underfitting.")
else:
    print("The model's performance is balanced.")


In [None]:
from sklearn.ensemble import RandomForestRegressor  # Import RandomForestRegressor

# Instantiate default random forest model; random_state=42 makes it repeatable
rf = RandomForestRegressor(random_state = 42)
# Model pipeline: the preprocessor followed by the forest, so raw frames can be passed in
rf_pipe = make_pipeline(preprocessor, rf)


In [None]:
# Fit the model pipeline (preprocessor + forest) on the training data only
rf_pipe.fit(X_train, y_train)



In [None]:
# Use the custom function to print training and test metrics of the default model
evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)



In [None]:
# List the pipeline's parameters; the key names are the ones used in the tuning grid
rf_pipe.get_params()



In [None]:
# Define param grid with options to try. Keys are prefixed with the pipeline
# step name 'randomforestregressor'.
# `oob_score` is deliberately not searched: it only controls whether an
# out-of-bag score is computed after fitting, so including it doubled the
# number of candidates without changing any fitted model.
params = {'randomforestregressor__max_depth': [None,10,15,20],
          'randomforestregressor__n_estimators':[10,100,150,200],
          'randomforestregressor__min_samples_leaf':[2,3,4],
          'randomforestregressor__max_features':['sqrt','log2',None],
          }


In [None]:
from sklearn.model_selection import GridSearchCV  # Import GridSearchCV
# Instantiate the gridsearch over `params` with 3-fold cross-validation;
# n_jobs=-1 runs the fits in parallel and verbose=1 prints progress
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 3, verbose=1)
# Fit the gridsearch on training data
gridsearch.fit(X_train, y_train)



Fitting 3 folds for each of 288 candidates, totalling 864 fits


In [None]:


# Parameter combination selected by the grid search
gridsearch.best_params_

In [None]:

# NOTE(review): GridSearchCV is already imported in the grid-search cell; duplicate import.
from sklearn.model_selection import GridSearchCV  # Import GridSearchCV

# Take the best pipeline found by the grid search and evaluate it.
# NOTE(review): no explicit refit happens here; presumably best_estimator_ was
# already refit by GridSearchCV (its default behaviour) — confirm.
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train, y_train, X_test, y_test)

