<a href="https://colab.research.google.com/github/divyam123-EECS-Physics/Water-Bottle-Price-Prediction/blob/main/Pipeline_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import pointbiserialr
from sklearn.metrics import matthews_corrcoef
pd.pandas.set_option('display.max_columns', None)

In [None]:
base_file = '/content/drive/MyDrive/TFProject/'
train_df_path = base_file + 'water-bottle-train-set.csv'
test_df_path = base_file + 'water-bottle-test-set.csv'

In [None]:
class missing_values(BaseEstimator, TransformerMixin):
  
  def __init__(self):
    self.non_ohe_features = ['Brand', 'Water Source', 'Container', 'Flavor', 'Pack Size', 'Price', 'Beverage Size']
    self.ohe_features = []
  def fit(self, X, y = None):

    self.ohe_features = [col for col in X.columns if (col not in self.non_ohe_features and col not in ['Name', 'Falvor','Preservatice Free'])]
    return self

  def transform(self, X, y = None):
    print('mv')
    if ('Name' in X.columns.unique()) and ('Falvor' in X.columns.unique()) and ('Preservatice Free' in X.columns.unique()):
      X = X.drop(columns = ['Name', 'Falvor','Preservatice Free'])

    if 'Water Source' in X.columns.unique():
      X['Water Source'] = X['Water Source'].fillna('local')

    if 'Flavor' in X.columns.unique():
      X['Flavor'] = X['Flavor'].fillna('None')
    for feature in self.ohe_features:
      if (feature in X.columns.unique()) and X[feature].isnull().sum() >= 1:
        X[feature] = X[feature].fillna(0)
    
    print(X.isnull().sum())    
    
    return X

In [None]:
class DropSimilarOHEFeatures(BaseEstimator, TransformerMixin):
  
  def __init__(self, threshold = 0.85):
    self.features_to_remove = []
    self.threshold = threshold
    self.non_ohe_features = ['Brand', 'Water Source', 'Container', 'Flavor', 'Pack Size', 'Price', 'Beverage Size']
    
  def fit(self, X, y = None):
    ohe_bool_features = [col for col in X.columns if (col not in self.non_ohe_features)]
    col_corr = dict()
    corr_matrix = X[ohe_bool_features].corr(matthews_corrcoef)
    if self.features_to_remove:
      for i in range(len(corr_matrix.columns)):
          for j in range(i):
              if abs(corr_matrix.iloc[i, j]) > self.threshold: # we are interested in absolute coeff value
                  colname = corr_matrix.columns[i]  # getting the name of column
                  colname2 = corr_matrix.columns[j]
                  col_corr[colname] = colname2
      self.features_to_remove = col_corr 
    return self

  def transform(self, X, y = None):
    for feature in self.features_to_remove:
      if feature in X.columns.unique():
        X = X.drop(columns = [feature])
    print('dsf')
    #print(X.isnull().sum())    

    return X

In [None]:
class DropRedundantOHEfeatures(BaseEstimator, TransformerMixin):
  
  def __init__(self, KBest = 10):
    self.features_to_remove = []
    self.non_ohe_features = ['Brand', 'Water Source', 'Container', 'Flavor', 'Pack Size', 'Price','Beverage Size']
    self.KBest = KBest
  def fit(self, X, y):
    print('drf')
    ohe_bool_features = [col for col in X.columns if ((col not in self.non_ohe_features))]
    temp = X.copy()
    temp['Price'] = y
    p_val_dict = dict()
    if self.features_to_remove == []:  
      for feature in ohe_bool_features:

        dicot_data = temp[feature]
        cont_data = temp['Price']
        corr, p_val = pointbiserialr(dicot_data, cont_data)
        p_val_dict[feature] = p_val 
      sorted_p_val_dict = sorted(p_val_dict.items(), key = lambda kv: kv[1])
      remove_p_vap_dict = sorted_p_val_dict[self.KBest:]
      self.features_to_remove = [col[0] for col in remove_p_vap_dict]

    return self

  def transform(self, X, y = None):
    print('drf')
    for feature in self.features_to_remove:
      if feature in X.columns.unique():
        X = X.drop(columns = [feature])
    return X

In [None]:
class MeanEncoder(BaseEstimator, TransformerMixin):
  
  def __init__(self):
    self.non_ohe_features = ['Brand', 'Water Source', 'Container', 'Flavor', 'Pack Size', 'Beverage Size']
    self.mapping_list = []

  def fit(self, X, y):
    temp = X.copy()
    temp['Price'] = y
    for feature in self.non_ohe_features:
      if feature in temp.columns.unique():
        mean_encoded_subject = temp.groupby([feature])['Price'].mean().to_dict()
        self.mapping_list.append((feature, mean_encoded_subject))

    return self

  def transform(self, X, y = None):
    print('me')
    temp = X.copy()
    temp['Price'] = y
    for feature, mean_encoded_subject in self.mapping_list:
      if feature in temp.columns.unique():
        a = temp[feature].unique().tolist()
        b = mean_encoded_subject.keys()
        c = [d for d in a if d not in b]
        if c != []:
          for element in c:
            mean_encoded_subject[element] = temp[temp[feature] == element]['Price'].mean()
        temp[feature + 'Mean_Encoded'] = temp[feature].map(mean_encoded_subject)
        temp = temp.drop(columns = [feature])
    return temp.drop(columns = ['Price'])

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler

In [None]:
train_df = pd.read_csv(train_df_path)

X_train = train_df.loc[:, train_df.columns != 'Price']
y_train = np.log(train_df['Price'])

test_df = pd.read_csv(test_df_path)

X_test = test_df.loc[:, test_df.columns != 'Price']
y_test = np.log(test_df['Price'])


In [None]:
for col in test_df.columns:
  if test_df[col].isnull().sum() >= 1:
    print(col)

Water Source
Falvor
Flavor
No Artificial Preservatives
Preservative FREE
No Artificial Flavors
No Artificial Sweeteners


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.decomposition import PCA

In [None]:
from sklearn.svm import SVR

In [None]:
from sklearn.ensemble import RandomForestRegressor


In [None]:
lr = Pipeline([('missing_vals', missing_values()),
               ('drop_similar', DropSimilarOHEFeatures()),
               ('drop_redundant', DropSimilarOHEFeatures()),
               ('me', MeanEncoder()),
               ('Min_Max', MinMaxScaler()),
               ('Model', LinearRegression())
             ])
               
svr = Pipeline([('missing_vals', missing_values()),
               ('drop_similar', DropSimilarOHEFeatures()),
               ('drop_redundant', DropSimilarOHEFeatures()),
               ('me', MeanEncoder()),
               ('Min_Max', MinMaxScaler()),
               ('Model', SVR()),
             ])
               

rf = Pipeline([('missing_vals', missing_values()),
               ('drop_similar', DropSimilarOHEFeatures()),
               ('drop_redundant', DropSimilarOHEFeatures()),
               ('me', MeanEncoder()),
               ('Min_Max', MinMaxScaler()),
               ('Model', RandomForestRegressor()),
             ])


# xgb = Pipeline([('missing_vals', missing_values()),
#                ('drop_similar', DropSimilarOHEFeatures()),
#                ('drop_redundant', DropSimilarOHEFeatures()),
#                ('me', MeanEncoder()),
#                ('Min_Max', MinMaxScaler()),
#                ('pca', PCA(n_components = 0.95)),
#                ('Model', RandomForestRegressor()),
#              ])

In [None]:
pipelines = [lr, svr, rf]

In [None]:
# mv = missing_values()
# ds = DropSimilarOHEFeatures()
# dr = DropSimilarOHEFeatures()
# me = MeanEncoder()

In [None]:
# X_train = mv.fit_transform(X_train, y_train)
# X_test = mv.transform(X_test, y_test)
# X_train.isnull().sum(), X_test.isnull().sum()

In [None]:
# X_train = ds.fit_transform(X_train, y_train)
# X_test = ds.transform(X_test, y_test)
# X_train.isnull().sum(), X_test.isnull().sum()

In [None]:
# X_train = dr.fit_transform(X_train, y_train)
# X_test = dr.transform(X_test, y_test)
# X_train.isnull().sum(), X_test.isnull().sum()

In [None]:
# me = MeanEncoder()
# X_train = me.fit_transform(X_train, y_train)
# X_test = me.transform(X_test, y_test)
# X_train.isnull().sum(), X_test.isnull().sum()

In [None]:
# bb = LinearRegression()
# bb.fit(X_train, y_train)
# bb.score(X_test, y_test)

In [None]:
train_df = pd.read_csv(train_df_path)

X_train = train_df.loc[:, train_df.columns != 'Price']
y_train = np.log(train_df['Price'])

test_df = pd.read_csv(test_df_path)

X_test = test_df.loc[:, test_df.columns != 'Price']
y_test = np.log(test_df['Price'])


In [None]:
# me = MeanEncoder()
# X_train = me.fit_transform(X_train, y_train)
# X_test = me.transform(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
for pipe in pipelines:
  pipe.fit(X_train, y_train)
  print(pipe.score(X_test, y_test))

mv
Unnamed: 0                     0
Brand                          0
Pack Size                      0
Water Source                   0
Container                      0
100% Natural                   0
Anti-Oxidant                   0
Artesian                       0
BPA Free Plastic               0
Caffeinated                    0
Carbonated                     0
Electrolytes                   0
Energy Drinks                  0
Enhanced                       0
Ethically Sourced              0
Fair Trade                     0
Flavored                       0
Gluten FREE                    0
Green Tea                      0
Kosher                         0
Low Calorie                    0
Mineral Water                  0
No Artifical Flavors           0
No Artifical Preservatives     0
No Artifical Sweetners         0
Non-Alcoholic Wine             0
Non GMO                        0
Non-Sparkling Water            0
Nutritional                    0
Organic                        0
Organic

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


dsf
dsf
me
mv
Unnamed: 0                     0
Brand                          0
Pack Size                      0
Water Source                   0
Container                      0
100% Natural                   0
Anti-Oxidant                   0
Artesian                       0
BPA Free Plastic               0
Caffeinated                    0
Carbonated                     0
Electrolytes                   0
Energy Drinks                  0
Enhanced                       0
Ethically Sourced              0
Fair Trade                     0
Flavored                       0
Gluten FREE                    0
Green Tea                      0
Kosher                         0
Low Calorie                    0
Mineral Water                  0
No Artifical Flavors           0
No Artifical Preservatives     0
No Artifical Sweetners         0
Non-Alcoholic Wine             0
Non GMO                        0
Non-Sparkling Water            0
Nutritional                    0
Organic                      

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


dsf
dsf
me
mv
Unnamed: 0                     0
Brand                          0
Pack Size                      0
Water Source                   0
Container                      0
100% Natural                   0
Anti-Oxidant                   0
Artesian                       0
BPA Free Plastic               0
Caffeinated                    0
Carbonated                     0
Electrolytes                   0
Energy Drinks                  0
Enhanced                       0
Ethically Sourced              0
Fair Trade                     0
Flavored                       0
Gluten FREE                    0
Green Tea                      0
Kosher                         0
Low Calorie                    0
Mineral Water                  0
No Artifical Flavors           0
No Artifical Preservatives     0
No Artifical Sweetners         0
Non-Alcoholic Wine             0
Non GMO                        0
Non-Sparkling Water            0
Nutritional                    0
Organic                      

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


dsf
dsf
me
mv
Unnamed: 0                     0
Brand                          0
Pack Size                      0
Water Source                   0
Container                      0
100% Natural                   0
Anti-Oxidant                   0
Artesian                       0
BPA Free Plastic               0
Caffeinated                    0
Carbonated                     0
Electrolytes                   0
Energy Drinks                  0
Enhanced                       0
Ethically Sourced              0
Fair Trade                     0
Flavored                       0
Gluten FREE                    0
Green Tea                      0
Kosher                         0
Low Calorie                    0
Mineral Water                  0
No Artifical Flavors           0
No Artifical Preservatives     0
No Artifical Sweetners         0
Non-Alcoholic Wine             0
Non GMO                        0
Non-Sparkling Water            0
Nutritional                    0
Organic                      

In [None]:
params = {}
rf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'missing_vals', 'drop_similar', 'drop_redundant', 'me', 'Min_Max', 'Model', 'drop_similar__threshold', 'drop_redundant__threshold', 'Min_Max__copy', 'Min_Max__feature_range', 'Model__bootstrap', 'Model__ccp_alpha', 'Model__criterion', 'Model__max_depth', 'Model__max_features', 'Model__max_leaf_nodes', 'Model__max_samples', 'Model__min_impurity_decrease', 'Model__min_impurity_split', 'Model__min_samples_leaf', 'Model__min_samples_split', 'Model__min_weight_fraction_leaf', 'Model__n_estimators', 'Model__n_jobs', 'Model__oob_score', 'Model__random_state', 'Model__verbose', 'Model__warm_start'])

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [2,4]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [None]:
params = {}

In [None]:
# params['drop_similar__threshold'] = [0.8,0.85,0.9]
# params['drop_redundant__threshold'] = [0.8,0.85,0.9]
params['n_estimators'] = n_estimators
params['max_features'] = max_features
params['min_samples_split'] = min_samples_split
params['min_samples_leaf'] = min_samples_leaf
params['bootstrap'] = bootstrap

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
rf_reg = RandomForestRegressor()

In [None]:
rf_reg

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
pip = Pipeline([('missing_vals', missing_values()),
               ('drop_similar', DropSimilarOHEFeatures()),
               ('drop_redundant', DropSimilarOHEFeatures()),
               ('me', MeanEncoder()),
               ('Min_Max', MinMaxScaler()),
          ])

In [None]:
rf_Grid = GridSearchCV(estimator = rf_reg, param_grid = params, cv = 3, verbose=2, n_jobs = -1)

In [None]:
X_train = pip.fit_transform(X_train, y_train)

mv
Unnamed: 0                     0
Brand                          0
Pack Size                      0
Water Source                   0
Container                      0
100% Natural                   0
Anti-Oxidant                   0
Artesian                       0
BPA Free Plastic               0
Caffeinated                    0
Carbonated                     0
Electrolytes                   0
Energy Drinks                  0
Enhanced                       0
Ethically Sourced              0
Fair Trade                     0
Flavored                       0
Gluten FREE                    0
Green Tea                      0
Kosher                         0
Low Calorie                    0
Mineral Water                  0
No Artifical Flavors           0
No Artifical Preservatives     0
No Artifical Sweetners         0
Non-Alcoholic Wine             0
Non GMO                        0
Non-Sparkling Water            0
Nutritional                    0
Organic                        0
Organic

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


dsf
dsf
me


In [None]:
rf_Grid.fit(X_train, y_train)

Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   39.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [None]:
rf_Grid.best_params_

{'bootstrap': True,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 64}

In [None]:

final_pipeline = Pipeline([('missing_vals', missing_values()),
                            ('drop_similar', DropSimilarOHEFeatures()),
                            ('drop_redundant', DropSimilarOHEFeatures()),
                            ('me', MeanEncoder()),
                            ('Min_Max', MinMaxScaler()),
                            ('Model', RandomForestRegressor(bootstrap=True, max_features='auto',min_samples_leaf=1,min_samples_split = 2, n_estimators=64)),
                          ])

In [None]:
train_df = pd.read_csv(train_df_path)

X_train = train_df.loc[:, train_df.columns != 'Price']
y_train = np.log(train_df['Price'])

test_df = pd.read_csv(test_df_path)

X_test = test_df.loc[:, test_df.columns != 'Price']
y_test = np.log(test_df['Price'])


In [None]:
final_pipeline.fit(X_train, y_train)

mv
Unnamed: 0                     0
Brand                          0
Pack Size                      0
Water Source                   0
Container                      0
100% Natural                   0
Anti-Oxidant                   0
Artesian                       0
BPA Free Plastic               0
Caffeinated                    0
Carbonated                     0
Electrolytes                   0
Energy Drinks                  0
Enhanced                       0
Ethically Sourced              0
Fair Trade                     0
Flavored                       0
Gluten FREE                    0
Green Tea                      0
Kosher                         0
Low Calorie                    0
Mineral Water                  0
No Artifical Flavors           0
No Artifical Preservatives     0
No Artifical Sweetners         0
Non-Alcoholic Wine             0
Non GMO                        0
Non-Sparkling Water            0
Nutritional                    0
Organic                        0
Organic

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


dsf
dsf
me


Pipeline(memory=None,
         steps=[('missing_vals', missing_values()),
                ('drop_similar', DropSimilarOHEFeatures(threshold=0.85)),
                ('drop_redundant', DropSimilarOHEFeatures(threshold=0.85)),
                ('me', MeanEncoder()),
                ('Min_Max', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('Model',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=64, n_jobs=None,
       

In [None]:
final_pipeline.score(X_test, y_test)

mv
Unnamed: 0                     0
Brand                          0
Pack Size                      0
Water Source                   0
Container                      0
100% Natural                   0
Anti-Oxidant                   0
Artesian                       0
BPA Free Plastic               0
Caffeinated                    0
Carbonated                     0
Electrolytes                   0
Energy Drinks                  0
Enhanced                       0
Ethically Sourced              0
Fair Trade                     0
Flavored                       0
Gluten FREE                    0
Green Tea                      0
Kosher                         0
Low Calorie                    0
Mineral Water                  0
No Artifical Flavors           0
No Artifical Preservatives     0
No Artifical Sweetners         0
Non-Alcoholic Wine             0
Non GMO                        0
Non-Sparkling Water            0
Nutritional                    0
Organic                        0
Organic

0.9654210353617324