In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import FeatureUnion, Pipeline 
from sklearn.model_selection import train_test_split
%matplotlib inline
from IPython.core.pylabtools import figsize
#figsize(15,20)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
df_train=pd.read_excel('./dataset/Data_Train.xlsx')

In [3]:
df_test=pd.read_excel('./dataset/Data_Test.xlsx')

In [4]:
df_train.head()
df_test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0,
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0,
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0,25.27 Lakh
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0,
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0,


In [5]:
to_drop = ['Name','New_Price','Location']

In [6]:
y=df_train['Price']

In [7]:
df_train = df_train.drop('Price',axis=1)

In [8]:
#Custom Transformer that extracts columns passed as argument to its constructor 
# https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, to_drop_columns ):
        self._to_drop_columns = to_drop_columns
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        Xt = X.drop(self._to_drop_columns,axis=1)
        return Xt

In [55]:
class NumPipeline( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self ):
        pass
    
    def extract_numeric_vals(self,X):
        X['Mileage']= pd.to_numeric(X['Mileage'].str.extract(r'^(\d*)', expand=False)) 
        X['Engine']=pd.to_numeric(X['Engine'].str.extract(r'^(\d*)', expand=False))
        X['Power']=pd.to_numeric(X['Power'].str.extract(r'^(\d*)', expand=False))
        return X
    
    def inpute_engine(self,cols):
        Engine=cols[0]
        Fuel_Type=cols[1]
        if pd.isnull(Engine):
            if Fuel_Type == 'CNG':
                return 998
            elif Fuel_Type == 'Diesel':
                return 1598
            elif Fuel_Type == 'Petrol':
                return 1198
            elif Fuel_Type == 'LPG':
                return 1061
            else:
                return 935
        else:
            return Engine
        
    def inpute_power(self,cols):
        Power=cols[0]
        Fuel_Type=cols[1]
        if pd.isnull(Power):
            if Fuel_Type == 'CNG':
                return 58
            elif Fuel_Type == 'Diesel':
                return 108
            elif Fuel_Type == 'Petrol':
                return 83
            elif Fuel_Type == 'LPG':
                return 58
            else:
                return 57
        else:
            return Power
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        X=self.extract_numeric_vals(X)
        X['Power'] = X[['Power','Fuel_Type']].apply(self.inpute_power,axis=1)
        X['Engine'] = X[['Engine','Fuel_Type']].apply(self.inpute_engine,axis=1)
        X['Seats'] = X['Seats'].replace(0.0,5.0,inplace=True)
        X['Seats'].fillna(5,inplace=True)
        X.fillna(X.mean(),inplace=True)
        print(X.head(2))
        return X

In [56]:
class CatPipeline( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self ):
        pass
    
   
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        owner_type=pd.get_dummies(X['Owner_Type'],drop_first=True)
        fuel_type=pd.get_dummies(X['Fuel_Type'],drop_first=True)
        transmission_type=pd.get_dummies(X['Transmission'],drop_first=True)
        Xt=pd.concat([X,fuel_type,owner_type,transmission_type],axis=1)
        Xt = Xt.drop(['Fuel_Type','Owner_Type','Transmission'],axis=1)
        return Xt

In [57]:
initial_pipeline = Pipeline( steps = [ ( 'feature_selector', FeatureSelector(to_drop_columns=to_drop)) ])
cat_pipeline = Pipeline( steps = [ ( 'cat_pipeline', CatPipeline()) ])
num_pipeline = Pipeline( steps = [ ( 'num_pipeline', NumPipeline()) ])

In [62]:
full_pipeline = FeatureUnion( transformer_list = [ ( 'feature_selector', initial_pipeline ),
                                                  ( 'num_pipeline', num_pipeline ),
                                                   ( 'cat_pipeline', cat_pipeline )] )


In [63]:
X_train, X_test, y_train, y_test = train_test_split( df_train, y , test_size = 0.2 , random_state = 42 )

In [64]:
full_pipeline_m = Pipeline( steps = [ ( 'full_pipeline', full_pipeline), ( 'model', RandomForestRegressor() ) ] )

In [65]:
full_pipeline_m.fit( X_train, y_train )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.ht

                                 Name    Location  Year  Kilometers_Driven  \
4248                Chevrolet Beat LS       Delhi  2011              76155   
4129  Ford EcoSport 1.5 TDCi Titanium  Coimbatore  2014              64637   

     Fuel_Type Transmission Owner_Type  Mileage  Engine  Power  Seats  \
4248    Petrol       Manual      First     18.0  1199.0   79.0      5   
4129    Diesel       Manual      First     22.0  1498.0   98.0      5   

     New_Price  
4248       NaN  
4129       NaN  


ValueError: could not convert string to float: 'Petrol'