In [90]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

#### Prepare Data

In [91]:
data = pd.read_csv('Salary_dataset.csv')
# separate the dependent and the independent
X_data = data[['YearsExperience']]
y_data = data[['Salary']]
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test =  train_test_split(X_test,  y_test, train_size=0.5, random_state=42)

#### Create pipeline

In [92]:
imputer = SimpleImputer(strategy='mean')
lr = LinearRegression()
pipeline1 = make_pipeline(imputer, lr)

#### Train model using pipeline

In [93]:
pipeline1.fit(X_train,y_train)

In [94]:
pipeline1.score(X_train, y_train)

0.9591684852038581

In [95]:
pipeline1.score(X_val, y_val)

0.9344022944530951

In [96]:
pipeline1.named_steps.simpleimputer.statistics_

array([5.27222222])

In [97]:
pipeline1.named_steps.linearregression.coef_

array([[9284.87835182]])

#### More advanced pipeline

In [98]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import  ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

#### Import data

In [99]:
car_data = pd.read_csv('cardekho_data.csv')
print(car_data.shape)
car_data.head()

(301, 9)


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [100]:
bin_labels = ['Cheap', 'Average', 'High', 'Expensive']
car_data['Valuation'] = pd.cut(car_data['Present_Price'], bins=len(bin_labels), labels=bin_labels)
car_data['Valuation'] = [bin_labels.index(value) for value in car_data['Valuation']]
car_data.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Valuation
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,0


In [101]:
X_data = car_data.iloc[:,[0,1,4,5]]
y_data = car_data.iloc[:,-1]
y_data.head()

0    0
1    0
2    0
3    0
4    0
Name: Valuation, dtype: int64

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test =  train_test_split(X_test,  y_test, train_size=0.5, random_state=42)

In [103]:
numerical_cols = ['Kms_Driven']
categorical_cols = ['Car_Name','Year','Fuel_Type']

In [104]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

In [105]:
category_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [106]:
# define column transformer
column_transformer =  ColumnTransformer(transformers=[
    ('numerical_pipeline', numerical_pipeline, numerical_cols),
    ('category_pipeline', category_pipeline, categorical_cols)
],
    remainder='drop',
    n_jobs=-1
)

In [107]:
dtc_classifier = DecisionTreeClassifier()
dtc_pipeline = make_pipeline(column_transformer, dtc_classifier)
dtc_pipeline.fit(X_train, y_train)

In [108]:
dtc_pipeline.score(X_test, y_test)

0.9672131147540983

In [109]:
dtc_pipeline.score(X_val, y_val)

0.9833333333333333

In [110]:
import joblib
joblib.dump(dtc_pipeline, 'pipe.joblib')

['pipe.joblib']

In [115]:
# load pipeline
pipeline2 = joblib.load('pipe.joblib')
pipeline2.predict(X_val)
print(X_val)

                      Car_Name  Year  Kms_Driven Fuel_Type
73                  etios liva  2011       43000    Petrol
101         UM Renegade Mojave  2017        1400    Petrol
298                       city  2009       87934    Petrol
125  Royal Enfield Classic 500  2009       40000    Petrol
164       Hero Splender iSmart  2016       14000    Petrol
173                  Activa 4g  2017        1300    Petrol
203                        i10  2011       53460    Petrol
198         Bajaj Discover 125  2011       35000    Petrol
97               corolla altis  2017        8700    Petrol
24                     wagon r  2013       56879    Petrol
15                      ertiga  2016       43000    Diesel
118  Royal Enfield Classic 350  2015       26000    Petrol
232                    elantra  2015       12900    Petrol
225                        i10  2011       22517    Petrol
196                  Activa 3g  2008      500000    Petrol
223                      verna  2015       61381    Dies

In [114]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import numpy as np

# Create a logistic regression model
logreg = LogisticRegression()

# Create a pipeline with just the logistic regression model
pipeline = Pipeline([
    ('logreg', logreg)
])

# Sample input data
input_data = np.random.rand(100, 10)  # 100 samples, 10 features

# Batch size for inference
batch_size = 20

# Batch inference
output_data = []
for i in range(0, len(input_data), batch_size):
    batch = input_data[i:i+batch_size]
    output_batch = pipeline.predict_proba(batch)  # Get probabilities for each class
    output_data.append(output_batch)

output_data = np.concatenate(output_data, axis=0)

print(output_data.shape) 

NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.