In [23]:
import numpy as np
import pandas as pd

In [25]:
# Load the tips dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv('tips.csv')

In [27]:
# Display the loaded frame; the rendered table elides the middle rows.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [29]:
# Preview the first five rows of the dataset.
df.head(n=5)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier 
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2

In [33]:
# Hold out 20% of the rows for evaluation. `size` is the prediction target and
# every other column is used as a feature; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('size', axis=1),
    df.loc[:, 'size'],
    test_size=0.2,
    random_state=42,
)

In [35]:
# Preview the training features (the `size` target column has been removed).
X_train.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time
228,13.28,2.72,Male,No,Sat,Dinner
208,24.27,2.03,Male,Yes,Sat,Dinner
96,27.28,4.0,Male,Yes,Fri,Dinner
167,31.71,4.5,Male,No,Sun,Dinner
84,15.98,2.03,Male,No,Thur,Lunch


In [37]:
# Count the missing values in each column to decide which ones need imputing.
df.isna().sum()

total_bill     0
tip           10
sex            0
smoker         0
day            0
time          14
size           0
dtype: int64

In [39]:
# Step 1 - imputation. Positions refer to the columns of X_train:
#   0: total_bill, 1: tip, 2: sex, 3: smoker, 4: day, 5: time
# `tip` (numeric) is filled with the column mean and `time` (categorical) with
# its most frequent value; all other columns pass through unchanged.
# NOTE: ColumnTransformer places the transformed columns first and appends the
# passthrough remainder, so the output order becomes
#   0: tip, 1: time, 2: total_bill, 3: sex, 4: smoker, 5: day
trf1 = ColumnTransformer(
    transformers=[
        ('impute_tip', SimpleImputer(strategy='mean'), [1]),
        ('impute_time', SimpleImputer(strategy='most_frequent'), [5]),
    ],
    remainder='passthrough',
)

In [41]:
trf2 = ColumnTransformer([
    ('ohe_tip_time',OneHotEncoder(sparse=False,handle_unknown='ignore'),[0,5])
],remainder='passthrough')

In [43]:
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [47]:
# Step 4 - feature selection: keep the 6 features with the highest chi-squared
# score against the target.
trf4 = SelectKBest(chi2, k=6)

In [49]:
# Step 5 - the final estimator. The seed is fixed (same value as the train/test
# split) so that repeated runs of the notebook build the same tree and report
# the same scores.
trf5 = DecisionTreeClassifier(random_state=42)

### Now we will create the Pipeline

In [55]:
# Chain the five steps into one estimator with explicitly named steps:
# imputation -> one-hot encoding -> scaling -> feature selection -> classifier.
pipe = Pipeline(
    steps=[
        ('trf1', trf1),
        ('trf2', trf2),
        ('trf3', trf3),
        ('trf4', trf4),
        ('trf5', trf5),
    ]
)

In [57]:
# Alternative syntax: make_pipeline chains the same steps but generates the
# step names automatically. This rebinds `pipe`, so the cells below use this
# object.
pipe = make_pipeline(
    trf1,
    trf2,
    trf3,
    trf4,
    trf5,
)

In [59]:
# Fit every preprocessing step and the classifier on the training split only.
pipe.fit(X_train, y=y_train)



In [61]:
# Ask scikit-learn to render estimators as diagrams when they are displayed.
# NOTE(review): this cell comes after the pipe.fit(...) cell, so it does not
# affect how that cell's result was rendered.
from sklearn import set_config
set_config(display='diagram')

In [63]:
# Predict the `size` label for every row of the held-out test split.
y_pred = pipe.predict(X=X_test)

In [65]:
# Inspect the predicted `size` labels for the test rows.
y_pred

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2], dtype=int64)

### Now we will find the accuracy of our model

In [68]:
from sklearn.metrics import accuracy_score

In [70]:
# Share of test rows whose predicted label matches the true one, in percent.
100 * accuracy_score(y_true=y_test, y_pred=y_pred)

61.224489795918366

In [74]:
# A single train/test split gives a noisy estimate, so run 5-fold
# cross-validation on the training data and report the mean fold accuracy as a
# percentage.
from sklearn.model_selection import cross_val_score
fold_accuracies = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
fold_accuracies.mean() * 100



63.5897435897436

### To deploy the model (for example behind a website), we save the fitted pipeline with Python's built-in `pickle` module

In [82]:
import pickle

# Serialize the fitted pipeline (preprocessing + model) to disk. The context
# manager guarantees the file is flushed and closed even if pickling fails;
# passing a bare open(...) to pickle.dump leaves the handle open.
# NOTE: only load this file back with pickle in an environment you trust -
# unpickling executes arbitrary code.
with open('projectpipline.pkl','wb') as model_file:
    pickle.dump(pipe,model_file)