# Chicago Taxi Tips Regression Pipeline (Local)

In [1]:
from sklearn.model_selection import train_test_split

#### Create datasets compatible with xgboost model

In [2]:
def create_sets(data, label_column, test_size=0.2, random_state=100):
    """Split a DataFrame into train and test ``xgboost.DMatrix`` objects.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the feature columns and the label column.
    label_column : str
        Name of the column in ``data`` to use as the label.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.2, the value that
        used to be hard-coded.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 100, the value
        that used to be hard-coded.

    Returns
    -------
    tuple
        ``(dtrain, dtest)``: the train and test ``xgboost.DMatrix`` objects,
        each built from its feature split with the matching labels attached.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of ``data`` (raised by the
        column lookup).
    """
    # Separate the label from the features; ``drop`` returns a new frame,
    # so the caller's DataFrame is left untouched.
    label = data[label_column]
    features = data.drop(columns=[label_column])

    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=test_size, random_state=random_state
    )

    # Wrap each split in the matrix type consumed by ``xgboost.train``.
    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dtest = xgboost.DMatrix(X_test, label=y_test)
    return dtrain, dtest
    

#### Create a callback to use XGBoost with TensorBoard

In [3]:
!pip install tensorboardX



In [4]:
from tensorboardX import SummaryWriter

In [5]:
def TensorBoardCallback(training_log_path):
    """Build a training callback that logs evaluation metrics with a SummaryWriter.

    Parameters
    ----------
    training_log_path : str
        Passed to ``SummaryWriter``; the location where the event files
        are written.

    Returns
    -------
    callable
        ``callback(env)``. For every ``(name, value)`` pair in
        ``env.evaluation_result_list`` it prints the pair and records it as
        a scalar at step ``env.iteration``, then flushes the writer.
    """
    writer = SummaryWriter(training_log_path)

    def callback(env):
        for metric_name, metric_value in env.evaluation_result_list:
            print(metric_name, metric_value)
            writer.add_scalar(metric_name, metric_value, env.iteration)
        # The kernel outlives the training call and the writer is never
        # closed, so flush after each round to make the metrics visible in
        # TensorBoard without waiting for the writer's own periodic flush.
        writer.flush()

    return callback

#### Create the main train function

In [6]:
!pip install xgboost==1.1.0



In [7]:
import xgboost

In [8]:
def xgboost_train(data, label, num_iterations, training_log_path, booster_params):
    """Train an xgboost model on ``data`` and log its metrics for TensorBoard.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the features and the label column; split into
        train and test sets by ``create_sets``.
    label : str
        Name of the label column in ``data``.
    num_iterations : int or None
        Number of boosting rounds; a falsy value (``None`` or ``0``) falls
        back to 20.
    training_log_path : str
        Location handed to ``TensorBoardCallback`` for the metric logs.
    booster_params : dict or None
        Booster parameters. Missing keys are filled with the defaults below.
        The caller's dict is never modified.

    Returns
    -------
    The trained model returned by ``xgboost.train``.
    """
    # Build the train/test matrices.
    dtrain, dtest = create_sets(data, label)

    # Work on a copy: ``setdefault`` would otherwise write the defaults into
    # the dict object owned by the caller.
    params = dict(booster_params or {})
    params.setdefault('objective', 'reg:squarederror')
    params.setdefault('booster', 'gbtree')
    params.setdefault('learning_rate', 0.3)
    params.setdefault('min_split_loss', 0)
    params.setdefault('max_depth', 6)

    num_iterations = num_iterations or 20

    # Train, evaluating on both splits each round; the callback forwards the
    # evaluation results to the TensorBoard log.
    model = xgboost.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_iterations,
        evals=[(dtrain, 'train'), (dtest, 'test')],
        callbacks=[TensorBoardCallback(training_log_path)],
    )
    return model

### Test the training

##### Create minio client to get data

In [9]:
!pip install pyarrow



In [10]:
from minio import Minio
import urllib3
from io import BytesIO
import pandas as pd
import pyarrow
import datetime
import os

In [11]:
## Build the MinIO client; the access key and secret key are read from
## environment variables.
# Retry policy used by the HTTP connection pool for 500/502/503/504 responses.
retry_policy = urllib3.Retry(
    total=5,
    backoff_factor=0.2,
    status_forcelist=[500, 502, 503, 504],
)
client = Minio(
    "storage-api.course.aiengineer.codex-platform.com",
    access_key=os.getenv("MINIO-ACCESS-KEY"),
    secret_key=os.getenv("MINIO-SECRET-KEY"),
    secure=True,
    http_client=urllib3.PoolManager(retries=retry_policy),
)

In [12]:
bucket_name = ''  # TODO: fill in your bucket name (original hint: firstname-name)
object_name = 'datasets/chicago/trips.parquet'  # object key passed to client.get_object below

In [13]:
# Get data from MinIO using get_object, wrap the bytes in BytesIO and read the
# parquet payload with pandas.
# The request is made *before* the try block: if get_object raises, there is no
# response to clean up, and the original error is no longer masked by a
# NameError on `response` in the finally clause.
response = client.get_object(bucket_name, object_name)
try:
    # Read data from response.
    parquet_object = BytesIO(response.data)
    data = pd.read_parquet(parquet_object)
finally:
    # Always close the response and release its connection, even if parsing fails.
    response.close()
    response.release_conn()

In [14]:
data.head()

Unnamed: 0,tips,trip_start_timestamp,trip_seconds,trip_miles,pickup_community_area,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_community_area,fare,tolls,extras,trip_total
0,0.0,2023-01-31T23:45:00.000,1719,15.1,76.0,41.980264,-87.913625,7.0,39.25,0.0,5.0,44.25
1,0.0,2023-01-31T23:45:00.000,1339,18.9,76.0,41.980264,-87.913625,33.0,46.75,0.0,4.0,50.75
2,0.0,2023-01-31T23:45:00.000,306,0.69,8.0,41.899602,-87.633308,8.0,5.5,0.0,0.0,5.5
3,0.0,2023-01-31T23:45:00.000,3664,13.94,76.0,41.980264,-87.913625,76.0,41.25,0.0,6.5,47.75
4,5.12,2023-01-31T23:45:00.000,5,0.0,,,,,20.0,0.0,0.0,25.62


In [15]:
# Drop the timestamp column, then every row that still has a missing value.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# column is already gone and drop() would otherwise raise KeyError.
data = data.drop(columns=["trip_start_timestamp"], errors="ignore").dropna()

In [19]:
# The log directory is suffixed with the current timestamp.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
xgboost_train(
    data=data,
    label='tips',
    num_iterations=20,
    training_log_path=f"training/fit/taxi-trips/{run_stamp}",
    booster_params={},
)

train-rmse 2.669143
test-rmse 2.652077
[0]	train-rmse:2.66914	test-rmse:2.65208
train-rmse 2.071043
test-rmse 2.124333
[1]	train-rmse:2.07104	test-rmse:2.12433
train-rmse 1.664521
test-rmse 1.784089
[2]	train-rmse:1.66452	test-rmse:1.78409
train-rmse 1.356828
test-rmse 1.537212
[3]	train-rmse:1.35683	test-rmse:1.53721
train-rmse 1.107906
test-rmse 1.345601
[4]	train-rmse:1.10791	test-rmse:1.34560
train-rmse 0.957096
test-rmse 1.242852
[5]	train-rmse:0.95710	test-rmse:1.24285
train-rmse 0.834758
test-rmse 1.167556
[6]	train-rmse:0.83476	test-rmse:1.16756
train-rmse 0.7412
test-rmse 1.093985
[7]	train-rmse:0.74120	test-rmse:1.09398
train-rmse 0.670638
test-rmse 1.056987
[8]	train-rmse:0.67064	test-rmse:1.05699
train-rmse 0.61144
test-rmse 1.027255
[9]	train-rmse:0.61144	test-rmse:1.02726
train-rmse 0.558424
test-rmse 1.007237
[10]	train-rmse:0.55842	test-rmse:1.00724
train-rmse 0.519311
test-rmse 0.982687
[11]	train-rmse:0.51931	test-rmse:0.98269
train-rmse 0.487045
test-rmse 0.972495
[1

<xgboost.core.Booster at 0x7f65b85a8e50>

### Now connect to your TensorBoard and check the loss

1. Go to the Kubeflow interface and click on 'tensorboard'

![menu_tenso](./images/menu_tenso.png)

2. Click on 'new'.

![new](./images/new.png)

3. Link it to your lab, where your training logs are persisted.

![board](./images/board.png)

4. Click `connect` and you should be able to see your training metrics.

![tenso](./images/tenso.png)