# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# config setup for fast experiment
from src.config import Config

# some utilities
from src.utils import show_samples, process_im, save_model

# boosting lib 
import catboost 

import warnings
warnings.filterwarnings(action='ignore')

# Set up experiment pipeline

In [2]:
# One seed for the whole experiment: it seeds NumPy's global RNG here and is
# reused as the model's random_seed further down.
seed_val = 2021
np.random.seed(seed_val)

# Loading data

In [3]:
# Load the sampled training table and separate features from the target.
# Features are every column except the last two; the target is 'Yield'.
# NOTE(review): this assumes the last two columns are non-feature columns
# (presumably 'Yield' plus one more) — confirm against train_sampled.csv.
train_sampled = pd.read_csv(os.path.join(Config.data_dir, 'train_sampled.csv'))
X, y = train_sampled[train_sampled.columns[:-2]], train_sampled['Yield']

# Hold out a validation split. random_state is passed explicitly so the split
# is the same every time this cell runs; without it train_test_split draws
# from NumPy's global RNG, whose state advances on each call, so re-running
# the cell would silently produce a different split.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=seed_val)

# Modeling

In [4]:
# CatBoost regressor: the learning rate comes from the experiment config and
# the seed is the notebook-wide seed_val.
model_params = {
    'learning_rate': Config.lr,
    'random_seed': seed_val,
}
model = catboost.CatBoostRegressor(**model_params)

# Fit the model


In [5]:
# Train on the training split, monitoring the hold-out split as the evaluation
# set and letting CatBoost stop early via early_stopping_rounds=300.
fit_options = {
    'eval_set': [(X_val, y_val)],
    'early_stopping_rounds': 300,
}
model = model.fit(X_train, y_train, **fit_options)

0:	learn: 1.7613043	test: 1.6678065	best: 1.6678065 (0)	total: 96.6ms	remaining: 1m 36s
1:	learn: 1.7580927	test: 1.6653135	best: 1.6653135 (1)	total: 143ms	remaining: 1m 11s
2:	learn: 1.7548337	test: 1.6630425	best: 1.6630425 (2)	total: 158ms	remaining: 52.5s
3:	learn: 1.7519330	test: 1.6606273	best: 1.6606273 (3)	total: 170ms	remaining: 42.3s
4:	learn: 1.7487523	test: 1.6583572	best: 1.6583572 (4)	total: 176ms	remaining: 35s
5:	learn: 1.7455520	test: 1.6558969	best: 1.6558969 (5)	total: 183ms	remaining: 30.3s
6:	learn: 1.7423634	test: 1.6538020	best: 1.6538020 (6)	total: 200ms	remaining: 28.3s
7:	learn: 1.7395222	test: 1.6515738	best: 1.6515738 (7)	total: 209ms	remaining: 25.9s
8:	learn: 1.7364714	test: 1.6495672	best: 1.6495672 (8)	total: 218ms	remaining: 24s
9:	learn: 1.7332484	test: 1.6471990	best: 1.6471990 (9)	total: 232ms	remaining: 22.9s
10:	learn: 1.7304020	test: 1.6450985	best: 1.6450985 (10)	total: 240ms	remaining: 21.6s
11:	learn: 1.7275716	test: 1.6427629	best: 1.6427629 

In [6]:
# Validation RMSE. The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6; np.sqrt(MSE) yields the same value on every version.
# NOTE(review): X_val was also the eval_set used for early stopping during
# fit, so this score is likely optimistic.
val_preds = model.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print('Score:', val_rmse)

Score: 1.524083843014151


In [10]:
# Persist the fitted model under the configured models directory and keep
# whatever save_model returns as model_path for the prediction step.
# NOTE(review): save_model is a project helper — presumably it returns the
# path it wrote to; confirm in src/utils.
model_file = os.path.join(Config.models_dir, 'yield_predictor.cbm')
model_path = save_model(model_name=model_file, model=model)

# Make predictions

In [11]:
# Load the sample submission file; it provides the row layout of the output.
ss = pd.read_csv(os.path.join(Config.data_dir, 'SampleSubmission.csv'))

# Load the pre-sampled test features.
# NOTE(review): the whole frame is passed to predict(), so its columns are
# assumed to match the training features (train used columns[:-2]) — confirm.
test_sampled = pd.read_csv(os.path.join(Config.data_dir, 'test_sampled.csv'))

# Restore the saved model into a fresh estimator instead of reloading into the
# in-memory `model`: this leaves the trained object untouched, exercises the
# artifact that was actually written to disk, and does not depend on
# load_model() returning the estimator.
trained_model = catboost.CatBoostRegressor()
trained_model.load_model(fname=model_path)

# Get model predictions
preds = trained_model.predict(test_sampled)

# Fail with a clear message if the test table and the submission template
# disagree on the number of rows.
assert len(preds) == len(ss), (
    f'got {len(preds)} predictions for {len(ss)} submission rows'
)

# Store them in the submission dataframe and save; the file name records the
# base model and learning rate from the config.
ss['Yield'] = preds
filename = f'{Config.base_model}_lr_{Config.lr}.csv'
ss.to_csv(os.path.join(Config.submissions_dir, filename), index=False)
ss.head()

Unnamed: 0,Field_ID,Yield
0,E9UZCEA,3.316812
1,1WGGS1Q,3.352659
2,EG2KXE2,3.517634
3,HC3GQXF,3.387906
4,7AK6GFK,3.325712


In [12]:
# Sanity check: display the (rows, columns) shape of the submission frame.
ss.shape

(1055, 2)