XGBoost (eXtreme Gradient Boosting) is an open-source software library for gradient boosting on decision trees. It is designed to be efficient and scalable, and has been widely used in machine learning competitions and in industry. XGBoost is known for its ability to handle missing values and categorical variables, and for its fast training speed and good performance on a variety of tasks. It also has built-in support for parallel processing, which can further speed up training.

In [1]:
pip install xgboost
pip install -U scikit-learn

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import warnings 
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from plotnine import *
import statsmodels.api as sm

from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression # Linear Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import mean_squared_error, r2_score #model evaluation

from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb
from sklearn.metrics import accuracy_score

%matplotlib inline


In [4]:
#Import the data

trainData = pd.read_csv('/Users/elliestrande/Desktop/WiDS/widsdatathon2023/train_data.csv')
testData = pd.read_csv('/Users/elliestrande/Desktop/WiDS/widsdatathon2023/test_data.csv')


In [5]:
#Drop missing values

trainData = trainData.dropna()
trainData.reset_index()
trainData

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,9/1/14,237.00,29.02,31.64,29.57,30.73,29.71,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.50,11.90,4.58
1,1,0.0,0.833333,9/2/14,228.90,29.02,31.64,29.57,30.73,29.71,...,-21.13,-36.57,8.77,21.17,4.44,48.60,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,...,9.83,-31.80,7.47,38.62,-5.21,54.73,-2.58,-42.30,21.91,10.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375729,375729,1.0,0.866667,8/27/16,312.05,23.13,27.20,20.25,24.43,18.35,...,-15.64,-75.68,-3.09,6.93,-16.69,16.98,-13.85,50.25,-31.33,0.77
375730,375730,1.0,0.866667,8/28/16,305.82,23.13,27.20,20.25,24.43,18.35,...,-7.59,-76.42,-13.55,13.36,-15.96,20.45,-16.36,51.65,-30.73,10.10
375731,375731,1.0,0.866667,8/29/16,311.62,23.13,27.20,20.25,24.43,18.35,...,-6.25,-70.65,-23.93,22.62,-16.71,20.28,-15.48,48.58,-18.74,9.28
375732,375732,1.0,0.866667,8/30/16,304.54,23.13,27.20,20.25,24.43,18.35,...,-7.16,-57.67,-33.55,32.06,-16.07,16.60,-20.61,39.23,-16.26,-0.22


In [6]:
#Find which variables are strings/continuous (objects)

typesDF = pd.DataFrame(trainData.dtypes)

typesDF


Unnamed: 0,0
index,int64
lat,float64
lon,float64
startdate,object
contest-pevpr-sfc-gauss-14d__pevpr,float64
...,...
wind-vwnd-925-2010-16,float64
wind-vwnd-925-2010-17,float64
wind-vwnd-925-2010-18,float64
wind-vwnd-925-2010-19,float64


In [7]:
#Create list of predictor variables
preds = list(trainData)

preds.remove('index')
preds.remove('lat')
preds.remove('startdate')
preds.remove('climateregions__climateregion')
preds.remove('mjo1d__phase')
preds.remove('mei__meirank')
preds.remove('mei__nip')
preds.remove('contest-tmp2m-14d__tmp2m')

X = trainData[preds]
y = trainData['contest-tmp2m-14d__tmp2m']


## **LINEAR REGRESSION**

In [6]:
#Simple linear 

#Train test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2,
                                                  random_state = 5)

In [7]:
#Standardize/Z score

# z = StandardScaler()
# X_train[preds] = z.fit_transform(X_train[preds])
# X_val[preds] = z.transform(X_val[preds])

In [8]:
#Create and fit model
lr = LinearRegression()
lr.fit(X_train, y_train)

In [18]:
#Predictions

y_pred = lr.predict(X_val)

In [19]:
#Training MSE
print('Train MSE: ', mean_squared_error(y_train, lr.predict(X_train)))

#Testing MSE
print('Test MSE: ', mean_squared_error(y_val, y_pred))

Train MSE:  1.756318935628493
Test MSE:  1.7494598023768528


In [20]:
#Training R2
print('Train R2: ', lr.score(X_train, y_train))

#Testing R2
print('Test R2: ', lr.score(X_val, y_val))

Train R2:  0.9822431134914706
Test R2:  0.9823112515571618


In [None]:
testData['index']

0        375734
1        375735
2        375736
3        375737
4        375738
          ...  
31349    407083
31350    407084
31351    407085
31352    407086
31353    407087
Name: index, Length: 31354, dtype: int64

In [None]:
#Predict on the test data

finalPreds = lr.predict(testData[preds])

In [None]:
#Create dataframe to store the results
results = pd.DataFrame(finalPreds, columns = ['contest-tmp2m-14d__tmp2m'])

results['index'] = testData['index']

results

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,27.727076,375734
1,27.635880,375735
2,27.390594,375736
3,27.265669,375737
4,27.212049,375738
...,...,...
31349,6.334028,407083
31350,6.304296,407084
31351,5.244093,407085
31352,5.685833,407086


In [None]:
results.to_csv('solution.csv', index = False)

## **XGBOOST**

In [8]:
# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [9]:
from xgboost.sklearn import XGBRegressor
#XGBoost hyperparameter tuning

#Split the data into seperate training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,
                                                  random_state = 5)

In [10]:
#Hyperparameter tuning

#Initialize domain space for range of values
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0,
    }

#Define objective function
def objective(space):
    clf=xgb.XGBRegressor(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="rmse",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = r2_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

trials = Trials()

#Optimize 
best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 50,
                        trials = trials)

print("The best hyperparameters are : " , "\n")
print(best_hyperparams)

  0%|          | 0/50 [00:12<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [53]:
#Define the xgboost regression model
model = XGBRegressor(n_estimators = 100, max_depth = 14, learning_rate = 0.3,
                    colsample_bytree = 0.9423125131065853, gamma = 2.7106590852866628,
                    min_child_weight = 3, reg_alpha = 42, reg_lambda = 0.5805879363282948, eta = 0.3,
                    subsample = 0.8)

#Train the model
model.fit(X_train, y_train)


#Predictions
y_pred = model.predict(X_test)

In [59]:
def print_results_gridsearch(gridsearch, list_param1, list_param2, name_param1, name_param2):
  
  # Checking the results from each run in the gridsearch: 
  means = gridsearch.cv_results_['mean_test_score']
  stds = gridsearch.cv_results_['std_test_score']
  params = gridsearch.cv_results_['params']
  print("The results from each run in the gridsearch:")
  for mean, stdev, param in zip(means, stds, params):
    print("roc_auc = %f (%f) with: %r" % (mean, stdev, param))
  
  #Visualizing the results from each run in the gridsearch: 
  scores = np.array(means).reshape(len(list_param1), len(list_param2))
  for i, value in enumerate(list_param1):
    plt.plot(list_param2, scores[i], label= str(name_param1) + ': ' + str(value))
    plt.legend()
    plt.xlabel(str(name_param2))
    plt.ylabel('ROC AUC')  
    plt.show()
    
  # Checking the best performing model:
  print("\n")
  print("Best model: roc_auc = %f using %s" % (gridsearch.best_score_, gridsearch.best_params_))

In [73]:
#Best model after hyperparameter tuning

#Define the xgboost regression model
model = XGBRegressor(n_estimators = 1000, max_depth = 10, learning_rate = 0.2,
                    colsample_bytree = 0.8, gamma = 0.15,
                    min_child_weight = 2, reg_alpha = 42, reg_lambda = 0.5805879363282948, eta = 0.3,
                    subsample = 0.8, scale_pos_weight = 1, nthread = 4, seed = 27)

#Train the model
model.fit(X_train, y_train)


#Predictions
y_pred = model.predict(X_test)

In [19]:
#Training MSE
print(mean_squared_error(y_train, model.predict(X_train)))

#Testing MSE
print(mean_squared_error(y_test, y_pred))

0.08346232486625865
0.18834333412149212


In [75]:
testData.shape

(31354, 245)

In [20]:
#Predict on the test data

finalPreds = model.predict(testData[preds])

In [21]:
#Create dataframe to store the results
results = pd.DataFrame(finalPreds, columns = ['contest-tmp2m-14d__tmp2m'])

results['index'] = testData['index']

results

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,29.693546,375734
1,29.656181,375735
2,29.735716,375736
3,29.746695,375737
4,29.604351,375738
...,...,...
31349,4.439295,407083
31350,5.537831,407084
31351,5.190330,407085
31352,4.375662,407086


In [22]:
results.to_csv('/Users/elliestrande/Desktop/WiDS/solution.csv', index = False)