# 30 Days of ML Competition
---

### **Step 1: Import Libraries**

In [2]:
#!pip install numpy pandas scikit-learn xgboost keras --quiet

In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (train_test_split, cross_val_score, KFold,
                                     RandomizedSearchCV, GridSearchCV)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor


---

### **Step 2: Load the Data**


In [2]:
# Load the training data:
training_df = pd.read_csv("data/train.csv", index_col=0)
testing_df = pd.read_csv("data/test.csv", index_col=0)

# Preview the data:
print(training_df.head())

   cat0 cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9  ...     cont5     cont6  \
id                                                    ...                       
1     B    B    B    C    B    B    A    E    C    N  ...  0.400361  0.160266   
2     B    B    A    A    B    D    A    F    A    O  ...  0.533087  0.558922   
3     A    A    A    C    B    D    A    D    A    F  ...  0.650609  0.375348   
4     B    B    A    C    B    D    A    E    C    K  ...  0.668980  0.239061   
6     A    A    A    C    B    D    A    E    A    N  ...  0.686964  0.420667   

       cont7     cont8     cont9    cont10    cont11    cont12    cont13  \
id                                                                         
1   0.310921  0.389470  0.267559  0.237281  0.377873  0.322401  0.869850   
2   0.516294  0.594928  0.341439  0.906013  0.921701  0.261975  0.465083   
3   0.902567  0.555205  0.843531  0.748809  0.620126  0.541474  0.763846   
4   0.732948  0.679618  0.574844  0.346010  0.714610

In [3]:

# Seperate the Data from the Target:
y = training_df["target"]
features = training_df.drop(["target"], axis=1)

# Drop noise features:
dropped_features = ["cat0", "cat1", "cat2", "cat3", "cat4", "cat6", "cat7", 'cat9']
features = features.drop(dropped_features, axis=1)

#Add features:
features["cat1_A"] = training_df["cat1"].apply(lambda x: 1 if x == "A" else 0)
features["cat8_C"] = training_df["cat8"].apply(lambda x: 1 if x == "C" else 0)
features["cat8_E"] = training_df["cat8"].apply(lambda x: 1 if x == "E" else 0)


print(features.head())

   cat5 cat8     cont0     cont1     cont2     cont3     cont4     cont5  \
id                                                                         
1     B    C  0.201470 -0.014822  0.669699  0.136278  0.610706  0.400361   
2     D    A  0.743068  0.367411  1.021605  0.365798  0.276853  0.533087   
3     D    A  0.742708  0.310383 -0.012673  0.576957  0.285074  0.650609   
4     D    C  0.429551  0.620998  0.577942  0.280610  0.284667  0.668980   
6     D    A  1.058291  0.367492 -0.052389  0.232407  0.287595  0.686964   

       cont6     cont7     cont8     cont9    cont10    cont11    cont12  \
id                                                                         
1   0.160266  0.310921  0.389470  0.267559  0.237281  0.377873  0.322401   
2   0.558922  0.516294  0.594928  0.341439  0.906013  0.921701  0.261975   
3   0.375348  0.902567  0.555205  0.843531  0.748809  0.620126  0.541474   
4   0.239061  0.732948  0.679618  0.574844  0.346010  0.714610  0.540150   
6   0.42066

---
### **Step 3: Prepare the Data**

#### Seperate Our Variable Types:

In [4]:
# List of the categorical columns:
#categorical_cols = [col for col in features.columns if 'cat' in col]
categorical_cols = ['cat5', 'cat8']

#-----------------------------------------------------------------------

# List of the numerical columns:
number_cols = [col for col in features.columns if 'cat' not in col]

#-----------------------------------------------------------------------

# Additional binary features:
binary_cols = ['cat1_A', 'cat8_C', 'cat8_E']

#### Create Preproprocessing Transformers

In [5]:
# Preprocess categorical data:
category_transformer = Pipeline(
    steps=[
      ('imputer', SimpleImputer(strategy='most_frequent')),
      ('ordinal', OrdinalEncoder()),
    ]
)


# Preprocess numerical data:
number_transformer = Pipeline(
    steps=[
      ('simple', SimpleImputer(strategy='constant')),
    ]
)


In [6]:
# Combine the preprocessing steps into column transformer:
preprocessor = ColumnTransformer(
    transformers=[
      ('cat', category_transformer, categorical_cols),
      ('num', number_transformer, number_cols),
    ]
)

#### Copy Our DataFrames

In [7]:
X = features.copy()
X_test = testing_df.copy()


#### Split the Data Into Training & Validation Sets

In [8]:
# Split data:
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0
)


---

### **Step 4: Setting Up & Training the Model**

In [43]:

# Create the random grid:
params = {
        'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05],
}

print(params)

X_validate_processed = preprocessor.fit_transform(X_validate)
# Configure the model to use GPU:
xgb_reg = XGBRegressor(
    n_estimators=1160,
    learning_rate=0.03,
    min_split_loss=0.06,
    colsample_bytree=0.3,
    objective='reg:squarederror',
    subsample=0.85,
    tree_method='gpu_hist',
    gpu_id=0,
)

# Random search of parameters, using 3 fold cross validation
xgb_grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=2,
    verbose=2,
    n_jobs=-1,
)

# Preprocess training data:
X_processed = preprocessor.fit_transform(X_train)


{'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05]}


In [34]:
# Fit the Random Search model:
xgb_grid.fit(X_processed, y_train)

# See the best params from fitting the random search:
print(xgb_grid.best_params_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


         nan         nan         nan         nan]


{'max_depth': 10}


In [35]:
# Evaluate the random search model:
def evaluate(model, test_features, test_labels):
  predictions = model.predict(test_features)
  errors = abs(predictions - test_labels)
  mape = 100 * np.mean(errors / test_labels)
  accuracy = 100 - mape
  print('Model Performance')
  print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
  print('Accuracy = {:0.2f}%.'.format(accuracy))
    
  return accuracy


evaluate(xgb_grid, X_processed, y_train)


Model Performance
Average Error: 0.4114 degrees.
Accuracy = 94.95%.


94.94966440535256

#### Create a Pipeline

In [51]:

my_pipeline = Pipeline(
    steps=[
      ('preprocessor', preprocessor),
      ('model', XGBRegressor(
                  n_estimators=1160,
                  learning_rate=0.03,
                  random_state=0,
                  n_jobs=-1,
                  subsample=0.85,
                  colsample_bytree=0.3,
                  min_child_weight=22,
                  min_split_loss=0.06,
                  tree_method='gpu_hist',
                  gpu_id=0
                )
      ),
    ]
)


#### Train the Model

In [52]:
my_pipeline.fit(X_train, y_train)

XGBoostError: [18:15:07] C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:103: label must be in [0,1] for logistic regression

---
### **Step 5: Evaluate Our Model**


In [49]:
# Generate predictions on the validation set:
pred_validate = my_pipeline.predict(X_validate)

# Score Our Predictions -- using mean root squared error
mse = mean_squared_error(y_validate, pred_validate, squared=False)

print("MSE: ", round(mse, 7))


MSE:  0.7240738


#### Determine Feature Importance

---
### **Final Step: Create the Submission File**

In [42]:
X_test = X_test.drop(dropped_features, axis=1)
#Add features:
X_test["cat1_A"] = testing_df["cat1"].apply(lambda x: 1 if x == "A" else 0)
X_test["cat8_C"] = testing_df["cat8"].apply(lambda x: 1 if x == "C" else 0)
X_test["cat8_E"] = testing_df["cat8"].apply(lambda x: 1 if x == "E" else 0)


# Use the model to make predictions:
predictions = my_pipeline.predict(X_test)

# Save the predictions to a CSV file:
output = pd.DataFrame({
    'Id': X_test.index,
    'target': predictions,
})

output.to_csv("submission.csv", index=False)
