# 30 Days of Machine Learning Competition 

---
## Step 1: Import Libraries

In [2]:
import pandas as pd
import numpy as np

# For encoding categorical variables & splitting data:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, KFold, GridSearchCV

# For models:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# For scoring:
from sklearn.metrics import mean_squared_error


---
## Step 2: Load the Data

In [3]:
# Load the training data:
training_df = pd.read_csv("data/train.csv", index_col=0)
testing_df = pd.read_csv("data/test.csv", index_col=0)

# Preview the data:
print(training_df.head())

   cat0 cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9  ...     cont5     cont6  \
id                                                    ...                       
1     B    B    B    C    B    B    A    E    C    N  ...  0.400361  0.160266   
2     B    B    A    A    B    D    A    F    A    O  ...  0.533087  0.558922   
3     A    A    A    C    B    D    A    D    A    F  ...  0.650609  0.375348   
4     B    B    A    C    B    D    A    E    C    K  ...  0.668980  0.239061   
6     A    A    A    C    B    D    A    E    A    N  ...  0.686964  0.420667   

       cont7     cont8     cont9    cont10    cont11    cont12    cont13  \
id                                                                         
1   0.310921  0.389470  0.267559  0.237281  0.377873  0.322401  0.869850   
2   0.516294  0.594928  0.341439  0.906013  0.921701  0.261975  0.465083   
3   0.902567  0.555205  0.843531  0.748809  0.620126  0.541474  0.763846   
4   0.732948  0.679618  0.574844  0.346010  0.714610

---
### Seperating the Data from the Target

In [4]:
# Seperate the target variable from the features:
y = training_df['target']
features = training_df.drop(['target'], axis=1)

# Preview features:
print(features.head())

   cat0 cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9  ...     cont4     cont5  \
id                                                    ...                       
1     B    B    B    C    B    B    A    E    C    N  ...  0.610706  0.400361   
2     B    B    A    A    B    D    A    F    A    O  ...  0.276853  0.533087   
3     A    A    A    C    B    D    A    D    A    F  ...  0.285074  0.650609   
4     B    B    A    C    B    D    A    E    C    K  ...  0.284667  0.668980   
6     A    A    A    C    B    D    A    E    A    N  ...  0.287595  0.686964   

       cont6     cont7     cont8     cont9    cont10    cont11    cont12  \
id                                                                         
1   0.160266  0.310921  0.389470  0.267559  0.237281  0.377873  0.322401   
2   0.558922  0.516294  0.594928  0.341439  0.906013  0.921701  0.261975   
3   0.375348  0.902567  0.555205  0.843531  0.748809  0.620126  0.541474   
4   0.239061  0.732948  0.679618  0.574844  0.346010

---
## Step 3: Prepare the Data


In [5]:
# List of categorical columns:
category_cols = [col for col in features.columns if 'cat' in col]

# Remove any categorical columns:
#category_cols.remove('cat2')
#category_cols.remove('cat4')
#category_cols.remove('cat6')


In [6]:
# List of numerical columns:
number_cols = [col for col in features.columns if 'cat' not in col]

# Remove any columns here:
#number_cols = number_cols.remove('col_name')

### Create Preprocessing Transformers

In [7]:
# Preprocessing for categorical columns:
cat_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder()),
    ]
)


# Preprocessing for numerical columns:
num_transformer = Pipeline(
    steps=[
        ('simple', SimpleImputer(strategy='constant')),
    ]
)

### Bundle Preprocessing Steps

In [8]:
# Bundle preprocessing into column tranformer:
preprocessor = ColumnTransformer(
    transformers = [
        ('cat', cat_transformer, category_cols),
        ('num', num_transformer, number_cols),
    ]
)

### Create a Copy of Our DataFrames

In [9]:
X = features.copy()
X_test = testing_df.copy()

### Split the Data Into a Training & Validation Set

In [10]:
# Split data:
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.3, random_state=0)

---
## Step 4: Setting Up & Training the Model

In [28]:
# Create the random grid:
params = {
    'num_leaves': np.arange(20, 501, 5),
    'max_depth': np.arange(10, 51, 5),
    'reg_alpha': np.arange(0.01, 0.5, 0.01),
    'min_data_in_leaf': np.arange(50, 501, 10),
    'colsample_bytree': [0.65, 0.75, 0.85, 0.95, 1],
    'subsample': [0.65, 0.75, 0.85, 0.95, 1],
}

print(params)

from functools import reduce
print(reduce(lambda x, y: x * y, [len(v) for k, v in params.items()]))

{'num_leaves': array([ 20,  25,  30,  35,  40,  45,  50,  55,  60,  65,  70,  75,  80,
        85,  90,  95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145,
       150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210,
       215, 220, 225, 230, 235, 240, 245, 250, 255, 260, 265, 270, 275,
       280, 285, 290, 295, 300, 305, 310, 315, 320, 325, 330, 335, 340,
       345, 350, 355, 360, 365, 370, 375, 380, 385, 390, 395, 400, 405,
       410, 415, 420, 425, 430, 435, 440, 445, 450, 455, 460, 465, 470,
       475, 480, 485, 490, 495, 500]), 'max_depth': array([10, 15, 20, 25, 30, 35, 40, 45, 50]), 'reg_alpha': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49]), 'min_data_in_leaf': array([ 50,  60,  70

In [30]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
xgb_reg = XGBRegressor(
            learning_rate=0.01,
            n_estimators=10000,
            random_state=0,
            subsample_freq=5,
            tree_method='gpu_hist'
)

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
xgb_grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=params,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)

X_processed = preprocessor.fit_transform(X_train)


In [None]:
# Fit the random search model
xgb_grid.fit(X_processed, y_train)


In [None]:
# View the best params from fitting the random search:
print(xgb_grid.best_params_)


In [None]:
# Evalute the random search model:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy


In [28]:
# Define the model + parameters:
model = RandomForestRegressor(n_estimators=115,
                              random_state=0,
                              n_jobs=-1)


#model = XGBRegressor(n_estimators=350,
#                     learning_rate=0.05,
#                     n_jobs=-1)

### Create a Pipeline

In [29]:
# Define a main pipeline:
my_pipeline = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

### Train Our Model Using the Pipeline

In [30]:
# Fit the model:
my_pipeline.fit(X_train, y_train)


'''
# Preformat
preprocessor.fit(X_valid)
X_valid_transformed = preprocessor.transform(X_valid)


my_pipeline.fit(X_train, y_train,
                model__early_stopping_rounds=20,
                model__eval_set=[(X_valid_transformed, y_valid)],
                model__verbose=False
               )
'''

'\n# Preformat\npreprocessor.fit(X_valid)\nX_valid_transformed = preprocessor.transform(X_valid)\n\n\nmy_pipeline.fit(X_train, y_train,\n                model__early_stopping_rounds=20,\n                model__eval_set=[(X_valid_transformed, y_valid)],\n                model__verbose=False\n               )\n'

---
## Step 5: Evaluating Our Model


### Make A Prediction On the Validation Set

In [31]:
# Generate prediction about the validation dataset:
pred_validate = my_pipeline.predict(X_validate)

### Score Our Predictions

In [32]:
# Score using mean squared error (minus the squared for the competition):
mse = mean_squared_error(y_validate, pred_validate, squared=False)

print("MSE: ", round(mse, 7))  # 0.737501

MSE:  0.7375536


### Determine Feature Importance

In [17]:
# Create a DataFrame to display feature importances:
feature_importances = pd.DataFrame({
    'features': category_cols + number_cols,
    'importance': model.feature_importances_ * 100,
})

print(feature_importances)

   features  importance
0      cat0    0.652925
1      cat1    0.506290
2      cat2    0.475970
3      cat3    0.619199
4      cat5    0.846176
5      cat7    0.516848
6      cat8    1.449139
7      cat9    3.063522
8     cont0    6.345342
9     cont1    6.215426
10    cont2    6.819642
11    cont3    6.452880
12    cont4    6.469812
13    cont5    6.606862
14    cont6    6.237868
15    cont7    6.540410
16    cont8    6.211484
17    cont9    6.666669
18   cont10    7.216699
19   cont11    6.368271
20   cont12    7.247675
21   cont13    6.470889


### Implement Cross Validation For Better Results

In [None]:
'''
kfold = KFold(shuffle=True, n_splits=4)

cv_scores = -1 * cross_val_score(my_pipeline, X, y,
                                 cv=kfold,
                                 n_jobs=-1,
                                 scoring='neg_root_mean_squared_error')
print(cv_scores)
'''

"\nkfold = KFold(shuffle=True, n_splits=4)\n\ncv_scores = -1 * cross_val_score(my_pipeline, X, y,\n                                 cv=kfold,\n                                 n_jobs=-1,\n                                 scoring='neg_root_mean_squared_error')\nprint(cv_scores)\n"

---
## Step 6: Create a Submission File

In [None]:
# Use the model to make predictions:
predictions = my_pipeline.predict(X_test)


In [None]:
# Save the predictions to a CSV file:
output = pd.DataFrame({
    'Id': X_test.index,
    'target': predictions,
})

output.to_csv("submission.csv", index=False)

---
## Step 7: Alternative Approaches

If you're not sure what to do next, you can begin by trying out more model types!
1. If you took the **[Intermediate Machine Learning](https://www.kaggle.com/learn/intermediate-machine-learning)** course, then you learned about **[XGBoost](https://www.kaggle.com/alexisbcook/xgboost)**.  Try training a model with XGBoost, to improve over the performance you got here.
​
2. Take the time to learn about **Light GBM (LGBM)**, which is similar to XGBoost, since they both use gradient boosting to iteratively add decision trees to an ensemble.  In case you're not sure how to get started, **[here's a notebook](https://www.kaggle.com/svyatoslavsokolov/tps-feb-2021-lgbm-simple-version)** that trains a model on a similar dataset.