# 30 Days of ML Competition
---

### **Step 1: Import Libraries**

In [None]:
#!pip install numpy pandas scikit-learn xgboost keras --quiet

In [27]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (train_test_split, cross_val_score, KFold,
                                     RandomizedSearchCV)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor


---

### **Step 2: Load the Data**


In [28]:
# Load the training data:
training_df = pd.read_csv("data/train.csv", index_col=0)
testing_df = pd.read_csv("data/test.csv", index_col=0)

# Preview the data:
print(training_df.head())

   cat0 cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9  ...     cont5     cont6  \
id                                                    ...                       
1     B    B    B    C    B    B    A    E    C    N  ...  0.400361  0.160266   
2     B    B    A    A    B    D    A    F    A    O  ...  0.533087  0.558922   
3     A    A    A    C    B    D    A    D    A    F  ...  0.650609  0.375348   
4     B    B    A    C    B    D    A    E    C    K  ...  0.668980  0.239061   
6     A    A    A    C    B    D    A    E    A    N  ...  0.686964  0.420667   

       cont7     cont8     cont9    cont10    cont11    cont12    cont13  \
id                                                                         
1   0.310921  0.389470  0.267559  0.237281  0.377873  0.322401  0.869850   
2   0.516294  0.594928  0.341439  0.906013  0.921701  0.261975  0.465083   
3   0.902567  0.555205  0.843531  0.748809  0.620126  0.541474  0.763846   
4   0.732948  0.679618  0.574844  0.346010  0.714610

In [76]:

# Seperate the Data from the Target:
y = training_df["target"]
features = training_df.drop(["target"], axis=1)

# Drop additonal features:
dropped_features = ["cat0", "cat2", "cat3", "cat4", "cat6", "cat7", 'cat9']
features = features.drop(dropped_features, axis=1)


print(features.head())

   cat1 cat5 cat8     cont0     cont1     cont2     cont3     cont4     cont5  \
id                                                                              
1     B    B    C  0.201470 -0.014822  0.669699  0.136278  0.610706  0.400361   
2     B    D    A  0.743068  0.367411  1.021605  0.365798  0.276853  0.533087   
3     A    D    A  0.742708  0.310383 -0.012673  0.576957  0.285074  0.650609   
4     B    D    C  0.429551  0.620998  0.577942  0.280610  0.284667  0.668980   
6     A    D    A  1.058291  0.367492 -0.052389  0.232407  0.287595  0.686964   

       cont6     cont7     cont8     cont9    cont10    cont11    cont12  \
id                                                                         
1   0.160266  0.310921  0.389470  0.267559  0.237281  0.377873  0.322401   
2   0.558922  0.516294  0.594928  0.341439  0.906013  0.921701  0.261975   
3   0.375348  0.902567  0.555205  0.843531  0.748809  0.620126  0.541474   
4   0.239061  0.732948  0.679618  0.574844  0.346010

---
### **Step 3: Prepare the Data**

#### Seperate Our Variable Types:

In [77]:
# List of the categorical columns:
#categorical_cols = [col for col in features.columns if 'cat' in col]
categorical_cols = ['cat1', 'cat5', 'cat8']

# Remove any category columns with little importance:
#categorical_cols.remove('col_name')

#-----------------------------------------------------------------------

# List of the numerical columns:
number_cols = [col for col in features.columns if 'cat' not in col]

# Remove any columns here:
#number_cols = number_cols.remove('col_name')

#### Create Preproprocessing Transformers

In [78]:
# Preprocess categorical data:
category_transformer = Pipeline(
    steps=[
      ('imputer', SimpleImputer(strategy='most_frequent')),
      ('ordinal', OrdinalEncoder()),
    ]
)


# Preprocess numerical data:
number_transformer = Pipeline(
    steps=[
      ('simple', SimpleImputer(strategy='constant')),
    ]
)


In [79]:
# Combine the preprocessing steps into column transformer:
preprocessor = ColumnTransformer(
    transformers=[
      ('cat', category_transformer, categorical_cols),
      ('num', number_transformer, number_cols),
    ]
)

#### Copy Our DataFrames

In [80]:
X = features.copy()
X_test = testing_df.copy()


#### Split the Data Into Training & Validation Sets

In [81]:
# Split data:
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0
)


---

### **Step 4: Setting Up & Training the Model**

In [82]:
# Define the model + parameters:
model = RandomForestRegressor()


# Define option parameters for random search:
params = {
    'n_estimators': [300],
    'max_features': ['sqrt'],
    'max_depth': [20, 30, 40, 50, None],
    'min_samples_split': [10],
    'min_samples_leaf': [8],
    'bootstrap': [True],
}

random_search_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=25,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=0,
    n_jobs=10
)
'''
Fitting 3 folds for each of 18 candidates, totalling 54 fits
{'n_estimators': 300, 'min_samples_split': 10, 'max_features': 'sqrt'}
'''


"\nFitting 3 folds for each of 18 candidates, totalling 54 fits\n{'n_estimators': 300, 'min_samples_split': 10, 'max_features': 'sqrt'}\n"

In [83]:
# Preprocess training data:
X_preprocessed = preprocessor.fit_transform(X_train)


In [67]:
# Fit the Random Search model:
random_search_cv.fit(X_preprocessed, y_train)

# See the best params from fitting the random search:
print(random_search_cv.best_params_)



Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': True}


In [68]:
# Evaluate the random search model:
def evaluate(model, test_features, test_labels):
  predictions = model.predict(test_features)
  errors = abs(predictions - test_labels)
  mape = 100 * np.mean(errors / test_labels)
  accuracy = 100 - mape
  print('Model Performance')
  print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
  print('Accuracy = {:0.2f}%.'.format(accuracy))
    
  return accuracy


evaluate(random_search_cv, X_preprocessed, y_train)


Model Performance
Average Error: 0.4441 degrees.
Accuracy = 94.52%.


94.52119273762196

#### Create a Pipeline

In [84]:

my_pipeline = Pipeline(
    steps=[
      ('preprocessor', preprocessor),
      ('model', RandomForestRegressor(
                  n_estimators=300,
                  random_state=0,
                  min_samples_split=10,
                  min_samples_leaf=8,
                  n_jobs=-1,
                  max_features= 'sqrt',
                  max_depth=30,
                  bootstrap=True,
                )
      ),
    ]
)


#### Train the Model

In [85]:
my_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ordinal',
                                                                   OrdinalEncoder())]),
                                                  ['cat1', 'cat5', 'cat8']),
                                                 ('num',
                                                  Pipeline(steps=[('simple',
                                                                   SimpleImputer(strategy='constant'))]),
                                                  ['cont0', 'cont1', 'cont2',
                                                   'cont3', 'cont4', 'cont5',
                                                   'cont6', 'cont7', 'cont8',
      

---
### **Step 5: Evaluate Our Model**


In [86]:
# Generate predictions on the validation set:
pred_validate = my_pipeline.predict(X_validate)

# Score Our Predictions -- using mean root squared error
mse = mean_squared_error(y_validate, pred_validate, squared=False)

print("MSE: ", round(mse, 7))


MSE:  0.7333595


#### Determine Feature Importance

---
### **Final Step: Create the Submission File**

In [75]:
X_test = X_test.drop(dropped_features, axis=1)

# Use the model to make predictions:
predictions = my_pipeline.predict(X_test)

# Save the predictions to a CSV file:
output = pd.DataFrame({
    'Id': X_test.index,
    'target': predictions,
})

output.to_csv("submission.csv", index=False)
