In [None]:

# library imports
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

# preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# modeling
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# evaluation
from sklearn.metrics import mean_absolute_error

In [3]:
# Locations of the train/test CSV files, relative to the working directory
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join("data", csv_name) for csv_name in ("train.csv", "test.csv")
)

## Understanding Data

In [4]:
# Load the labelled training frame and the unlabelled test frame
full_df = pd.read_csv(filepath_or_buffer=TRAIN_DATA_PATH)
full_X_test = pd.read_csv(filepath_or_buffer=TEST_DATA_PATH)

In [5]:
# Report (rows, columns) for the training and test frames
train_shape = full_df.shape
test_shape = full_X_test.shape
print(f"Shape of the full dataset {train_shape}")
print(f"Shape of the test dataset {test_shape}")

Shape of the full dataset (300000, 26)
Shape of the test dataset (200000, 25)


In [6]:
# Summary statistics of the training frame. The rendered table below lists only
# `id`, the `cont*` columns and `target`; the `cat*` columns do not appear in it.
full_df.describe()

Unnamed: 0,id,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,250018.576947,0.527335,0.460926,0.490498,0.496689,0.491654,0.510526,0.467476,0.537119,0.498456,0.474872,0.474492,0.473216,0.494561,0.508273,8.241979
std,144450.15001,0.230599,0.214003,0.253346,0.219199,0.240074,0.228232,0.210331,0.21814,0.23992,0.218007,0.255949,0.222022,0.247292,0.22295,0.746555
min,1.0,-0.118039,-0.069309,-0.056104,0.130676,0.255908,0.045915,-0.224689,0.203763,-0.260275,0.117896,0.048732,0.052608,-0.074208,0.15105,0.140329
25%,124772.5,0.405965,0.310494,0.300604,0.329783,0.284188,0.354141,0.342873,0.355825,0.332486,0.306874,0.276017,0.308151,0.289074,0.300669,7.742071
50%,250002.5,0.497053,0.427903,0.502462,0.465026,0.39047,0.488865,0.429383,0.504661,0.439151,0.43462,0.459975,0.433812,0.422887,0.4724,8.191373
75%,375226.5,0.66806,0.615113,0.647512,0.664451,0.696599,0.669625,0.573383,0.703441,0.606056,0.614333,0.691579,0.642057,0.714502,0.758447,8.728634
max,499999.0,1.058443,0.887253,1.034704,1.03956,1.055424,1.067649,1.111552,1.032837,1.040229,0.982922,1.05596,1.071444,0.975035,0.905992,10.411992


In [7]:
# Count how many columns of the training frame contain at least one missing value
# (the result is 0, i.e. the dataset has no nulls)
full_df.isna().any(axis=0).sum()

0

In [8]:
# Separate the regression target from the features; copies keep full_df untouched
y = full_df["target"].copy()
X = full_df.drop(columns="target").copy()

In [9]:
# Sanity check: X should have the same number of rows as full_df and one column
# fewer (the dropped `target`).
X.shape

(300000, 25)

In [10]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Feature groups are selected by column-name *prefix* (not by substring), so a
# column that merely contains "cat"/"cont" somewhere in its name is not picked
# up by accident. The `id` column matches neither prefix and is left out of both.
# categorical columns start with "cat"
categorical_cols = [cname for cname in X.columns if cname.startswith('cat')]
# numerical columns start with "cont"
numerical_cols = [cname for cname in X.columns if cname.startswith('cont')]

In [12]:

# Report the number of distinct values in every categorical column and flag
# those with more than 10 levels
for col, n_levels in X[categorical_cols].nunique().items():
    print(f"{col} has {n_levels}")
    if n_levels > 10:
        print(f"\t{col} has a high cardinality")

# Even though cat9 has more than 10 unique values, we will still use the OneHotEncoder

cat0 has 2
cat1 has 2
cat2 has 2
cat3 has 4
cat4 has 4
cat5 has 4
cat6 has 8
cat7 has 8
cat8 has 7
cat9 has 15
	cat9 has a high cardinality


In [13]:
# Preprocessing: standardise the continuous features and one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time)
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown="ignore")

# Columns not named in either list are dropped (the ColumnTransformer default,
# spelled out here for clarity)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, numerical_cols),
        ("cat", cat_transformer, categorical_cols),
    ],
    remainder="drop",
)


In [14]:
# Baseline: random forest with default hyper-parameters (seeded for
# reproducibility), chained after the shared preprocessing step
rf_model = RandomForestRegressor(random_state=1)
rf_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", rf_model),
    ]
)

In [15]:
# Train the baseline pipeline (preprocessing + random forest) on the training split
rf_pipeline.fit(X=X_train, y=y_train)

In [None]:
# predictions and evaluation on the held-out validation split
# (the pipeline was fitted on X_train/y_train, so it must be scored on
# X_valid against y_valid, not against the full target vector)
rf_predictions = rf_pipeline.predict(X_valid)
rf_mae = mean_absolute_error(y_valid, rf_predictions)
print(f"MAE is {rf_mae:0.2f}")


In [None]:
# output: predict on the test set loaded earlier (full_X_test) and write the
# submission file
test_prediction = rf_pipeline.predict(full_X_test)
output = pd.DataFrame({
    # ids come from the `id` column; the positional row index would not match,
    # since the ids in this dataset are not contiguous
    "id": full_X_test["id"],
    "target": test_prediction,
})
# to_csv does not create missing directories, so make sure the folder exists
os.makedirs("output", exist_ok=True)
output.to_csv(os.path.join("output", "submission1.csv"), index=False)

In [None]:
def get_score(pipeline, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit ``pipeline`` and return its mean absolute error on a validation set.

    Parameters
    ----------
    pipeline : object exposing ``fit(X, y)`` and ``predict(X)``
        The estimator to evaluate. It is fitted in place.
    X_t, y_t : training features and target (default: the notebook's training split)
    X_v, y_v : validation features and target (default: the notebook's validation split)

    Returns
    -------
    The value of ``mean_absolute_error(y_v, predictions)``.

    Note: the default arguments are bound when this cell is executed, so
    re-run the cell after changing the train/validation split.
    """
    pipeline.fit(X_t, y_t)
    return mean_absolute_error(y_true=y_v, y_pred=pipeline.predict(X_v))
    