<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/machine_learning_categorical_monday.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get -qq install tree

In [0]:
import os

import numpy as np
import pandas as pd

from getpass import getpass 

In [0]:
def access_kaggle():
    """
    Access Kaggle from Google Colab.
    If the /root/.kaggle does not exist then prompt for
    the username and for the Kaggle API key.
    Creates the kaggle.json access file in the /root/.kaggle/ folder. 
    """
    KAGGLE_ROOT = os.path.join('/root', '.kaggle')
    KAGGLE_PATH = os.path.join(KAGGLE_ROOT, 'kaggle.json')

    if '.kaggle' not in os.listdir(path='/root'):
        user = getpass(prompt='Kaggle username: ')
        key  = getpass(prompt='Kaggle API key: ')
        
        !mkdir $KAGGLE_ROOT
        !touch $KAGGLE_PATH
        !chmod 666 $KAGGLE_PATH
        with open(KAGGLE_PATH, mode='w') as f:
            f.write('{"username":"%s", "key":"%s"}' %(user, key))
            f.close()
        !chmod 600 $KAGGLE_PATH
        del user
        del key
        success_msg = "Kaggle is successfully set up. Good to go."
        print(f'{success_msg}')

access_kaggle()


In [0]:
!kaggle competitions download -c home-data-for-ml-course -p datasets/ml-course
!tree -sh ./
!cat -n datasets/ml-course/train.csv|head -2
df = pd.read_csv('datasets/ml-course/train.csv', sep=',', index_col=0)

df.columns = df.columns.map(lambda c: c.lower())
df.columns

### Steps:


*   step 1: `target`, `predictors` separation

*    step 2: `train`, `test` separation

*    step 3: drop columns with `missing values`

*    step 4: `numerical` and `categorical` columns


*    step 5: `train_x`, `valid_x` with `my_columns`




### Three approaches:



*   Approach 1: Simply `drop` the `categorical columns`.
*   Approach 2: `Label encoding`
*   Approach 3: `One-Hot` Encoding





In [19]:
# step 1: target, predictors separation

# Before we separate the target column we need to know that it
# does not have any missing values.
print(f'# of missing values in the salesprice column: {df.saleprice.isnull().sum()}')

# of missing values in the salesprice column: 0


In [0]:
# We are good to go with the salesprice column as a target.
y = df.saleprice

# Predictors in X.
X = df.drop(['saleprice'], axis='columns')

In [23]:
# step 2: train, test separation
from sklearn.model_selection import train_test_split

train_x_full, valid_x_full, train_y, valid_y = train_test_split(X, y, test_size=0.2,
                                                                random_state=42)
# checking that shape of train_x_full and valid_x_full:
X.shape, train_x_full.shape, valid_x_full.shape

((1460, 79), (1168, 79), (292, 79))



```python
>>> train_x_full.shape
```
gives us `1168` records out of the total `1460` records.

```python
>>> valid_x_full.shape
```
gives us `292` records out of the total `1460` records.

This allines with the 20/80 percent test/train data split.



In [24]:
# step 3: dropping all the missing value columns

# One way to see all the missing value columns:
nullcolumns = train_x_full.isnull().sum()
nullcolumns[nullcolumns > 0]

lotfrontage      217
alley           1094
masvnrtype         6
masvnrarea         6
bsmtqual          28
bsmtcond          28
bsmtexposure      28
bsmtfintype1      28
bsmtfintype2      28
electrical         1
fireplacequ      547
garagetype        64
garageyrblt       64
garagefinish      64
garagequal        64
garagecond        64
poolqc          1162
fence            935
miscfeature     1122
dtype: int64

In [25]:
# Another way to get the list of the missing value columns:
columns_with_NaN = [col for col in train_x_full.columns if train_x_full[col].isnull().any()]
columns_with_NaN

['lotfrontage',
 'alley',
 'masvnrtype',
 'masvnrarea',
 'bsmtqual',
 'bsmtcond',
 'bsmtexposure',
 'bsmtfintype1',
 'bsmtfintype2',
 'electrical',
 'fireplacequ',
 'garagetype',
 'garageyrblt',
 'garagefinish',
 'garagequal',
 'garagecond',
 'poolqc',
 'fence',
 'miscfeature']

In [26]:
# dropping all the missing value columns:
train_x_full.drop(columns_with_NaN, axis='columns', inplace=True)
valid_x_full.drop(columns_with_NaN, axis='columns', inplace=True)

# checking the shape of train_x_full after dropping the columns:
train_x_full.shape, valid_x_full.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


((1168, 60), (292, 60))

### Only `60` columns left from the `79` predictors.

In [27]:
# step 4: numerical and categorical columns:

# first let's see the remaining columns' types:
train_x_full.dtypes

mssubclass        int64
mszoning         object
lotarea           int64
street           object
lotshape         object
landcontour      object
utilities        object
lotconfig        object
landslope        object
neighborhood     object
condition1       object
condition2       object
bldgtype         object
housestyle       object
overallqual       int64
overallcond       int64
yearbuilt         int64
yearremodadd      int64
roofstyle        object
roofmatl         object
exterior1st      object
exterior2nd      object
exterqual        object
extercond        object
foundation       object
bsmtfinsf1        int64
bsmtfinsf2        int64
bsmtunfsf         int64
totalbsmtsf       int64
heating          object
heatingqc        object
centralair       object
1stflrsf          int64
2ndflrsf          int64
lowqualfinsf      int64
grlivarea         int64
bsmtfullbath      int64
bsmthalfbath      int64
fullbath          int64
halfbath          int64
bedroomabvgr      int64
kitchenabvgr    

In [32]:
[f'{col:15}{train_x_full[col].dtype}' for col in train_x_full.columns if train_x_full[col].dtype in ['int64', 'float64']]

['mssubclass     int64',
 'lotarea        int64',
 'overallqual    int64',
 'overallcond    int64',
 'yearbuilt      int64',
 'yearremodadd   int64',
 'bsmtfinsf1     int64',
 'bsmtfinsf2     int64',
 'bsmtunfsf      int64',
 'totalbsmtsf    int64',
 '1stflrsf       int64',
 '2ndflrsf       int64',
 'lowqualfinsf   int64',
 'grlivarea      int64',
 'bsmtfullbath   int64',
 'bsmthalfbath   int64',
 'fullbath       int64',
 'halfbath       int64',
 'bedroomabvgr   int64',
 'kitchenabvgr   int64',
 'totrmsabvgrd   int64',
 'fireplaces     int64',
 'garagecars     int64',
 'garagearea     int64',
 'wooddecksf     int64',
 'openporchsf    int64',
 'enclosedporch  int64',
 '3ssnporch      int64',
 'screenporch    int64',
 'poolarea       int64',
 'miscval        int64',
 'mosold         int64',
 'yrsold         int64']

In [34]:
[f'{col:15}{train_x_full[col].nunique()}' for col in train_x_full.columns if train_x_full[col].dtype == 'object']

['mszoning       5',
 'street         2',
 'lotshape       4',
 'landcontour    4',
 'utilities      2',
 'lotconfig      5',
 'landslope      3',
 'neighborhood   25',
 'condition1     9',
 'condition2     8',
 'bldgtype       5',
 'housestyle     8',
 'roofstyle      6',
 'roofmatl       7',
 'exterior1st    15',
 'exterior2nd    16',
 'exterqual      4',
 'extercond      5',
 'foundation     6',
 'heating        6',
 'heatingqc      5',
 'centralair     2',
 'kitchenqual    4',
 'functional     7',
 'paveddrive     3',
 'saletype       9',
 'salecondition  6']

In [35]:
numerical_columns = [col for col in train_x_full.columns if train_x_full[col].dtype in ['int64', 'float64']]
numerical_columns

['mssubclass',
 'lotarea',
 'overallqual',
 'overallcond',
 'yearbuilt',
 'yearremodadd',
 'bsmtfinsf1',
 'bsmtfinsf2',
 'bsmtunfsf',
 'totalbsmtsf',
 '1stflrsf',
 '2ndflrsf',
 'lowqualfinsf',
 'grlivarea',
 'bsmtfullbath',
 'bsmthalfbath',
 'fullbath',
 'halfbath',
 'bedroomabvgr',
 'kitchenabvgr',
 'totrmsabvgrd',
 'fireplaces',
 'garagecars',
 'garagearea',
 'wooddecksf',
 'openporchsf',
 'enclosedporch',
 '3ssnporch',
 'screenporch',
 'poolarea',
 'miscval',
 'mosold',
 'yrsold']

In [36]:
low_cardinality_columns = [col for col in train_x_full.columns if train_x_full[col].dtype == 'object' and train_x_full[col].nunique() < 10]
low_cardinality_columns

['mszoning',
 'street',
 'lotshape',
 'landcontour',
 'utilities',
 'lotconfig',
 'landslope',
 'condition1',
 'condition2',
 'bldgtype',
 'housestyle',
 'roofstyle',
 'roofmatl',
 'exterqual',
 'extercond',
 'foundation',
 'heating',
 'heatingqc',
 'centralair',
 'kitchenqual',
 'functional',
 'paveddrive',
 'saletype',
 'salecondition']

In [38]:
selected_columns = low_cardinality_columns + numerical_columns
selected_columns

['mszoning',
 'street',
 'lotshape',
 'landcontour',
 'utilities',
 'lotconfig',
 'landslope',
 'condition1',
 'condition2',
 'bldgtype',
 'housestyle',
 'roofstyle',
 'roofmatl',
 'exterqual',
 'extercond',
 'foundation',
 'heating',
 'heatingqc',
 'centralair',
 'kitchenqual',
 'functional',
 'paveddrive',
 'saletype',
 'salecondition',
 'mssubclass',
 'lotarea',
 'overallqual',
 'overallcond',
 'yearbuilt',
 'yearremodadd',
 'bsmtfinsf1',
 'bsmtfinsf2',
 'bsmtunfsf',
 'totalbsmtsf',
 '1stflrsf',
 '2ndflrsf',
 'lowqualfinsf',
 'grlivarea',
 'bsmtfullbath',
 'bsmthalfbath',
 'fullbath',
 'halfbath',
 'bedroomabvgr',
 'kitchenabvgr',
 'totrmsabvgrd',
 'fireplaces',
 'garagecars',
 'garagearea',
 'wooddecksf',
 'openporchsf',
 'enclosedporch',
 '3ssnporch',
 'screenporch',
 'poolarea',
 'miscval',
 'mosold',
 'yrsold']

In [40]:
# step 5: keep only the selected columns
train_x = train_x_full[selected_columns].copy()
valid_x = valid_x_full[selected_columns].copy()

train_x.shape, valid_x.shape

((1168, 57), (292, 57))

In [41]:
train_x.head(3)

Unnamed: 0_level_0,mszoning,street,lotshape,landcontour,utilities,lotconfig,landslope,condition1,condition2,bldgtype,housestyle,roofstyle,roofmatl,exterqual,extercond,foundation,heating,heatingqc,centralair,kitchenqual,functional,paveddrive,saletype,salecondition,mssubclass,lotarea,overallqual,overallcond,yearbuilt,yearremodadd,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,totrmsabvgrd,fireplaces,garagecars,garagearea,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
255,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,1Story,Gable,CompShg,TA,Gd,CBlock,GasA,TA,Y,TA,Typ,Y,WD,Normal,20,8400,5,6,1957,1957,922,0,392,1314,1314,0,0,1314,1,0,1,0,3,1,5,0,1,294,250,0,0,0,0,0,0,6,2010
1067,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,2Story,Gable,CompShg,Gd,TA,PConc,GasA,Gd,Y,TA,Typ,Y,WD,Normal,60,7837,6,7,1993,1994,0,0,799,799,799,772,0,1571,0,0,2,1,3,1,7,1,2,380,0,40,0,0,0,0,0,5,2009
639,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,Feedr,Norm,1Fam,1Story,Gable,CompShg,TA,TA,CBlock,GasA,Gd,Y,TA,Typ,P,WD,Normal,30,8777,5,7,1910,1950,0,0,796,796,796,0,0,796,0,0,1,0,2,1,4,0,0,0,328,0,164,0,0,0,0,5,2008


In [42]:
condition_obj = train_x.dtypes == 'object'
train_x.dtypes[condition_obj]

mszoning         object
street           object
lotshape         object
landcontour      object
utilities        object
lotconfig        object
landslope        object
condition1       object
condition2       object
bldgtype         object
housestyle       object
roofstyle        object
roofmatl         object
exterqual        object
extercond        object
foundation       object
heating          object
heatingqc        object
centralair       object
kitchenqual      object
functional       object
paveddrive       object
saletype         object
salecondition    object
dtype: object

In [43]:
categorical_columns = list(train_x.dtypes[condition_obj].index)
categorical_columns

['mszoning',
 'street',
 'lotshape',
 'landcontour',
 'utilities',
 'lotconfig',
 'landslope',
 'condition1',
 'condition2',
 'bldgtype',
 'housestyle',
 'roofstyle',
 'roofmatl',
 'exterqual',
 'extercond',
 'foundation',
 'heating',
 'heatingqc',
 'centralair',
 'kitchenqual',
 'functional',
 'paveddrive',
 'saletype',
 'salecondition']

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(train_x, valid_x, train_y, valid_y):
    """Return the MAE of the dataset.
    """
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_x, train_y)
    preds = model.predict(valid_x)
    return mean_absolute_error(y_true=valid_y, y_pred=preds)


### Approach 1: Simply drop the `categorical` columns.

In [47]:
drop_train_x = train_x.select_dtypes(exclude=['object'])
drop_valid_x = valid_x.select_dtypes(exclude=['object'])

score_dataset(train_x=drop_train_x, valid_x=drop_valid_x,
              train_y=train_y, valid_y=valid_y)

17877.492979452054

### Approach 2: Label Encoding using `LabelEncoder`.

In [49]:
from sklearn.preprocessing import LabelEncoder

label_train_x = train_x.copy()
label_valid_x = valid_x.copy()

label_encoder = LabelEncoder()
for col in categorical_columns:
    label_train_x[col] = label_encoder.fit_transform(train_x[col])
    label_valid_x[col] = label_encoder.transform(valid_x[col])

label_train_x.head(3)

ValueError: ignored