<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/machine_learning_pipeline_sandbox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Quietly (-qq) install the `tree` command-line utility; a later cell runs
# `!tree -sh ./` to list the downloaded dataset files.
!apt-get -qq install tree

Selecting previously unselected package tree.
(Reading database ... 135004 files and directories currently installed.)
Preparing to unpack .../tree_1.7.0-5_amd64.deb ...
Unpacking tree (1.7.0-5) ...
Setting up tree (1.7.0-5) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [0]:
import json
import os
from getpass import getpass

import numpy as np
import pandas as pd

In [3]:
def access_kaggle(kaggle_root=None):
    """
    Make Kaggle API credentials available from Google Colab.

    If ``<kaggle_root>/kaggle.json`` does not exist yet, prompt (without
    echoing) for the Kaggle username and API key and write them to that
    file as JSON. If the file already exists, nothing is done.

    Parameters
    ----------
    kaggle_root : str or None, optional
        Directory that should hold ``kaggle.json``. Defaults to
        ``/root/.kaggle``, the location this notebook has always used.

    Returns
    -------
    None

    Notes
    -----
    * The file is created with mode 0600 from the start, so the secret is
      never readable by other users, not even briefly.
    * The payload is produced with ``json.dump`` so quotes or backslashes
      in the username/key cannot corrupt the file.
    * The existence check is on the credentials *file*, not the directory,
      so a previous run that created the directory but failed before
      writing the file can be retried.
    """
    if kaggle_root is None:
        kaggle_root = os.path.join('/root', '.kaggle')
    kaggle_path = os.path.join(kaggle_root, 'kaggle.json')

    # Already configured: do not prompt again.
    if os.path.isfile(kaggle_path):
        return

    user = getpass(prompt='Kaggle username: ')
    key = getpass(prompt='Kaggle API key: ')

    os.makedirs(kaggle_root, mode=0o700, exist_ok=True)

    # Create the file owner-read/write only *before* any secret is written.
    fd = os.open(kaggle_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, mode='w') as f:
        json.dump({'username': user, 'key': key}, f)
    # Enforce the exact mode regardless of umask or a pre-existing file.
    os.chmod(kaggle_path, 0o600)

    # Drop the local references to the secrets as soon as they are persisted.
    del user, key

    print('Kaggle is successfully set up. Good to go.')

access_kaggle()


Kaggle username: ··········
Kaggle API key: ··········
Kaggle is successfully set up. Good to go.


In [4]:
!kaggle competitions download -c home-data-for-ml-course -p datasets/ml-course
!tree -sh ./
!cat -n datasets/ml-course/train.csv|head -2
df = pd.read_csv('datasets/ml-course/train.csv', sep=',', index_col=0)

df.columns = df.columns.map(lambda c: c.lower())
df.columns

Downloading test.csv.gz to datasets/ml-course
  0% 0.00/82.0k [00:00<?, ?B/s]
100% 82.0k/82.0k [00:00<00:00, 42.4MB/s]
Downloading train.csv to datasets/ml-course
  0% 0.00/450k [00:00<?, ?B/s]
100% 450k/450k [00:00<00:00, 53.9MB/s]
Downloading data_description.txt to datasets/ml-course
  0% 0.00/13.1k [00:00<?, ?B/s]
100% 13.1k/13.1k [00:00<00:00, 11.9MB/s]
Downloading sample_submission.csv.gz to datasets/ml-course
  0% 0.00/15.3k [00:00<?, ?B/s]
100% 15.3k/15.3k [00:00<00:00, 13.3MB/s]
Downloading sample_submission.csv to datasets/ml-course
  0% 0.00/31.2k [00:00<?, ?B/s]
100% 31.2k/31.2k [00:00<00:00, 30.3MB/s]
Downloading test.csv to datasets/ml-course
  0% 0.00/441k [00:00<?, ?B/s]
100% 441k/441k [00:00<00:00, 61.5MB/s]
Downloading train.csv.gz to datasets/ml-course
  0% 0.00/89.2k [00:00<?, ?B/s]
100% 89.2k/89.2k [00:00<00:00, 92.9MB/s]
./
├── [4.0K]  datasets
│   └── [4.0K]  ml-course
│       ├── [ 13K]  data_description.txt
│       ├── [ 31K]  sample_submission.csv
│       ├── 

Index(['mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street', 'alley',
       'lotshape', 'landcontour', 'utilities', 'lotconfig', 'landslope',
       'neighborhood', 'condition1', 'condition2', 'bldgtype', 'housestyle',
       'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd', 'roofstyle',
       'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype', 'masvnrarea',
       'exterqual', 'extercond', 'foundation', 'bsmtqual', 'bsmtcond',
       'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1', 'bsmtfintype2',
       'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating', 'heatingqc',
       'centralair', 'electrical', '1stflrsf', '2ndflrsf', 'lowqualfinsf',
       'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath', 'halfbath',
       'bedroomabvgr', 'kitchenabvgr', 'kitchenqual', 'totrmsabvgrd',
       'functional', 'fireplaces', 'fireplacequ', 'garagetype', 'garageyrblt',
       'garagefinish', 'garagecars', 'garagearea', 'garagequal', 'garagecond',
       'paveddrive', 'wo

In [5]:
df.shape

(1460, 80)

In [7]:
# Discard rows whose target (`saleprice`) is missing.
df = df.dropna(subset=['saleprice'])
df.shape

(1460, 80)

In [8]:
# Separate the feature matrix from the prediction target.
X = df.drop(columns=['saleprice'])
y = df['saleprice']

X.shape

(1460, 79)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_x_full, valid_x_full, train_y, valid_y = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

(train_x_full.shape, valid_x_full.shape)

((1168, 79), (292, 79))

In [13]:
# Object-typed columns with fewer than 10 distinct values in the training
# split are the ones treated as categorical.
categorical_columns = [
    name
    for name, values in train_x_full.items()
    if values.dtype == 'object' and values.nunique() < 10
]
categorical_columns

['mszoning',
 'street',
 'alley',
 'lotshape',
 'landcontour',
 'utilities',
 'lotconfig',
 'landslope',
 'condition1',
 'condition2',
 'bldgtype',
 'housestyle',
 'roofstyle',
 'roofmatl',
 'masvnrtype',
 'exterqual',
 'extercond',
 'foundation',
 'bsmtqual',
 'bsmtcond',
 'bsmtexposure',
 'bsmtfintype1',
 'bsmtfintype2',
 'heating',
 'heatingqc',
 'centralair',
 'electrical',
 'kitchenqual',
 'functional',
 'fireplacequ',
 'garagetype',
 'garagefinish',
 'garagequal',
 'garagecond',
 'paveddrive',
 'poolqc',
 'fence',
 'miscfeature',
 'saletype',
 'salecondition']

In [14]:
# Columns stored as int64 or float64 are the ones treated as numerical.
numerical_columns = [
    name
    for name, dtype in train_x_full.dtypes.items()
    if dtype in ('int64', 'float64')
]
numerical_columns

['mssubclass',
 'lotfrontage',
 'lotarea',
 'overallqual',
 'overallcond',
 'yearbuilt',
 'yearremodadd',
 'masvnrarea',
 'bsmtfinsf1',
 'bsmtfinsf2',
 'bsmtunfsf',
 'totalbsmtsf',
 '1stflrsf',
 '2ndflrsf',
 'lowqualfinsf',
 'grlivarea',
 'bsmtfullbath',
 'bsmthalfbath',
 'fullbath',
 'halfbath',
 'bedroomabvgr',
 'kitchenabvgr',
 'totrmsabvgrd',
 'fireplaces',
 'garageyrblt',
 'garagecars',
 'garagearea',
 'wooddecksf',
 'openporchsf',
 'enclosedporch',
 '3ssnporch',
 'screenporch',
 'poolarea',
 'miscval',
 'mosold',
 'yrsold']

In [15]:
# Keep only the chosen columns (categorical first, then numerical) and take
# independent copies of both splits.
selected_columns = [*categorical_columns, *numerical_columns]

train_x = train_x_full.loc[:, selected_columns].copy()
valid_x = valid_x_full.loc[:, selected_columns].copy()

(train_x.shape, valid_x.shape)

((1168, 76), (292, 76))

In [16]:
train_x.head(3)

Unnamed: 0_level_0,mszoning,street,alley,lotshape,landcontour,utilities,lotconfig,landslope,condition1,condition2,bldgtype,housestyle,roofstyle,roofmatl,masvnrtype,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfintype2,heating,heatingqc,centralair,electrical,kitchenqual,functional,fireplacequ,garagetype,garagefinish,garagequal,garagecond,paveddrive,poolqc,fence,miscfeature,saletype,salecondition,mssubclass,lotfrontage,lotarea,overallqual,overallcond,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,bsmtunfsf,totalbsmtsf,1stflrsf,2ndflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,totrmsabvgrd,fireplaces,garageyrblt,garagecars,garagearea,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
255,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,1Story,Gable,CompShg,,TA,Gd,CBlock,TA,TA,No,Rec,Unf,GasA,TA,Y,SBrkr,TA,Typ,,Attchd,RFn,TA,TA,Y,,,,WD,Normal,20,70.0,8400,5,6,1957,1957,0.0,922,0,392,1314,1314,0,0,1314,1,0,1,0,3,1,5,0,1957.0,1,294,250,0,0,0,0,0,0,6,2010
1067,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,1Fam,2Story,Gable,CompShg,,Gd,TA,PConc,Gd,TA,No,Unf,Unf,GasA,Gd,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal,60,59.0,7837,6,7,1993,1994,0.0,0,0,799,799,799,772,0,1571,0,0,2,1,3,1,7,1,1993.0,2,380,0,40,0,0,0,0,0,5,2009
639,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Feedr,Norm,1Fam,1Story,Gable,CompShg,,TA,TA,CBlock,Fa,TA,No,Unf,Unf,GasA,Gd,Y,FuseA,TA,Typ,,,,,,P,,MnPrv,,WD,Normal,30,67.0,8777,5,7,1910,1950,0.0,0,0,796,796,796,0,0,796,0,0,1,0,2,1,4,0,,0,0,328,0,164,0,0,0,0,5,2008


In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Numerical features: replace missing values with the constant 0.
# (Variable name kept as originally spelled so existing references still work.)
numercial_transformer = SimpleImputer(fill_value=0, strategy='constant')

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode with handle_unknown='ignore'.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numercial_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
])

model = RandomForestRegressor(random_state=42, n_estimators=100)

# Preprocessing and the regressor are bundled so both are fitted on the
# training split only.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])
clf.fit(train_x, train_y)

# Score on the held-out validation split.
preds = clf.predict(valid_x)
print(mean_absolute_error(valid_y, preds))

17678.294143835617
