<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/machine_learning_dropping_or_imputing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install tree

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 7 not upgraded.
Need to get 40.7 kB of archives.
After this operation, 105 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]
Fetched 40.7 kB in 0s (138 kB/s)
Selecting previously unselected package tree.
(Reading database ... 135004 files and directories currently installed.)
Preparing to unpack .../tree_1.7.0-5_amd64.deb ...
Unpacking tree (1.7.0-5) ...
Setting up tree (1.7.0-5) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [0]:
import os

import numpy as np
import pandas as pd

from getpass import getpass 

In [3]:
def access_kaggle():
    """
    Access Kaggle from Google Colab.
    If the /root/.kaggle does not exist then prompt for
    the username and for the Kaggle API key.
    Creates the kaggle.json access file in the /root/.kaggle/ folder. 
    """
    KAGGLE_ROOT = os.path.join('/root', '.kaggle')
    KAGGLE_PATH = os.path.join(KAGGLE_ROOT, 'kaggle.json')

    if '.kaggle' not in os.listdir(path='/root'):
        user = getpass(prompt='Kaggle username: ')
        key  = getpass(prompt='Kaggle API key: ')
        
        !mkdir $KAGGLE_ROOT
        !touch $KAGGLE_PATH
        !chmod 666 $KAGGLE_PATH
        with open(KAGGLE_PATH, mode='w') as f:
            f.write('{"username":"%s", "key":"%s"}' %(user, key))
            f.close()
        !chmod 600 $KAGGLE_PATH
        del user
        del key
        success_msg = "Kaggle is successfully set up. Good to go."
        print(f'{success_msg}')

access_kaggle()


Kaggle username: ··········
Kaggle API key: ··········
Kaggle is successfully set up. Good to go.


In [4]:
!kaggle datasets download gunjanpathak/melb-data --unzip
!tree -sh ./
!cat -n melb_data.csv|head -2
df = pd.read_csv('melb_data.csv', sep=',', index_col=0)

df.columns = df.columns.map(lambda c: c.lower())
df.columns

Downloading melb-data.zip to /content
  0% 0.00/614k [00:00<?, ?B/s]
100% 614k/614k [00:00<00:00, 56.6MB/s]
./
├── [2.7M]  melb_data.csv
└── [4.0K]  sample_data
    ├── [1.7K]  anscombe.json
    ├── [294K]  california_housing_test.csv
    ├── [1.6M]  california_housing_train.csv
    ├── [ 17M]  mnist_test.csv
    ├── [ 35M]  mnist_train_small.csv
    └── [ 930]  README.md

1 directory, 7 files
     1	,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
     2	1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0


Index(['suburb', 'address', 'rooms', 'type', 'price', 'method', 'sellerg',
       'date', 'distance', 'postcode', 'bedroom2', 'bathroom', 'car',
       'landsize', 'buildingarea', 'yearbuilt', 'councilarea', 'lattitude',
       'longtitude', 'regionname', 'propertycount'],
      dtype='object')

In [5]:
df.isnull().sum()

suburb               0
address              0
rooms                0
type                 0
price                0
method               0
sellerg              0
date                 0
distance             1
postcode             1
bedroom2          3469
bathroom          3471
car               3576
landsize          4793
buildingarea     10634
yearbuilt         9438
councilarea       6163
lattitude         3332
longtitude        3332
regionname           1
propertycount        1
dtype: int64

In [6]:
df.dtypes

suburb            object
address           object
rooms              int64
type              object
price            float64
method            object
sellerg           object
date              object
distance         float64
postcode         float64
bedroom2         float64
bathroom         float64
car              float64
landsize         float64
buildingarea     float64
yearbuilt        float64
councilarea       object
lattitude        float64
longtitude       float64
regionname        object
propertycount    float64
dtype: object

In [0]:
from sklearn.model_selection import train_test_split


In [0]:
y = df.price


In [0]:
predictors = df.drop(['price'], axis='columns')

X = predictors.select_dtypes(exclude=['object'])

In [12]:
X.columns

Index(['rooms', 'distance', 'postcode', 'bedroom2', 'bathroom', 'car',
       'landsize', 'buildingarea', 'yearbuilt', 'lattitude', 'longtitude',
       'propertycount'],
      dtype='object')

In [14]:
X.dtypes

rooms              int64
distance         float64
postcode         float64
bedroom2         float64
bathroom         float64
car              float64
landsize         float64
buildingarea     float64
yearbuilt        float64
lattitude        float64
longtitude       float64
propertycount    float64
dtype: object

In [0]:
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2,
                                                      random_state=0)

In [0]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(train_x, valid_x, train_y, valid_y):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(train_x, train_y)
    preds = model.predict(valid_x)
    return mean_absolute_error(y_true=valid_y, y_pred=preds)
    

In [21]:
[cols for cols in train_x.columns if not train_x[cols].isnull().any()]

['rooms']

In [22]:
[cols for cols in train_x.columns if train_x[cols].isnull().any()]

['distance',
 'postcode',
 'bedroom2',
 'bathroom',
 'car',
 'landsize',
 'buildingarea',
 'yearbuilt',
 'lattitude',
 'longtitude',
 'propertycount']

In [23]:
columns_with_NaN = [cols for cols in train_x.columns if train_x[cols].isnull().any()]
columns_with_NaN

['distance',
 'postcode',
 'bedroom2',
 'bathroom',
 'car',
 'landsize',
 'buildingarea',
 'yearbuilt',
 'lattitude',
 'longtitude',
 'propertycount']

In [24]:
train_x.drop(columns_with_NaN, axis='columns')

Unnamed: 0,rooms
3349,4
2686,3
6065,2
11346,3
13474,2
...,...
11849,2
16889,2
12649,3
13887,3


In [25]:
reduced_train_x = train_x.drop(columns_with_NaN, axis='columns')
reduced_valid_x = valid_x.drop(columns_with_NaN, axis='columns')
print(f'{reduced_train_x.head(2)}')
print(f'{reduced_valid_x.head(2)}')


      rooms
3349      4
2686      3
       rooms
3932       1
11811      2


In [26]:
score_dataset(reduced_train_x, reduced_valid_x, train_y, valid_y)

389006.8120658193

In [0]:
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()

In [29]:
train_x.head(2)

Unnamed: 0,rooms,distance,postcode,bedroom2,bathroom,car,landsize,buildingarea,yearbuilt,lattitude,longtitude,propertycount
3349,4,7.8,3058.0,4.0,2.0,1.0,381.0,,1938.0,-37.7337,144.9548,11204.0
2686,3,7.8,3124.0,3.0,1.0,1.0,544.0,160.0,1930.0,-37.8436,145.0581,8920.0


In [0]:
imputed_train_x = pd.DataFrame(my_imputer.fit_transform(train_x))
imputed_valid_x = pd.DataFrame(my_imputer.fit_transform(valid_x))

In [31]:
imputed_train_x.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,4.0,7.8,3058.0,4.0,2.0,1.0,381.0,152.120627,1938.0,-37.7337,144.9548,11204.0
1,3.0,7.8,3124.0,3.0,1.0,1.0,544.0,160.0,1930.0,-37.8436,145.0581,8920.0


In [32]:
train_x.columns

Index(['rooms', 'distance', 'postcode', 'bedroom2', 'bathroom', 'car',
       'landsize', 'buildingarea', 'yearbuilt', 'lattitude', 'longtitude',
       'propertycount'],
      dtype='object')

In [33]:
imputed_train_x.columns

RangeIndex(start=0, stop=12, step=1)

In [0]:
imputed_train_x.columns = train_x.columns
imputed_valid_x.columns = valid_x.columns

In [35]:
score_dataset(imputed_train_x, imputed_valid_x, train_y, valid_y)

204890.48541963196

In [0]:
train_x_plus = train_x.copy(deep=True)
valid_x_plus = valid_x.copy(deep=True)

In [38]:
for col in columns_with_NaN:
    print(col + '_was_missing')

distance_was_missing
postcode_was_missing
bedroom2_was_missing
bathroom_was_missing
car_was_missing
landsize_was_missing
buildingarea_was_missing
yearbuilt_was_missing
lattitude_was_missing
longtitude_was_missing
propertycount_was_missing


In [41]:
for col in columns_with_NaN:
    print(train_x_plus[col].isnull()[:2])

3349    False
2686    False
Name: distance, dtype: bool
3349    False
2686    False
Name: postcode, dtype: bool
3349    False
2686    False
Name: bedroom2, dtype: bool
3349    False
2686    False
Name: bathroom, dtype: bool
3349    False
2686    False
Name: car, dtype: bool
3349    False
2686    False
Name: landsize, dtype: bool
3349     True
2686    False
Name: buildingarea, dtype: bool
3349    False
2686    False
Name: yearbuilt, dtype: bool
3349    False
2686    False
Name: lattitude, dtype: bool
3349    False
2686    False
Name: longtitude, dtype: bool
3349    False
2686    False
Name: propertycount, dtype: bool


In [42]:
for col in columns_with_NaN:
    train_x_plus[col + '_was missing'] = train_x_plus[col].isnull()

train_x_plus.head(3)

Unnamed: 0,rooms,distance,postcode,bedroom2,bathroom,car,landsize,buildingarea,yearbuilt,lattitude,longtitude,propertycount,distance_was missing,postcode_was missing,bedroom2_was missing,bathroom_was missing,car_was missing,landsize_was missing,buildingarea_was missing,yearbuilt_was missing,lattitude_was missing,longtitude_was missing,propertycount_was missing
3349,4,7.8,3058.0,4.0,2.0,1.0,381.0,,1938.0,-37.7337,144.9548,11204.0,False,False,False,False,False,False,True,False,False,False,False
2686,3,7.8,3124.0,3.0,1.0,1.0,544.0,160.0,1930.0,-37.8436,145.0581,8920.0,False,False,False,False,False,False,False,False,False,False,False
6065,2,5.6,3101.0,2.0,1.0,1.0,121.0,,,-37.8126,145.0534,10331.0,False,False,False,False,False,False,True,True,False,False,False


In [43]:
for col in columns_with_NaN:
    valid_x_plus[col + '_was missing'] = valid_x_plus[col].isnull()

valid_x_plus.head(3)

Unnamed: 0,rooms,distance,postcode,bedroom2,bathroom,car,landsize,buildingarea,yearbuilt,lattitude,longtitude,propertycount,distance_was missing,postcode_was missing,bedroom2_was missing,bathroom_was missing,car_was missing,landsize_was missing,buildingarea_was missing,yearbuilt_was missing,lattitude_was missing,longtitude_was missing,propertycount_was missing
3932,1,7.7,3184.0,1.0,1.0,1.0,0.0,42.0,1970.0,-37.8784,144.9864,8989.0,False,False,False,False,False,False,False,False,False,False,False
11811,2,8.2,3012.0,2.0,1.0,1.0,308.0,,,-37.7908,144.8717,5058.0,False,False,False,False,False,False,True,True,False,False,False
6615,5,8.7,3032.0,5.0,3.0,2.0,98.0,,,-37.7843,144.8939,4918.0,False,False,False,False,False,False,True,True,False,False,False


In [0]:
my_imputer = SimpleImputer()

imputed_train_x_plus = pd.DataFrame(my_imputer.fit_transform(train_x_plus))
imputed_valid_x_plus = pd.DataFrame(my_imputer.transform(valid_x_plus))

In [46]:
imputed_train_x_plus.columns

RangeIndex(start=0, stop=23, step=1)

In [47]:
imputed_train_x_plus.columns = train_x_plus.columns
imputed_valid_x_plus.columns = valid_x_plus.columns

score_dataset(train_x=imputed_train_x_plus,
              valid_x=imputed_valid_x_plus,
              train_y=train_y, valid_y=valid_y)

198846.25410329842

In [48]:
train_x.shape

(14716, 12)