<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/machine_learning_categorical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get -qq install tree

Selecting previously unselected package tree.
(Reading database ... 135004 files and directories currently installed.)
Preparing to unpack .../tree_1.7.0-5_amd64.deb ...
Unpacking tree (1.7.0-5) ...
Setting up tree (1.7.0-5) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [0]:
import os

import numpy as np
import pandas as pd

from getpass import getpass 

In [3]:
def access_kaggle():
    """
    Access Kaggle from Google Colab.
    If the /root/.kaggle does not exist then prompt for
    the username and for the Kaggle API key.
    Creates the kaggle.json access file in the /root/.kaggle/ folder. 
    """
    KAGGLE_ROOT = os.path.join('/root', '.kaggle')
    KAGGLE_PATH = os.path.join(KAGGLE_ROOT, 'kaggle.json')

    if '.kaggle' not in os.listdir(path='/root'):
        user = getpass(prompt='Kaggle username: ')
        key  = getpass(prompt='Kaggle API key: ')
        
        !mkdir $KAGGLE_ROOT
        !touch $KAGGLE_PATH
        !chmod 666 $KAGGLE_PATH
        with open(KAGGLE_PATH, mode='w') as f:
            f.write('{"username":"%s", "key":"%s"}' %(user, key))
            f.close()
        !chmod 600 $KAGGLE_PATH
        del user
        del key
        success_msg = "Kaggle is successfully set up. Good to go."
        print(f'{success_msg}')

access_kaggle()


Kaggle username: ··········
Kaggle API key: ··········
Kaggle is successfully set up. Good to go.


In [4]:
!kaggle datasets download gunjanpathak/melb-data --unzip
!tree -sh ./
!cat -n melb_data.csv|head -2
df = pd.read_csv('melb_data.csv', sep=',', index_col=0)

df.columns = df.columns.map(lambda c: c.lower())
df.columns

Downloading melb-data.zip to /content
  0% 0.00/614k [00:00<?, ?B/s]
100% 614k/614k [00:00<00:00, 41.4MB/s]
./
├── [2.7M]  melb_data.csv
└── [4.0K]  sample_data
    ├── [1.7K]  anscombe.json
    ├── [294K]  california_housing_test.csv
    ├── [1.6M]  california_housing_train.csv
    ├── [ 17M]  mnist_test.csv
    ├── [ 35M]  mnist_train_small.csv
    └── [ 930]  README.md

1 directory, 7 files
     1	,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
     2	1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0


Index(['suburb', 'address', 'rooms', 'type', 'price', 'method', 'sellerg',
       'date', 'distance', 'postcode', 'bedroom2', 'bathroom', 'car',
       'landsize', 'buildingarea', 'yearbuilt', 'councilarea', 'lattitude',
       'longtitude', 'regionname', 'propertycount'],
      dtype='object')

In [5]:
df.isnull().sum()

suburb               0
address              0
rooms                0
type                 0
price                0
method               0
sellerg              0
date                 0
distance             1
postcode             1
bedroom2          3469
bathroom          3471
car               3576
landsize          4793
buildingarea     10634
yearbuilt         9438
councilarea       6163
lattitude         3332
longtitude        3332
regionname           1
propertycount        1
dtype: int64

In [0]:
y = df.price

X = df.drop(['price'], axis='columns')

In [0]:
from sklearn.model_selection import train_test_split

train_x_full, valid_x_full, train_y, valid_y = train_test_split(X, y, test_size=0.2,
                                                                random_state=0)

In [10]:
len(X), len(train_x_full), len(valid_x_full)

(18396, 14716, 3680)

In [14]:
columns_with_NaN = [col for col in train_x_full.columns if train_x_full[col].isnull().any()]
columns_with_NaN

['distance',
 'postcode',
 'bedroom2',
 'bathroom',
 'car',
 'landsize',
 'buildingarea',
 'yearbuilt',
 'councilarea',
 'lattitude',
 'longtitude',
 'regionname',
 'propertycount']

In [13]:
train_x_full.isnull().sum()

suburb              0
address             0
rooms               0
type                0
method              0
sellerg             0
date                0
distance            1
postcode            1
bedroom2         2779
bathroom         2780
car              2860
landsize         3829
buildingarea     8516
yearbuilt        7528
councilarea      4900
lattitude        2675
longtitude       2675
regionname          1
propertycount       1
dtype: int64

In [15]:
# Dropping columns with NaN.

train_x_full.drop(columns_with_NaN, axis='columns', inplace=True)
valid_x_full.drop(columns_with_NaN, axis='columns', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
train_x_full.columns

Index(['suburb', 'address', 'rooms', 'type', 'method', 'sellerg', 'date'], dtype='object')

In [18]:
train_x_full.dtypes

suburb     object
address    object
rooms       int64
type       object
method     object
sellerg    object
date       object
dtype: object

In [19]:
numerical_columns = [col for col in train_x_full.columns if train_x_full[col].dtype in ['int64', 'float64']]
numerical_columns

['rooms']

In [20]:
[col for col in train_x_full.columns if train_x_full[col].dtype == 'object']

['suburb', 'address', 'type', 'method', 'sellerg', 'date']

In [21]:
[f'{col}:{train_x_full[col].nunique()}' for col in train_x_full.columns]

['suburb:327',
 'address:14547',
 'rooms:11',
 'type:3',
 'method:5',
 'sellerg:285',
 'date:58']

In [24]:
low_cardinality_columns = [col for col in train_x_full.columns if train_x_full[col].dtype == 'object' and train_x_full[col].nunique() < 10]
low_cardinality_columns

['type', 'method']