# Discretization
We often need to convert a continuous attribute into a small number of intervals (bins), either to reduce the amount of data or to smooth out some of its variance. This process is called discretization.

## Setup

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the California housing training CSV into `df` and preview the first rows.
csv_path = '/content/sample_data/california_housing_train.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [3]:
# Summary statistics per column; the quartiles and mean shown here are the
# source of the cut-off values used in the discretization cells below.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Discretize population

In [8]:
# Label each row by whether its population reaches the dataset mean.
# The mean is computed from the data instead of being pasted in from the
# describe() output, so the cell stays correct if the data changes.
population_mean = df['population'].mean()

# A two-way split needs only np.where. It also avoids np.select's implicit
# integer default (0), which cannot be combined with string labels on recent
# NumPy releases. Rows with a missing population compare False and are
# labelled 'not popular'.
df['popular'] = np.where(df['population'] >= population_mean, 'popular', 'not popular')
df['popular']

0        not popular
1        not popular
2        not popular
3        not popular
4        not popular
            ...     
16995    not popular
16996    not popular
16997    not popular
16998    not popular
16999    not popular
Name: popular, Length: 17000, dtype: object

In [9]:
# Number of rows carrying each population label.
df['popular'].value_counts()

not popular    10862
popular         6138
Name: popular, dtype: int64

## Discretize rooms

In [13]:
# Each rule combines a total_rooms test with a total_bedrooms test; the first
# letter of the label refers to rooms, the second to bedrooms (L = low, H = high).
# The cut-offs come from df.describe(): LL uses the 25th percentile, HH the
# 75th percentile, and LH / HL the median. Because the rules use different
# quartiles they do not cover every row.
conditions = [
    (df['total_rooms'] < 1462) & (df['total_bedrooms'] < 297),   # LL
    (df['total_rooms'] > 3151) & (df['total_bedrooms'] > 648),   # HH
    (df['total_rooms'] < 2127) & (df['total_bedrooms'] > 434),   # LH
    (df['total_rooms'] > 2127) & (df['total_bedrooms'] < 434),   # HL
]

values = ['LL', 'HH', 'LH', 'HL']

# Rows matching none of the rules receive `default`. np.select's implicit
# default is the integer 0, which has no common dtype with the string labels
# and raises a TypeError on recent NumPy releases. Passing the string '0'
# explicitly keeps the same catch-all label while staying type-consistent.
df['rooms'] = np.select(conditions, values, default='0')
df['rooms']

0        HH
1        HH
2        LL
3         0
4         0
         ..
16995    HL
16996     0
16997     0
16998     0
16999     0
Name: rooms, Length: 17000, dtype: object

In [14]:
# Number of rows per rooms label; '0' is the catch-all for rows that matched
# none of the four rules.
df['rooms'].value_counts()

0     7970
LL    3424
HH    3394
HL    1110
LH    1102
Name: rooms, dtype: int64

## Discretize house value

In [4]:
def house_value(value, low=119400, high=265000):
    """Bucket a house value into "Low", "Medium" or "High".

    Args:
        value: Numeric house value to classify.
        low: Values strictly below this threshold are "Low". The default
            matches the 25th percentile of ``median_house_value`` shown by
            ``df.describe()``.
        high: Values strictly above this threshold are "High". The default
            matches the 75th percentile of ``median_house_value`` shown by
            ``df.describe()``.

    Returns:
        "Low" if ``value < low``, "High" if ``value > high``, otherwise
        "Medium". Both thresholds themselves count as "Medium".
    """
    if value < low:
        return "Low"
    if value > high:
        return "High"
    # Reached for low <= value <= high. NaN also lands here, because NaN
    # compares False against both thresholds.
    return "Medium"

In [5]:
# Classify every median house value element-wise with house_value and keep
# the labels in a new column.
value_labels = df['median_house_value'].map(house_value)
df['house_value_category'] = value_labels
df['house_value_category']

0        Low
1        Low
2        Low
3        Low
4        Low
        ... 
16995    Low
16996    Low
16997    Low
16998    Low
16999    Low
Name: house_value_category, Length: 17000, dtype: object

In [6]:
# Number of rows in each house-value category.
df['house_value_category'].value_counts()

Medium    8510
High      4247
Low       4243
Name: house_value_category, dtype: int64