In [1]:
import pandas as pd
import numpy as np

## Loading data

In [2]:
data = pd.read_csv('housing.csv')

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
data = data.fillna(value = 124)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


## Approach 1: Manual Splitting with slicing

In [7]:
# input
x = data.drop('median_house_value', axis = 1)

In [8]:
x.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


In [9]:
x.shape

(20640, 9)

In [13]:
y = data['median_house_value']

In [14]:
y.shape

(20640,)

#### Further splitting the input and output data to training and testing part

In [15]:
x_train = x.loc[0:20000, :]
x_test = x.loc[20000:, ]

y_train = y.loc[0:20000]
y_test = y.loc[20000:]

In [16]:
print(x_train.shape)

print(y_train.shape)
print(x_test.shape)

print(y_test.shape)

(20001, 9)
(20001,)
(640, 9)
(640,)


In [24]:
x.shape

(20640, 9)

## Splitting the data using sklearn

In [27]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 23)

In [26]:
x_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
5817,-118.24,34.13,45.0,1971.0,439.0,1245.0,430.0,4.0272,<1H OCEAN
16856,-122.41,37.62,39.0,3119.0,758.0,1807.0,696.0,3.2216,NEAR OCEAN
5494,-118.48,33.96,16.0,895.0,181.0,237.0,149.0,12.0088,<1H OCEAN
17053,-122.24,37.47,40.0,1504.0,270.0,689.0,287.0,6.1244,NEAR OCEAN
15836,-122.44,37.75,46.0,1519.0,291.0,573.0,289.0,4.2667,NEAR BAY
...,...,...,...,...,...,...,...,...,...
6175,-117.93,34.08,36.0,1597.0,285.0,901.0,272.0,4.3947,<1H OCEAN
9704,-121.64,36.66,24.0,3174.0,506.0,1466.0,535.0,5.2285,<1H OCEAN
11190,-117.94,33.82,29.0,1422.0,409.0,1057.0,390.0,2.3347,<1H OCEAN
9256,-120.05,36.96,37.0,1000.0,261.0,1092.0,233.0,1.4267,INLAND


In [20]:
x_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
16187,-121.30,37.96,24.0,1212.0,366.0,1202.0,343.0,1.7875,INLAND
1477,-122.04,37.97,21.0,6445.0,1839.0,3621.0,1735.0,2.5841,NEAR BAY
12721,-121.37,38.58,37.0,2839.0,390.0,1006.0,400.0,7.3343,INLAND
14741,-117.06,32.58,17.0,2724.0,567.0,2213.0,554.0,3.8529,NEAR OCEAN
12097,-117.28,33.89,33.0,6982.0,1371.0,5650.0,1195.0,2.5379,INLAND
...,...,...,...,...,...,...,...,...,...
15740,-122.43,37.78,24.0,2037.0,696.0,1371.0,585.0,0.9355,NEAR BAY
16222,-121.35,37.96,21.0,1343.0,183.0,462.0,193.0,5.8995,INLAND
7904,-118.06,33.85,16.0,4851.0,726.0,2527.0,704.0,6.0142,<1H OCEAN
19206,-122.72,38.47,29.0,1706.0,415.0,990.0,394.0,1.9932,<1H OCEAN


In [21]:
print(x_train.shape)

print(y_train.shape)

print(x_test.shape)

print(y_test.shape)

(16512, 9)
(16512,)
(4128, 9)
(4128,)
