<a href="https://colab.research.google.com/github/dasarisasidhar/Data_analysis/blob/master/data_splitting__training_testing_validation_set_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

In [4]:
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")
california_housing_dataframe.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [0]:
def preprocess_features(california_housing_dataframe):
  """Prepares input features from California housing data set.

    Args:
      california_housing_dataframe: A Pandas DataFrame expected to contain data
        from the California housing data set.
    Returns:
      A DataFrame that contains the features to be used for the model, including
      synthetic features.
  """
  selected_features = california_housing_dataframe[
    ["latitude",
     "longitude",
     "housing_median_age",
     "total_rooms",
     "total_bedrooms",
     "population",
     "households",
     "median_income"]]
  processed_features = selected_features.copy()
  # Create a synthetic feature.
  processed_features["rooms_per_person"] = (
    california_housing_dataframe["total_rooms"] /
    california_housing_dataframe["population"])
  return processed_features

def preprocess_targets(california_housing_dataframe):
  """Prepares target features (i.e., labels) from California housing data set.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A DataFrame that contains the target feature.
  """
  output_targets = pd.DataFrame()
  # Scale the target to be in units of thousands of dollars.
  output_targets["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0)
  return output_targets

In [8]:
training_examples = preprocess_features(california_housing_dataframe.head(12000))
training_examples.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,34.614578,-118.470274,27.468333,2655.682333,547.057167,1476.007,505.38425,3.795047,1.940185
std,1.62597,1.243589,12.06179,2258.147574,434.314754,1174.280904,391.71534,1.851925,1.327142
min,32.54,-121.39,1.0,2.0,2.0,3.0,2.0,0.4999,0.018065
25%,33.82,-118.94,17.0,1451.75,299.0,815.0,283.0,2.5172,1.420007
50%,34.05,-118.21,28.0,2113.5,438.0,1207.0,411.0,3.46225,1.880875
75%,34.44,-117.79,36.0,3146.0,653.0,1777.0,606.0,4.644625,2.25883
max,41.82,-114.31,52.0,37937.0,5471.0,35682.0,5189.0,15.0001,55.222222


In [9]:
training_targets = preprocess_targets(california_housing_dataframe.head(12000))
training_targets.describe()

Unnamed: 0,median_house_value
count,12000.0
mean,198.037593
std,111.857499
min,14.999
25%,117.1
50%,170.5
75%,244.4
max,500.001


In [10]:
validation_examples = preprocess_features(california_housing_dataframe.tail(5000))
validation_examples.describe()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_person
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,38.050778,-122.18251,31.2798,2614.8214,521.0596,1318.1346,491.2324,4.096053,2.078781
std,0.92303,0.480337,13.38939,1979.620397,388.452096,1073.74575,366.523912,2.021218,0.638113
min,36.14,-124.35,1.0,8.0,1.0,8.0,1.0,0.4999,0.135721
25%,37.49,-122.4,20.0,1481.0,292.0,731.0,278.0,2.6909,1.74919
50%,37.79,-122.14,31.0,2164.0,424.0,1074.0,403.0,3.72845,2.066278
75%,38.37,-121.91,42.0,3161.25,635.0,1590.25,603.0,5.0641,2.375372
max,41.95,-121.39,52.0,32627.0,6445.0,28566.0,6082.0,15.0001,18.255319


In [11]:
validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))
validation_targets.describe()

Unnamed: 0,median_house_value
count,5000.0
mean,229.532879
std,122.520063
min,14.999
25%,130.4
50%,213.0
75%,303.15
max,500.001
