---

---

# **Data Collection**

## Objectives

* Fetch data from Kaggle and prepare it for further processes

## Inputs

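*   Kaggle JSON file - authentication token
*   `.env` file - defines the dataset paths and file names read by this notebook (e.g. `LOCAL_DATASET_PATH`, `KAGGLE_DATASET_PATH`)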

## Outputs

* Generate Dataset: inputs/housing_prices_data (the downloaded housing CSV files plus the train, validation and test splits of the housing records)

## Additional Comments | Insights | Conclusions


* No comments


---

# Import packages

In [1]:
import os
import zipfile
import pandas as pd
import stat
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split

## Load environment variables

In [2]:
# Load the variables defined in a .env file into the process environment
# (python-dotenv); the cells below read their paths and file names from it.
load_dotenv()

True

In [3]:
# Build every file location used by this notebook from environment variables.
# os.environ[...] is used rather than os.getenv(...) so that a missing variable
# fails right here with a KeyError naming it, instead of an opaque TypeError
# raised by os.path.join when it receives None.
zip_file_path = os.path.join(os.environ["LOCAL_DATASET_PATH"], os.environ["DATASET_ZIPFILE_NAME"])
housing_records_path = os.path.join(os.environ["DATASET_FILE_PATH"], os.environ["HOUSING_RECORDS_FILENAME"])
inherited_houses_path = os.path.join(os.environ["DATASET_FILE_PATH"], os.environ["INHERITED_HOUSES_FILENAME"])

Set the Kaggle dataset and download it

In [4]:
# Grant group-write permission on the dataset directory while keeping the
# permission bits it already has. os.chmod replaces the whole mode, so passing
# stat.S_IWGRP on its own would strip the owner's read/write/execute bits.
dataset_dir = os.getenv("LOCAL_DATASET_PATH")
current_mode = stat.S_IMODE(os.stat(dataset_dir).st_mode)
os.chmod(dataset_dir, current_mode | stat.S_IWGRP)

In [5]:
# Run the Kaggle CLI in a shell. IPython substitutes each {...} expression with
# the corresponding environment value before the command runs; the values are
# inserted unquoted, so a path containing spaces would split into several arguments.
! kaggle datasets download -d {os.getenv("KAGGLE_DATASET_PATH")} -p {os.getenv("LOCAL_DATASET_PATH")} --force

Downloading housing-prices-data.zip to inputs/housing_prices_data




  0%|          | 0.00/49.6k [00:00<?, ?B/s]
100%|##########| 49.6k/49.6k [00:00<00:00, 505kB/s]
100%|##########| 49.6k/49.6k [00:00<00:00, 466kB/s]


Unzip the downloaded file, then delete the zip file

In [6]:
# Extract every member of the downloaded archive into the local dataset
# directory, then delete the archive once it has been closed.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=os.getenv("LOCAL_DATASET_PATH"))
os.remove(zip_file_path)

---

In [7]:
# Read the housing records table and verify it has the expected dimensions
# (1460 rows, 24 columns) before previewing the first rows.
housing_records = pd.read_csv(filepath_or_buffer=housing_records_path)
assert (1460, 24) == housing_records.shape
housing_records.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,EnclosedPorch,GarageArea,GarageFinish,...,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,SalePrice
0,856,854.0,3.0,No,706,GLQ,150,0.0,548,RFn,...,65.0,196.0,61,5,7,856,0.0,2003,2003,208500
1,1262,0.0,3.0,Gd,978,ALQ,284,,460,RFn,...,80.0,0.0,0,8,6,1262,,1976,1976,181500
2,920,866.0,3.0,Mn,486,GLQ,434,0.0,608,RFn,...,68.0,162.0,42,5,7,920,,2001,2002,223500
3,961,,,No,216,ALQ,540,,642,Unf,...,60.0,0.0,35,5,7,756,,1915,1970,140000
4,1145,,4.0,Av,655,GLQ,490,0.0,836,RFn,...,84.0,350.0,84,5,8,1145,,2000,2000,250000


In [8]:
# Read the inherited houses table and verify it has the expected dimensions
# (4 rows, 23 columns) before previewing it.
inherited_houses = pd.read_csv(filepath_or_buffer=inherited_houses_path)
assert (4, 23) == inherited_houses.shape
inherited_houses.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,EnclosedPorch,GarageArea,GarageFinish,...,LotArea,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd
0,896,0,2,No,468.0,Rec,270.0,0,730.0,Unf,...,11622,80.0,0.0,0,6,5,882.0,140,1961,1961
1,1329,0,3,No,923.0,ALQ,406.0,0,312.0,Unf,...,14267,81.0,108.0,36,6,6,1329.0,393,1958,1958
2,928,701,3,No,791.0,GLQ,137.0,0,482.0,Fin,...,13830,74.0,0.0,34,5,5,928.0,212,1997,1998
3,926,678,3,No,602.0,GLQ,324.0,0,470.0,Fin,...,9978,78.0,20.0,36,6,6,926.0,360,1998,1998


## Split train validation test set

In [9]:
# Fixed seed so that Restart & Run All always produces the same partitions.
SPLIT_RANDOM_STATE = 42

# Hold out 20% of the records as the test set, then take 25% of the remaining
# 80% as the validation (cv) set: 60% train / 20% cv / 20% test overall.
housing_records_train_cv, housing_records_test = train_test_split(
    housing_records, test_size=0.2, random_state=SPLIT_RANDOM_STATE
)
housing_records_train, housing_records_cv = train_test_split(
    housing_records_train_cv, test_size=0.25, random_state=SPLIT_RANDOM_STATE
)

# The three partitions must account for every record exactly once in total.
n_records = len(housing_records)
assert len(housing_records_train) + len(housing_records_cv) + len(housing_records_test) == n_records

# Compare sizes against the target proportions with a small row tolerance
# rather than exact int == float equality, which breaks on rounding.
assert abs(len(housing_records_train) - n_records * 0.6) < 2
assert abs(len(housing_records_test) - n_records * 0.2) < 2
assert abs(len(housing_records_cv) - n_records * 0.2) < 2

---

In [10]:
# Write each partition to its own CSV inside the local dataset directory; the
# file name of every partition comes from its own environment variable.
# to_csv is called with its defaults, so the DataFrame index is written too.
for partition, filename_var in (
    (housing_records_train, "HOUSING_RECORDS_TRAIN_FILENAME"),
    (housing_records_test, "HOUSING_RECORDS_TEST_FILENAME"),
    (housing_records_cv, "HOUSING_RECORDS_CV_FILENAME"),
):
    partition.to_csv(os.path.join(os.getenv("LOCAL_DATASET_PATH"), os.getenv(filename_var)))

# Congratulations

---