In [8]:
import os
import yaml
import pandas as pd

In [9]:
# 2. Load the exploration settings from the YAML configuration (config) file
# that sits in the current working directory.
path_to_yaml = os.path.join(os.getcwd(), 'data_exploration_config.yml')
try:
    with open(path_to_yaml, 'r') as c_file:
        config = yaml.safe_load(c_file)
except Exception as e:
    # Say which file failed and why, then re-raise: continuing would leave
    # `config` undefined (or stale from an earlier kernel run) and the cells
    # below would fail with a far less helpful error.
    print('Error reading the config file')
    print(f'{path_to_yaml}: {e}')
    raise

# An empty YAML document parses to None; fail here rather than on the first lookup.
if not isinstance(config, dict):
    raise ValueError(f'Config file {path_to_yaml} did not parse to a mapping')
 

In [13]:
# 3. Ingest the config file loaded in the previous step and create Python
# variables for the parameters defined in it.

# General switches
load_from_scratch = config['general']['load_from_scratch']
save_raw_dataframe = config['general']['save_raw_dataframe']
save_transformed_dataframe = config['general']['save_transformed_dataframe']
remove_bad_values = config['general']['remove_bad_values']

# Column names grouped by the kind of data they hold
categorical = config['columns']['categorical']
continuous = config['columns']['continuous']
date = config['columns']['date']
text = config['columns']['text']
excluded = config['columns']['excluded']

# General bounding box.
# NOTE(review): only max_lat, min_long and min_lat are read; no max_long is
# loaded -- confirm whether the config defines one.
max_lat = config['bounding_box']['max_lat']
min_long = config['bounding_box']['min_long']
min_lat = config['bounding_box']['min_lat']

# Newark bounding box, kept under separate names so these values do not
# overwrite the general bounding box values read just above.
newark_max_lat = config['newark_bounding_box']['max_lat']
newark_min_long = config['newark_bounding_box']['min_long']
newark_min_lat = config['newark_bounding_box']['min_lat']

geo_columns = config['geo_columns']

# Input and output file names
input_csv = config['file_names']['input_csv']
pickled_input_dataframe = config['file_names']['pickled_input_dataframe']
pickled_output_dataframe = config['file_names']['pickled_output_dataframe']

In [32]:
# 4. Load the Airbnb dataset into a pandas DataFrame.
# Every configured date column is passed to the parser as its own entry
# (joining the names into one comma-separated string only works when there is
# exactly one date column), and the categorical columns are read with the
# 'category' dtype.
df = pd.read_csv(
    input_csv,
    parse_dates=list(date) if date else False,  # False = parse no dates (the pandas default)
    dtype={column: 'category' for column in categorical},
)

In [54]:
# 5. Show which DataFrame columns were assigned to each category in the config
for heading, column_names in (
    ('Categorical columns:', categorical),
    ('Continuous columns:', continuous),
    ('Text columns:', text),
):
    print(heading, str(column_names))

Categorical columns: ['neighbourhood_group', 'neighbourhood', 'room_type']
Continuous columns: ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude']
Text columns: ['name', 'host_name']


In [90]:
# 6. Basic assessment of the dataset
# Missing values: count the NA entries in every column, then display only
# the columns that have at least one.
count_missing = df.isna().sum()
count_missing[count_missing.ne(0)]

name                    16
host_name               21
last_review          10052
reviews_per_month    10052
dtype: int64

In [88]:
# Invalid values
# Smallest and largest value of every continuous column, reshaped into one
# long Series keyed by (column, statistic) for a quick range check.
# NOTE(review): the original note expected integer values in every continuous
# column except latitude and longitude; reviews_per_month shows fractional
# bounds in the recorded output, so confirm that expectation.
value_ranges = df[continuous].agg(['min', 'max'])
value_ranges.unstack()

minimum_nights                  min       1.00000
                                max    1250.00000
number_of_reviews               min       0.00000
                                max     629.00000
reviews_per_month               min       0.01000
                                max      58.50000
calculated_host_listings_count  min       1.00000
                                max     327.00000
latitude                        min      40.49979
                                max      40.91306
longitude                       min     -74.24442
                                max     -73.71299
dtype: float64

In [95]:
# How many distinct values each categorical column holds, keyed by column name
dict((column_name, df[column_name].nunique()) for column_name in categorical)

{'neighbourhood_group': 5, 'neighbourhood': 221, 'room_type': 3}

In [97]:
# 7. Write the DataFrame to the pickle file named in the config.
# NOTE(review): in the cells shown here df is only loaded, never transformed,
# yet it is written to the path held in pickled_output_dataframe, and
# save_transformed_dataframe is not consulted -- confirm this is intended.
pd.to_pickle(df, pickled_output_dataframe)