# Exploring the Data
This notebook uses the New York City listings data. The Inside Airbnb data appears to use the same format for every city, so the same steps should work for other cities. This notebook deals with selecting which columns to use.


In [1]:
import os
import io
import re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/future warnings
# raised by pandas in the cells below -- consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Load the listings file, using 'id' as the index so that joins and
# .loc lookups are straightforward.
path = '../../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path, 'listings.csv')
listings = pd.read_csv(listings_csv, index_col='id')

In [3]:
# Keep the row count for later ratio checks, then display (rows, columns)
total_row = len(listings)
listings.shape

(50796, 105)

In [4]:
# Show every column when a frame is displayed (the raw file has 105 columns)
pd.set_option('display.max_columns', 107)
# Show full cell text with no truncation. None means "no limit"; the old
# sentinel value -1 was deprecated in pandas 1.0 and is rejected by later
# releases, so it must not be used here.
pd.set_option('display.max_colwidth', None)

In [5]:
#check if a column is mostly null (strictly more than 50% missing)
def less_than_50_percent(colname):
    """Return True when more than half of ``listings[colname]`` is missing.

    Note that, despite the function name, a True result marks the column
    as *mostly null* (i.e. a candidate for removal).

    Parameters
    ----------
    colname : label
        Column of the module-level ``listings`` frame to inspect.

    Returns
    -------
    bool
        True if the fraction of null values is strictly greater than 0.5,
        False otherwise (including exactly 50% null).
    """
    # Compute the null fraction from the column itself instead of a
    # separately cached row count, so the result cannot go stale if the
    # frame's rows change between cells.
    null_fraction = listings[colname].isna().mean()
    return bool(null_fraction > .5)


In [6]:
# Gather every column that the null check flags for removal
columns = listings.columns.tolist()
remove_columns = [name for name in columns if less_than_50_percent(name)]

print(remove_columns)

['notes', 'thumbnail_url', 'medium_url', 'xl_picture_url', 'square_feet', 'weekly_price', 'monthly_price', 'license', 'jurisdiction_names']


In [7]:
# Drop the flagged columns. Reassigning (rather than inplace=True) and
# ignoring labels that are already gone keeps this cell safe to re-run:
# a second execution no longer raises KeyError.
listings = listings.drop(columns=remove_columns, errors='ignore')
#check if columns are removed
listings.shape

(50796, 96)

In [8]:
#check if all records have same value
def all_same_value(colname):
    """Return True when ``listings[colname]`` holds exactly one distinct value.

    Missing values count as a distinct value, so a column containing one
    real value plus nulls is not reported as constant.

    Parameters
    ----------
    colname : label
        Column of the module-level ``listings`` frame to inspect.

    Returns
    -------
    bool
        True if the column has exactly one distinct value, False otherwise.
    """
    # dropna=False keeps NaN in the count, matching len(series.unique())
    return bool(listings[colname].nunique(dropna=False) == 1)

In [9]:
# Gather every column whose records all share a single value
columns = listings.columns.tolist()
remove_columns = [name for name in columns if all_same_value(name)]

print(remove_columns)

['scrape_id', 'experiences_offered', 'country_code', 'country', 'has_availability', 'requires_license', 'is_business_travel_ready']


In [None]:
# Drop the flagged columns. Reassigning (rather than inplace=True) and
# ignoring labels that are already gone keeps this cell safe to re-run:
# a second execution no longer raises KeyError.
listings = listings.drop(columns=remove_columns, errors='ignore')
#check if columns are removed
listings.shape

In [50]:
#get max length
def getmaxlength(colname):
    """Return the length of the longest string in ``listings[colname]``.

    Missing values are skipped. If the column has no measurable values
    (empty or all null) the result is NaN.

    Unlike the earlier version, this does not write a scratch ``length``
    column into ``listings`` and does not shadow the builtin ``sorted``.

    Parameters
    ----------
    colname : label
        Text column of the module-level ``listings`` frame.

    Returns
    -------
    number
        Maximum string length found in the column, or NaN if none.
    """
    # .str.len() yields NaN for nulls, and .max() skips NaN by default
    return listings[colname].str.len().max()

In [72]:
# Among the object-typed columns (excluding 'amenities'), flag those whose
# longest value is more than 300 characters so they can be removed.
columns = listings.columns.tolist()
remove_columns = [
    name
    for name in columns
    if listings[name].dtypes == object
    and name != 'amenities'
    and getmaxlength(name) > 300
]

print(remove_columns)

['summary', 'space', 'description', 'neighborhood_overview', 'transit', 'access', 'interaction', 'house_rules', 'host_about']
