In [2]:
# Import libraries
import pandas as pd

In [3]:
# Load the data
data = pd.read_csv('nyc_housing_avg_prices.csv')

In [4]:
# Explore the df
data.head()

Unnamed: 0,borough,neigborhood,studio,1_bedroom,2_bedroom,3_bedroom
0,manhatan,Chelsea,"$2,794","$4,062","$6,997","$11,031"
1,manhatan,East Village,"$2,608","$3,280","$4,044","$5,289"
2,manhatan,Flatiron/Union Square,"$3,632","$4,645","$8,267","$15,969"
3,manhatan,Gramercy Park,"$2,569","$3,656","$5,413","$7,226"
4,manhatan,Greenwich Village,"$2,828","$4,116","$8,000","$12,418"


#### From the above, we can see that some neighborhood names might be slightly different from our main dataframe with latitude and longitude. We may need to revisit this later.

## Feature Engineering

In [5]:
# Change the column names to match original df
cols = ['Borough', 'Neighborhood', 'Studio', '1_bedroom', '2_bedroom', '3_bedroom']

data.columns = cols

In [6]:
# Capitalize first letter in each borough
data['Borough'] = data.Borough.str.title()
data.head()

Unnamed: 0,Borough,Neighborhood,Studio,1_bedroom,2_bedroom,3_bedroom
0,Manhatan,Chelsea,"$2,794","$4,062","$6,997","$11,031"
1,Manhatan,East Village,"$2,608","$3,280","$4,044","$5,289"
2,Manhatan,Flatiron/Union Square,"$3,632","$4,645","$8,267","$15,969"
3,Manhatan,Gramercy Park,"$2,569","$3,656","$5,413","$7,226"
4,Manhatan,Greenwich Village,"$2,828","$4,116","$8,000","$12,418"


In [7]:
# Check for missing values
data.isnull().sum()

Borough         0
Neighborhood    0
Studio          0
1_bedroom       0
2_bedroom       0
3_bedroom       0
dtype: int64

In [8]:
# Check shape of df 
data.shape

(55, 6)

In [11]:
# Check data type
data.dtypes

Borough          object
Neighborhood     object
Studio           object
1_bedroom        object
2_bedroom        object
3_bedroom        object
Avg_Rent        float64
dtype: object

In [12]:
data.head()

Unnamed: 0,Borough,Neighborhood,Studio,1_bedroom,2_bedroom,3_bedroom,Avg_Rent
0,Manhatan,Chelsea,"$2,794","$4,062","$6,997","$11,031",
1,Manhatan,East Village,"$2,608","$3,280","$4,044","$5,289",
2,Manhatan,Flatiron/Union Square,"$3,632","$4,645","$8,267","$15,969",
3,Manhatan,Gramercy Park,"$2,569","$3,656","$5,413","$7,226",
4,Manhatan,Greenwich Village,"$2,828","$4,116","$8,000","$12,418",


In [24]:
# Replace striing with floats
data[data['Studio']].replace('[\$,]', '', regex=True).astype(float)

KeyError: "None of [Index(['$2,794', '$2,608', '$3,632', '$2,569', '$2,828', '$2,823', '$2,955',\n       '$2,710', '$5,121', '$2,683', '$3,589', '$3,502', '$2,765', '$2,676',\n       '$2,485', '$2,589', '$2,474', '$2,237', '$2,307', '$2,316', '$4,289',\n       '$2,696', '$2,215', '$2,328', '$2,362', '$2,662', '$2,329', '$2,330',\n       '$2,010', '$1,992', '$1,899', '$1,657', '$1,978', '$2,044', '$2,092',\n       '$2,509', '$2,050', '$2,271', '$2,409', '$2,100', '$1,806', '$2,443',\n       '$3,189', '$2,957', '$2,293', '$2,347', '$1,724', '$2,795', '$2,393',\n       '$1,933', '-', '$2,400', '$1,400', '$2,712', '$1,939'],\n      dtype='object')] are in the [columns]"

In [18]:
# Create new feature - average rent price
data['Avg_Rent'] = data[['Studio', '1_bedroom', '2_bedroom', '3_bedroom']].mean()