In [20]:
# Import libraries
import pandas as pd

In [21]:
# Read the NYC average housing prices CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='nyc_housing_avg_prices.csv')

In [22]:
# Preview the first five rows to get a feel for the columns and their formatting
data.head(n=5)

Unnamed: 0,borough,neigborhood,studio,1_bedroom,2_bedroom,3_bedroom
0,manhatan,Chelsea,"$2,794","$4,062","$6,997","$11,031"
1,manhatan,East Village,"$2,608","$3,280","$4,044","$5,289"
2,manhatan,Flatiron/Union Square,"$3,632","$4,645","$8,267","$15,969"
3,manhatan,Gramercy Park,"$2,569","$3,656","$5,413","$7,226"
4,manhatan,Greenwich Village,"$2,828","$4,116","$8,000","$12,418"


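From the above, we can see that some names may be spelled slightly differently from those in our main dataframe (the one with latitude and longitude); for example, the borough appears here as `manhatan`. We may need to revisit this later, before the two dataframes are combined.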

## Feature Engineering

In [23]:
# Rename the columns (positionally) so they follow the naming of the original df
cols = [
    'Borough',
    'Neighborhood',
    'Studio',
    '1_bedroom',
    '2_bedroom',
    '3_bedroom',
]
data.columns = cols

In [24]:
# Title-case the borough names (e.g. 'manhatan' -> 'Manhatan')
data['Borough'] = data['Borough'].str.title()
data.head(n=5)

Unnamed: 0,Borough,Neighborhood,Studio,1_bedroom,2_bedroom,3_bedroom
0,Manhatan,Chelsea,"$2,794","$4,062","$6,997","$11,031"
1,Manhatan,East Village,"$2,608","$3,280","$4,044","$5,289"
2,Manhatan,Flatiron/Union Square,"$3,632","$4,645","$8,267","$15,969"
3,Manhatan,Gramercy Park,"$2,569","$3,656","$5,413","$7,226"
4,Manhatan,Greenwich Village,"$2,828","$4,116","$8,000","$12,418"


In [25]:
# Count the NaN entries in each column.
# Note: this only detects real nulls; placeholder strings such as '-' are not counted.
data.isna().sum()

Borough         0
Neighborhood    0
Studio          0
1_bedroom       0
2_bedroom       0
3_bedroom       0
dtype: int64

In [26]:
# Check the dimensions of the df as a (rows, columns) tuple
data.shape

(55, 6)

In [27]:
# Check the data type of each column.
# The rent columns are still stored as strings (object) at this point, not numbers.
data.dtypes

Borough         object
Neighborhood    object
Studio          object
1_bedroom       object
2_bedroom       object
3_bedroom       object
dtype: object

In [28]:
# Look at the first rows again before cleaning the price columns
data.head(n=5)

Unnamed: 0,Borough,Neighborhood,Studio,1_bedroom,2_bedroom,3_bedroom
0,Manhatan,Chelsea,"$2,794","$4,062","$6,997","$11,031"
1,Manhatan,East Village,"$2,608","$3,280","$4,044","$5,289"
2,Manhatan,Flatiron/Union Square,"$3,632","$4,645","$8,267","$15,969"
3,Manhatan,Gramercy Park,"$2,569","$3,656","$5,413","$7,226"
4,Manhatan,Greenwich Village,"$2,828","$4,116","$8,000","$12,418"


In [32]:
# First, let's remove the row(s) with missing values.
# In this dataset a missing price is encoded as the placeholder string '-'.
# We set the affected rows aside as a separate dataframe (`y`) and then drop
# them from `data`.
#
# The rows are located by value instead of by a hard-coded index label, so the
# cell keeps working if the CSV is re-ordered, and re-running it is a harmless
# no-op rather than a KeyError.
has_placeholder = (
    data[['Studio', '1_bedroom', '2_bedroom', '3_bedroom']]
    .isin(['-'])
    .any(axis=1)
)
y = data.loc[has_placeholder]
data = data.drop(index=y.index)

In [33]:
# Remove the currency formatting ('$' and the ',' thousands separator) from
# the price columns so the values can later be converted to numbers.
# The pattern is written as a raw string: in a normal string literal '\$' is
# an invalid escape sequence, which Python reports with a warning.
price_cols = ['Studio', '1_bedroom', '2_bedroom', '3_bedroom']
data[price_cols] = data[price_cols].replace(r'[\$,]', '', regex=True)
data.head()

Unnamed: 0,Borough,Neighborhood,Studio,1_bedroom,2_bedroom,3_bedroom
0,Manhatan,Chelsea,2794,4062,6997,11031
1,Manhatan,East Village,2608,3280,4044,5289
2,Manhatan,Flatiron/Union Square,3632,4645,8267,15969
3,Manhatan,Gramercy Park,2569,3656,5413,7226
4,Manhatan,Greenwich Village,2828,4116,8000,12418


In [34]:
# Cast the cleaned price strings to floats so they can be used in arithmetic
numeric_prices = data[['Studio', '1_bedroom', '2_bedroom', '3_bedroom']].astype(float)
data[numeric_prices.columns] = numeric_prices

In [37]:
# New feature: the row-wise mean of the four unit-type rents
rent_by_unit = data[['Studio', '1_bedroom', '2_bedroom', '3_bedroom']]
data['Avg_Rent'] = rent_by_unit.mean(axis='columns')
data.head(n=5)

Unnamed: 0,Borough,Neighborhood,Studio,1_bedroom,2_bedroom,3_bedroom,Avg_Rent
0,Manhatan,Chelsea,2794.0,4062.0,6997.0,11031.0,6221.0
1,Manhatan,East Village,2608.0,3280.0,4044.0,5289.0,3805.25
2,Manhatan,Flatiron/Union Square,3632.0,4645.0,8267.0,15969.0,8128.25
3,Manhatan,Gramercy Park,2569.0,3656.0,5413.0,7226.0,4716.0
4,Manhatan,Greenwich Village,2828.0,4116.0,8000.0,12418.0,6840.5
