## Import and Scale Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

In [2]:
data = pd.read_csv('nyc_data3.csv')

In [3]:
data.rename(columns={'Median Sale Price - na has median':'house_price'}, inplace=True)

In [4]:
data_features = data[['uber_count', 'avg_price', 'pop_density', 'median_daily_test_rate', 'avg_test_score', 'house_price']]

In [5]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data_features) # np array of scaled features

In [6]:
scaled_df = pd.DataFrame(data=scaled_features, columns=['uber_count', 'avg_price', 'pop_density', 'median_daily_test_rate', 'avg_test_score', 'house_price']) # scaled features as dataframe

## K-means Model for Suggestion

In [7]:
# K-means model instantiation
km = KMeans(n_clusters=3,
            n_init=10,
           random_state=12)

In [8]:
# fit_predict K-means model on scaled_df
y_km = km.fit_predict(scaled_df)

In [9]:
# create clusters column in data_features, data, and scaled_df
data_features['cluster'] = y_km
data['cluster'] = y_km
scaled_df['cluster'] = y_km

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_features['cluster'] = y_km


In [10]:
# create clusters based dataframes
nyc_0 = data_features.loc[data_features['cluster'] == 0]
nyc_1 = data_features.loc[data_features['cluster'] == 1]
nyc_2 = data_features.loc[data_features['cluster'] == 2]
# create clusters based dataframes for scaled dataframes as well
scaled_0 = scaled_df.loc[scaled_df['cluster'] == 0]
scaled_1 = scaled_df.loc[scaled_df['cluster'] == 1]
scaled_2 = scaled_df.loc[scaled_df['cluster'] == 2]

## NYC Neighborhood Suggestion Tool

In [11]:
# feature conversion key
convert = {'a1':scaled_df['uber_count'].min(), 'a2':scaled_df['uber_count'].quantile(0.25), 'a3':scaled_df['uber_count'].quantile(0.5),'a4':scaled_df['uber_count'].quantile(0.75), 'a5':scaled_df['uber_count'].max(),
          'b1':scaled_df['avg_price'].min(), 'b2':scaled_df['avg_price'].quantile(0.25), 'b3':scaled_df['avg_price'].quantile(0.5),'b4':scaled_df['avg_price'].quantile(0.75), 'b5':scaled_df['avg_price'].max(),
          'c1':scaled_df['pop_density'].min(), 'c2':scaled_df['pop_density'].quantile(0.25), 'c3':scaled_df['pop_density'].quantile(0.5),'c4':scaled_df['pop_density'].quantile(0.75), 'c5':scaled_df['pop_density'].max(),
          'd1':scaled_df['median_daily_test_rate'].min(), 'd2':scaled_df['median_daily_test_rate'].quantile(0.25), 'd3':scaled_df['median_daily_test_rate'].quantile(0.5),'d4':scaled_df['median_daily_test_rate'].quantile(0.75), 'd5':scaled_df['median_daily_test_rate'].max(),
          'e1':scaled_df['avg_test_score'].min(), 'e2':scaled_df['avg_test_score'].quantile(0.25), 'e3':scaled_df['avg_test_score'].quantile(0.5),'e4':scaled_df['avg_test_score'].quantile(0.75), 'e5':scaled_df['avg_test_score'].max(),
          'f1':scaled_df['house_price'].min(), 'f2':scaled_df['house_price'].quantile(0.25), 'f3':scaled_df['house_price'].quantile(0.5),'f4':scaled_df['house_price'].quantile(0.75), 'f5':scaled_df['house_price'].max(),}

##### NYC_Suggest preamble:

When deciding where to live in NYC, please consider the following questions on a scale of 1-5:

Entry a: Would use of Uber/3rd party transportation services be of importance to you?\
Entry b: Do you prefer cheap or expensive restaurants around you?\
Entry c: Do you prefer neighborhoods where things are more spread out or close together?\
Entry d: Is access to healthcare important to you?\
Entry e: What level of elementary education would you prefer? (Living in NYC as a family or alone?)\
Entry f: How expensive of a place would you be able to afford?

In [15]:
#NYC_Suggest('a3', 'b3', 'c3', 'd3', 'e3', 'f3')

In [13]:
def NYC_Suggest(a, b, c, d, e, f):
    '''
    Returns cluster grouping and suggested neighborhoods based on inputs.
    
        Parameters:
            a, b, c, d, e, f (str): strings that are letters ranging from a-f and numbers ranging from 1-5
            example: 'a3', 'b3', 'c3', 'd3', 'e3', 'f3'
        Returns:
            Printed returns of the belonging cluster grouping and the top 10 neighborhoods that the K-means algorithm suggests.
    '''
    
    X = [[convert[a], convert[b], convert[c], convert[d], convert[e], convert[f]]]
    
    belong = km.predict(X)
    if belong == 0:
        print('You fit in with cluster 0')
    elif belong == 1:
        print('You fit in with cluster 1')
    elif belong == 2:
        print('You fit in with cluster 2')

    print('\n')
    print('The Top 10 recommended neighborhoods for you to live in are:')
    neigh = NearestNeighbors(n_neighbors=10)
    neigh.fit(scaled_features)
    ind = neigh.kneighbors(X, return_distance=False)
    for i in ind:
        h = i.astype(int)
    for x in h:
        print(data.iloc[x]['name'])

#### Submission