In [24]:
##__Date__:Apr 28
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import roc_auc_score as rs
from sklearn.externals.six import StringIO  
from sklearn import tree
import math
from IPython.display import Image  
import pydotplus

# Load the spatially merged restaurant table, keep only the columns needed
# downstream, and give the Yelp / Foursquare columns short aliases.
keep_cols = [
    'CAMIS', 'DBA', 'BBL', 'new_grade',
    'yelp_rating', 'yelp_categories', 'yelp_review_count',
    'stats.checkinsCount', 'stats.tipCount', 'stats.usersCount',
    'categories', 'geometry', 'BldgArea', 'BuiltFAR', 'CB2010', 'CT2010',
    'HealthArea', 'NumFloors', 'OfficeArea', 'ResArea', 'RetailArea',
    'SHAPE_Area', 'SHAPE_Leng', 'TaxMap', 'Tract2010', 'UnitsRes',
    'UnitsTotal', 'YearAlter1', 'YearAlter2', 'YearBuilt',
]
short_names = {
    'yelp_rating': 'yr',
    'yelp_categories': 'yc',
    'yelp_review_count': 'yrc',
    'stats.checkinsCount': 'fcc',
    'stats.tipCount': 'ftc',
    'stats.usersCount': 'fus',
    'categories': 'fc',
}
sj = pd.read_csv('spatial_merged_solid.csv')
sj = sj[keep_cols]
sj = sj.rename(columns=short_names)

# BBL is stored as the string form of its integer value so that it can be
# matched against string BBL keys from other tables.
sj['BBL'] = sj['BBL'].apply(int).apply(str)

# retail_area_p: retail area as a share of the lot's shape area
sj['retail_area_p'] = sj['RetailArea'] / sj['SHAPE_Area']
# res_u_p: residential units as a share of all units
sj['res_u_p'] = sj['UnitsRes'] / sj['UnitsTotal']

# Load the DOF "Condominium Comparable Rental Income" tables, used here as an
# income / real-estate rental price indicator. Five separate datasets are
# fetched as JSON from the NYC Open Data site.
# Time range: 2011/2012.
# Note: the tables contain many NaN values.
# NOTE(review): presumably mn/bk/qn/bx/si stand for Manhattan, Brooklyn,
# Queens, Bronx and Staten Island — confirm each dataset id.
# NOTE(review): these resource URLs carry no paging parameters; if the API
# pages its results, only the first page is loaded — verify the row counts.
tm_mn = pd.read_json('https://data.cityofnewyork.us/resource/jmwn-n499.json')
tm_bk = pd.read_json('https://data.cityofnewyork.us/resource/dkr7-ts75.json')
tm_qn = pd.read_json('https://data.cityofnewyork.us/resource/rrrc-7d8r.json')
tm_bx = pd.read_json('https://data.cityofnewyork.us/resource/ieph-a6af.json')
tm_si = pd.read_json('https://data.cityofnewyork.us/resource/4rqc-79ny.json')

# concatenating
def rep_borough(col_list):
    '''
    Remove borough names from a collection of column labels.

    col_list: iterable of strings.
    returns: a list, in input order, where every occurrence of 'manhattan',
    'brooklyn', 'bronx', 'queens' and 'staten_island' has been deleted from
    each string.
    '''
    boroughs = ('manhattan', 'brooklyn', 'bronx', 'queens', 'staten_island')
    cleaned = []
    for name in col_list:
        for borough in boroughs:
            name = name.replace(borough, '')
        cleaned.append(name)
    # Return a real list rather than a map object: on Python 3 map() is a
    # one-shot lazy iterator that cannot be indexed, measured or reused.
    return cleaned
# Give every borough table the same column names, then stack them.
borough_tables = [tm_mn, tm_bk, tm_bx, tm_qn, tm_si]
for dt in borough_tables:
    dt.columns = rep_borough(dt.columns.values)
tm_m = pd.concat(borough_tables)

# Tax-map BBL is kept as a STRING with the dashes removed.
tm_m = tm_m.rename(columns={'_condominiums_comparable_properties_boro_block_lot': 'BBL'})
tm_m['BBL'] = tm_m['BBL'].apply(lambda bbl: str(bbl.replace('-', '')))

# Keep the three income figures plus the key, under short column names.
tm_m_clean = tm_m[[
    '_condominiums_comparable_properties_expense_per_sqft',
    '_condominiums_comparable_properties_gross_income_per_sqft',
    '_condominiums_comparable_properties_net_operating_income',
    'BBL',
]].rename(columns={
    '_condominiums_comparable_properties_expense_per_sqft': 'expense/sqft',
    '_condominiums_comparable_properties_gross_income_per_sqft': 'income/spft',
    '_condominiums_comparable_properties_net_operating_income': 'opincome',
})

tm_m_clean.head()

## Categories feature:
- descriptive features are given by: Yelp (yc), Foursquare (fc), and the inspection data itself  
- approach under consideration:   
    1. aggregate the descriptive words from all 3 sources for each CAMIS  
    2. process the long list of words that exist in the three columns and return a dictionary(?) of categories  
      e.g.: dic = {'healthy': ['green', 'yogurt', 'salads'], 'american': ['hamburgers', 'milkshake', 'fries']}  
    3. build a boolean matrix M of shape (len(CAMIS), len(dic.keys()))  
       M[camis, 1] = 1 if any word in row['category'] is in dic.items()[1][1]  
       but: this leaves many more features to process

# Left-join the tax-map figures onto the spatially joined table by BBL,
# keeping every row of sj.
sj1 = sj.merge(tm_m_clean, how='left', on='BBL')

def alter(row):
    '''
    return # of times each building is altered

    row: mapping / DataFrame row with 'YearAlter1' and 'YearAlter2' entries.
    returns: 2 if a second alteration year is recorded, otherwise 1 if a
    first alteration year is recorded, otherwise 0. A year of 0 and a
    missing (NaN) year both count as "no alteration recorded".
    '''
    def recorded(year):
        # NaN != 0.0 evaluates to True, so missing years have to be ruled
        # out explicitly or they would be counted as alterations.
        return (not pd.isnull(year)) and year != 0.0

    if recorded(row['YearAlter2']):
        return 2
    if recorded(row['YearAlter1']):
        return 1
    return 0
sj1['alter_count'] = sj1.apply(alter, axis = 1)

# Keep only the identifier, target and candidate feature columns. The
# explicit .copy() makes sj2 an independent frame, so adding a column below
# is not a write into a slice of sj1.
sj2 = sj1[['CAMIS', 'DBA', 'BBL', 'new_grade', 'yr', 'yc', 'yrc', 'fcc', 'ftc','fus', 'fc', 
           'NumFloors', 'retail_area_p', 'res_u_p', 'alter_count', 'YearBuilt',
           'expense/sqft', 'income/spft', 'opincome', 'geometry']].copy()

# Target variable: drop any rows that are missing a new_grade ...
sj2 = sj2.dropna(subset=['new_grade'])
# ... then flag grade A. A vectorised comparison gives the same booleans as
# a row-wise apply, but is faster and also works on an empty frame.
sj2['gradeA'] = sj2['new_grade'] == 'A'

#from original inspection data extract 'CUISINE DESCRIPTION' column
def sum1(seq):
    '''
    Glue the strings in seq together (a single leading space, then each
    string followed by a comma) and return the set of pieces obtained by
    splitting that text on commas.
    '''
    joined = ' ' + ''.join(part + ',' for part in seq)
    return set(joined.split(','))

# Read the raw inspection results and collapse them to one row per CAMIS,
# with all of that restaurant's cuisine descriptions combined by sum1.
insp_ori = pd.read_csv('DOHMH_New_York_City_Restaurant_Inspection_Results.csv')
cuisine_by_camis = insp_ori[['CAMIS', 'CUISINE DESCRIPTION']].groupby('CAMIS', as_index=False)
insp_cat = cuisine_by_camis.agg({"CUISINE DESCRIPTION": sum1})

def clean_cuisine_set(x):
    '''
    Normalise a collection of cuisine strings: trim surrounding whitespace,
    lower-case, and discard entries that end up empty. Returns a set.
    '''
    cleaned = set()
    for raw in x:
        word = raw.strip().lower()
        if word:
            cleaned.add(word)
    return cleaned
# Normalise each restaurant's cuisine set, then left-join it onto sj2 by
# CAMIS; this adds the 'CUISINE DESCRIPTION' column to sj3.
cleaned_cuisine = insp_cat['CUISINE DESCRIPTION'].apply(clean_cuisine_set)
insp_cat['CUISINE DESCRIPTION'] = cleaned_cuisine
sj3 = sj2.merge(insp_cat, how='left', on='CAMIS')

type(sj3['CUISINE DESCRIPTION'][3])

# join yelp categories and inspection categories
def _clean_yelp_token(token):
    '''Reduce one comma-separated piece of a stringified list to a bare lower-case word.'''
    token = token.strip().lstrip('[').rstrip(']').strip()
    # Drop only the u'...' literal prefix. Deleting every letter 'u' would
    # corrupt the words themselves ('sushi' -> 'sshi').
    if token[:2] in ("u'", 'u"'):
        token = token[1:]
    for junk in ("'", '"', '[', ']', ' '):
        token = token.replace(junk, '')
    return token.replace('caf\xc3\x83\xc2\xa9/coffee/tea', 'cafe').lower()


def join_cat_y(x):
    '''
    Join Yelp categories ('yc') with inspection categories
    ('CUISINE DESCRIPTION') for one row.

    x: row (mapping). 'yc' is expected to be a stringified list such as
       "[u'Pizza', u'Italian']", or a float NaN when missing;
       'CUISINE DESCRIPTION' is a set of words, or a non-set (NaN) when
       missing.
    returns: always a set of category words (possibly empty). Inspection
       categories are kept even when the Yelp value is missing.
    '''
    cuisine = x['CUISINE DESCRIPTION']
    joined = set(cuisine) if isinstance(cuisine, set) else set()

    raw = x['yc']
    if isinstance(raw, float):  # NaN: this row has no Yelp categories
        return joined

    # Splitting on ',' also handles a single-category string, which simply
    # yields one piece.
    for token in raw.split(','):
        word = _clean_yelp_token(token)
        if word:
            joined.add(word)
    return joined
# Apply the Yelp join row by row and store each result as a plain list in
# the 'categories' column.
joined_yelp = sj3.apply(join_cat_y, axis=1)
sj3['categories'] = joined_yelp.apply(list)

# join four square categories to 'categories' column

def _clean_foursquare_token(token):
    '''Reduce one comma-separated piece of a stringified list to a bare lower-case word.'''
    token = token.strip().lstrip('[').rstrip(']').strip()
    # Drop only the u'...' literal prefix. Deleting every letter 'u' would
    # corrupt the words themselves ('burgers' -> 'brgers').
    if token[:2] in ("u'", 'u"'):
        token = token[1:]
    for junk in ("'", '"', '[', ']', ' '):
        token = token.replace(junk, '')
    token = token.replace('caf\xc3\x83\xc2\xa9/coffee/tea', 'cafe').replace('Caf\xe9', 'cafe')
    return token.lower()


def join_cat_f(x):
    '''
    Add Foursquare categories ('fc') to the categories already collected
    for one row.

    x: row (mapping). 'fc' is expected to be a stringified list such as
       "[u'Bar', u'Pizza Place']", or a float NaN when missing.
       'CUISINE DESCRIPTION' is a set of words or a non-set (NaN) when
       missing. An optional 'categories' entry (list/set/tuple of words
       joined earlier) is merged in as well, so an earlier join is extended
       rather than overwritten.
    returns: always a set of category words (possibly empty). Previously
       collected words are kept even when the Foursquare value is missing.
    '''
    joined = set()
    for source in (x.get('categories'), x['CUISINE DESCRIPTION']):
        if isinstance(source, (set, list, tuple)):
            joined.update(source)

    raw = x['fc']
    if isinstance(raw, float):  # NaN: this row has no Foursquare categories
        return joined

    # Splitting on ',' also handles a single-category string, which simply
    # yields one piece.
    for token in raw.split(','):
        word = _clean_foursquare_token(token)
        if word:
            joined.add(word)
    return joined

# Apply the Foursquare join row by row and store each result as a plain
# list in the 'categories' column.
joined_foursquare = sj3.apply(join_cat_f, axis=1)
sj3['categories'] = joined_foursquare.apply(list)

sj3.head()

# process all categories
# Walk over every row's category list, clean each word, and collect the
# cleaned words into one de-duplicated vocabulary `s`.
des = np.asarray(sj3['categories']).flatten()
des_v = []
for i in range(len(des)):
    for j in range(len(des[i])):
        # cleaning on the string; each step writes its result back into des[i][j]
        # NOTE(review): des[i] looks like the very list object stored in
        # sj3['categories'], so these writes would also change that column
        # in place — confirm whether that is intended.
        des[i][j] = des[i][j].replace(' ','')
        des[i][j] = des[i][j].replace('.','')
        des[i][j] = des[i][j].replace('including','')
        des[i][j] = des[i][j].replace('caf\xc3\x83\xc2\xa9','cafe')
        des[i][j] = des[i][j].replace('etc','')
        des[i][j] = des[i][j].replace('notapplicable','')
        des[i][j] = des[i][j].replace('notlisted','')
        # NOTE(review): replace('a','') removes every letter 'a' from the
        # word; once that has run, the 'an', 'accessories' and 'ad'
        # replacements that follow can no longer match anything. This looks
        # unintended (stop-word removal?) — confirm before relying on `s`.
        des[i][j] = des[i][j].replace('a','').replace('an','').replace('accessories', '').replace('ad','')
        
        # lower-casing happens only here, after the case-sensitive replaces above
        des_v.append(des[i][j].lower())
        
s = set(des_v)

# consider text categories?
s #'THE LONG LIST' of descriptive words

#### FORGET about categories first
## Some decision trees for implementation

import pandas as pd
from sklearn.model_selection import train_test_split

# Feature matrix and target are read from CSV files; the first column of
# each file is used as the index and the target column is named 'ifA'.
x = pd.read_csv('feature.csv',index_col = 0)
y = pd.read_csv('y.csv', index_col=0, names = ['ifA'])

# data
sj3['ifA'] = sj3['new_grade'] == 'A' # boolean column: True where new_grade == A
sj4 = sj3.drop(['yc', 'fc'], axis = 1).replace(np.nan, 0) # ignore categories columns first, replace all nan with 0

# Single-argument print(...) produces the same text on Python 2 and is also
# valid syntax on Python 3 (the print statement is not).
print('look at features: %s' % (sj4.columns[4:17].values,))

# 70/30 split with a fixed seed so the split is reproducible.
xtr, xte, ytr, yte = train_test_split(x, y, test_size = .3, random_state = 99)
#xtr, xte, ytr, yte = train_test_split(sj4[sj4.columns[4:17]], sj4['ifA'], test_size = 0.3, random_state = 99)

# Train one decision tree per depth and save a rendering of each to
# ../viz/dtree<depth>.png. The loop ends on max_depth=5, so afterwards rf_,
# dot_data, graph and img all refer to the depth-5 tree.
for depth in (2, 3, 4, 5):
    # train decision tree
    rf_ = DTC(max_depth=depth)
    rf_.fit(xtr, ytr)

    # export_graphviz matches class_names to the classes in ascending order,
    # i.e. the order of rf_.classes_ (False/0 = "not A" comes first).
    # Deriving the labels from classes_ keeps every node correctly labelled.
    class_labels = ['A' if cls else 'not A' for cls in rf_.classes_]

    #viz
    dot_data = StringIO()
    tree.export_graphviz(rf_, out_file=dot_data,
                         feature_names=x.columns.values,
                         class_names=class_labels,
                         filled=True, rounded=True,
                         special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    img = Image(graph.create_png())

    with open("../viz/dtree%d.png" % depth, "wb") as png:
        png.write(img.data)

#performance (of the last tree trained above, max_depth=5)
pred=rf_.predict_proba(xte)   # class probabilities, one column per class in rf_.classes_
pred1 = rf_.predict(xte)      # hard class predictions
def acc(true, pred):
    ''' true and pred are labeled sequences of the same size
    (lists, arrays, Series or single-column frames)
    return the fraction of correctly predicted labels'''
    # Flatten both sides first: comparing an (n, 1) column with an (n,)
    # vector would broadcast to an (n, n) matrix and inflate the count.
    true_flat = np.asarray(true).ravel()
    pred_flat = np.asarray(pred).ravel()
    return 1.0 * (pred_flat == true_flat).sum() / len(true_flat)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

# yte is a one-column frame; work with the 1-D 'ifA' labels so comparisons
# with the 1-D prediction vector stay element-wise instead of broadcasting
# to an (n, n) matrix.
y_true = yte.ifA

print('decision tree prediction accuracy: %s' % (1.0*(y_true == pred1).sum()/len(yte)))

# Ranking metrics need a continuous score, not hard labels: use the
# predicted probability of the greater class label (second column of
# predict_proba, i.e. rf_.classes_[1]).
score_pos = pred[:, 1]

print('max_depth 5 decision tree:')
print('roc_auc_score:')
print(roc_auc_score(y_true, score_pos))
print('average_precision_score:')
print(average_precision_score(y_true, score_pos))

# number of correct test predictions
(pred1==np.asarray(y_true)).sum()

# share of correct test predictions
1.0*(pred1==np.asarray(y_true)).sum()/len(yte)

# Share of "not A" rows. Masking a DataFrame with a boolean DataFrame keeps
# its shape (non-matching cells become NaN), so a .size ratio on yte would
# always be 1.0; the mean of the boolean column gives the actual share.
print('in the whole data set, p(not A) = %s' % (sj4['ifA'] == 0).mean())
print('in testing data set, p(not A) =  %s' % (y_true == 0).mean())

yte