In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sodapy import Socrata
import config
from IPython.display import display, HTML
from useful.eda import basic_info

##### Import the data from the API

In [None]:
# Re-download only when the cached CSV is not already listed in ./data.
OVERWRITE = 'nyc311_data.csv' not in os.listdir('./data')

OVERWRITE

In [None]:
# Authenticated Socrata client; endpoint and credentials come from the local
# `config` module.
client = Socrata(config.url,
                 config.api_key,
                 username=config.username,
                 password=config.password)

# Allow up to 300 seconds per request.
client.timeout = 300

# Columns requested from the dataset.
SELECT = ['created_date',
          'unique_key',
          'complaint_type',
          'incident_zip',
          'incident_address',
          'street_name',
          'address_type',
          'city',
          'resolution_description',
          'borough',
          'latitude',
          'longitude',
          'closed_date',
          'location_type',
          'status']

LIMIT_START = 0        # offset of the first row to request
LIMIT = 100000         # rows per request (page size)
LIMIT_TOT = 10000000   # paging stops before this offset

CSV_PATH = './data/nyc311_data.csv'

if OVERWRITE:
    print('fetching data from website...')
    # Page offsets LIMIT_START, LIMIT_START + LIMIT, ... The first request has
    # to start at LIMIT_START itself; starting at LIMIT_START + LIMIT silently
    # skips the first LIMIT rows of the dataset.
    offsets = range(LIMIT_START, LIMIT_TOT, LIMIT)
    data_1 = {
        'df_' + str(i): pd.DataFrame.from_records(
            client.get("erm2-nwe9",
                       limit=LIMIT,
                       Agency='HPD',
                       offset=offset,
                       # Explicit ordering keeps the pages stable across
                       # requests, so rows are neither repeated nor skipped.
                       order=':id',
                       select=','.join(SELECT)))
        for i, offset in enumerate(offsets)
    }

    print('processing data...')
    # Single concat over all pages instead of re-copying the growing frame
    # once per page.
    df_1 = pd.concat(list(data_1.values()), sort=False)

    print('converting to csv...')
    df_1.to_csv(CSV_PATH)
else:
    print('file exists')
    print('loading file...')

# Both paths read the frame back from disk, so `df` looks the same whether the
# data was just downloaded or was already cached.
df = pd.read_csv(CSV_PATH, low_memory=False)
basic_info.data_info(df, None)

##### Prep the borough data

In [None]:
# Load every PLUTO CSV into a dict keyed by 'df_' + the first two characters
# of the file name (e.g. 'df_QN').
PLUTO_DIR = './data/PLUTO_for_WEB/'

# Decide on the presence of *any* CSV file. Looking only at the first entry
# returned by os.listdir is unreliable: its order is arbitrary, and an empty
# folder would raise IndexError. A missing folder is treated like an empty one.
pluto_files = (sorted(f for f in os.listdir(PLUTO_DIR) if f.endswith('.csv'))
               if os.path.isdir(PLUTO_DIR) else [])

if pluto_files:
    boroughs = {'df_' + f[:2]: pd.read_csv(PLUTO_DIR + f, low_memory=False)
                for f in pluto_files}
    print(boroughs['df_QN'].head(2))
else:
    print('download data first')

##### Question #1: Top Complaints

In [None]:
# Number of complaints per complaint type, most frequent first.
complaint_counts = df['complaint_type'].value_counts()
complaint_counts

`HEATING` and `HEAT/HOT WATER` are probably the same kind of complaint, so we can combine them. Plumbing is probably pretty close too...

In [None]:
# Fold the 'HEAT/HOT WATER' label into 'HEATING' so both are counted together.
df['complaint_type'] = df['complaint_type'].replace({'HEAT/HOT WATER': 'HEATING'})
df['complaint_type'].value_counts()

##### Where are the complaints located?

In [None]:
import folium
from folium.plugins import HeatMap

# Initial view of the city-wide map: centre point and zoom level.
map_center = (40.73, -73.93)
latitude, longitude = map_center

nyc_map = folium.Map(location=list(map_center), zoom_start=10)

In [None]:
limit = 100000

# Heating complaints only; later cells reuse df_heat.
is_heating = df['complaint_type'] == 'HEATING'
df_heat = df[is_heating]

# At most `limit` rows are kept for the heat map.
df_complaint = df_heat.head(limit)
df_complaint.head(1)

In [None]:
# Rows without coordinates are dropped before the heat layer is added to the map.
heat_points = df_complaint[['latitude', 'longitude']].dropna()
HeatMap(data=heat_points, radius=8, max_zoom=10).add_to(nyc_map)
nyc_map

In [None]:
import seaborn as sns

# The 20 zip codes with the most heating complaints.
df_bar = df_heat['incident_zip'].value_counts().iloc[:20]

fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(x=df_bar.index, y=df_bar.values, order=df_bar.index, palette='Blues', ax=ax)
ax.set_title('Heat Complaint by Zip')
ax.set_xlabel('zip')
ax.set_ylabel('count')

plt.show()

##### Heat - top 5 Zips

11226 - Flatbush, Brooklyn <br/>
10467 - East Bronx <br/>
10458 - Bronx near Fordham <br/>
10468 - Fordham Heights <br/>
10453 - Near Yankee Stadium <br/>

In [None]:
# The 20 addresses with the most heating complaints.
df_str = df_heat['incident_address'].value_counts().head(20)

plt.figure(figsize=(16, 6))
sns.barplot(x=df_str.index,y=df_str.values,order=df_str.index,palette='Blues')
# The counts come from df_heat (heating complaints) grouped by
# incident_address, so the labels say "Heat" and "address".
plt.title('Heat Complaint by Address')
plt.xlabel('address')
plt.ylabel('count')
# Addresses are long labels; rotate them so they do not overlap.
plt.xticks(rotation=45)

plt.show()

In [None]:
import folium
from folium.plugins import HeatMap

latitude, longitude = 40.73, -73.93

# Base map for the Bronx heat layer.
bx_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Heating complaints whose borough is BRONX.
in_bronx = df_heat['borough'] == 'BRONX'
df_bx = df_heat[in_bronx]

# Only rows with both coordinates can be drawn.
bx_points = df_bx[['latitude', 'longitude']].dropna()
HeatMap(data=bx_points, radius=8, max_zoom=10).add_to(bx_map)
bx_map

In [None]:
#bx_str = df_bx['incident_address'].value_counts().head(20)
# test_df = df_noise[['latitude','longitude','incident_address']]

# _df = pd.DataFrame(test_df['incident_address'].value_counts()).reset_index()
# _df.columns = ['incident_address','count']

# pd.merge(_df,test_df,how='outer')

In [None]:
# df_noise['street_name'].value_counts()

In [None]:
import folium
from folium.plugins import HeatMap

latitude, longitude = 40.73, -73.93

# Base map for the Brooklyn heat layer (one zoom step closer than the other maps).
bk_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# Heating complaints whose borough is BROOKLYN.
in_brooklyn = df_heat['borough'] == 'BROOKLYN'
df_bk = df_heat[in_brooklyn]

# Only rows with both coordinates can be drawn.
bk_points = df_bk[['latitude', 'longitude']].dropna()
HeatMap(data=bk_points, radius=8, max_zoom=10).add_to(bk_map)
bk_map

##### Model complaint types - BX

In [None]:
# PLUTO columns kept for modelling; 'Address' is the key used to join with the
# complaint data, the rest are building/lot attributes.
col = [
    'Address',
    'BldgArea', 'BldgDepth', 'BuiltFAR', 'CommFAR', 'FacilFAR',
    'Lot', 'LotArea', 'LotDepth',
    'NumBldgs', 'NumFloors',
    'OfficeArea', 'ResArea', 'ResidFAR', 'RetailArea',
    'YearBuilt', 'YearAlter1',
    'ZipCode', 'YCoord', 'XCoord',
]

# Bronx PLUTO table restricted to those columns.
bx_ = boroughs['df_BX'][col]

In [None]:
# The five most frequent complaint types among Bronx complaints.
bronx_rows = df['borough'] == 'BRONX'
df.loc[bronx_rows, 'complaint_type'].value_counts().head(5).index

In [None]:
#pd.get_dummies(df_bx_['location_type']).head()

In [None]:
#df_bx = df[df['borough']=='BRONX']
complaints = [x for x in df[df['borough']=='BRONX']['complaint_type'].value_counts().head(5).index]

#tuple(complain
df_bx = df[df['borough']=='BRONX']

df_bx_ = df_bx[df_bx['complaint_type'].apply(lambda x: x.endswith(tuple(complaints)))]
#df_bx_ = df_bx[df_bx['complaint_type'].apply(lambda x: x.endswith('HEATING'))]
#df_bx_ = df_bx[df_bx['complaint_type']=='HEATING']

In [None]:
import seaborn as sns

# One row per selected Bronx complaint: address, complaint type, and one
# indicator column per complaint type.
df_bx_col = df_bx_[['incident_address','complaint_type']].rename(columns={'incident_address':'Address'})
df_bx_merged = pd.concat([df_bx_col,
                          pd.get_dummies(df_bx_col['complaint_type']),
                          #pd.get_dummies(df_bx_['location_type'])
                        ],
                axis=1)

# Join the PLUTO attributes on Address and keep complete rows only.
merged_complete = df_bx_merged.merge(bx_, on='Address').dropna()

# Correlate numeric and indicator columns only. The frame still carries string
# columns (complaint_type, and presumably Address), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 instead of dropping it.
corr_matrix = (merged_complete
               .select_dtypes(include=['number', 'bool'])
               .corr()
               .round(2))

fig, ax = plt.subplots(figsize=(20,16))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, ax=ax)

plt.show()

In [None]:
# Modelling table: complaint indicators joined to the PLUTO attributes on
# Address, complete rows only.
data = df_bx_merged.merge(bx_, on='Address').dropna()

# How many rows carry each value of the HEATING indicator.
data['HEATING'].value_counts()

In [None]:
# All-uppercase column names of `data`, leaving out 'HEATING'.
[name for name in data.columns if name.isupper() and not name == 'HEATING']

In [None]:
def roc_plot(fpr, tpr, roc_auc):
    """Plot a ROC curve together with the diagonal reference line and show it.

    fpr, tpr -- x and y values of the curve (false / true positive rates).
    roc_auc  -- area under the curve; printed in the legend with two decimals.
    """
    import matplotlib.pyplot as plt

    auc_label = f'AUC = {roc_auc:.2f}'

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label=auc_label)
    # Dashed red diagonal from (0, 0) to (1, 1) as a reference.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.legend(loc='lower right')

    # Both axes are clipped to the unit interval.
    for set_limits in (plt.xlim, plt.ylim):
        set_limits([0, 1])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

#### NB

In [None]:
#d={i: x for x,i in enumerate(data['complaint_type'].unique())}
#data['encoded'] = data['complaint_type'].apply(lambda x: d[x])

In [None]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

clf = GaussianNB()

# All-uppercase columns of `data` other than the HEATING target; these must
# not be used as features.
skip_col = [x for x in data.columns if x.isupper() and x != 'HEATING']

# Features are the columns listed in `col`, minus the 'Address' join key and
# anything in skip_col. This has to be a membership test: comparing a column
# name with the whole list (`x != skip_col`) is always true and filters nothing.
feature_cols = [x for x in col if x != 'Address' and x not in skip_col]

# NOTE(review): sklearn's normalize() rescales each row (sample) to unit norm
# by default, not each feature column -- confirm that this is intended.
X,y = normalize(data.loc[:,feature_cols]),data['HEATING']

# 60/40 train/test split with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=42)

In [None]:
# Fit the classifier on the training split, then predict the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out split.
nb_report = classification_report(y_test, y_pred)
print(nb_report)

In [None]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.calibration import CalibratedClassifierCV

# Seeded so that re-running the notebook reproduces the same model (the
# Perceptron shuffles its training data between epochs).
per = Perceptron(random_state=42)

# probability=True makes SVC fit an extra, randomised calibration step, hence
# the seed. gamma has no effect with the linear kernel; it is kept unchanged.
clf1 = SVC(C=0.1, kernel='linear', gamma=1, class_weight='balanced', probability=True,
           random_state=42)
clf2 = GaussianNB()
# Perceptron has no predict_proba, so it is wrapped in a calibrator to take
# part in soft voting.
clf3 = CalibratedClassifierCV(per, method='isotonic')
#clf3 = RandomForestClassifier(n_estimators=50, random_state=1, max_features=8, min_samples_leaf=8)

# Soft voting: the ensemble averages the members' predicted probabilities.
eclf = VotingClassifier(estimators=[('svc', clf1), 
                                    ('gnb', clf2),
                                    ('per', clf3)],
                                    voting='soft')

In [None]:
# Fit and evaluate on the first 10,000 rows of each split only.
SUBSET_ROWS = 10000
t_X_train, t_y_train = X_train[:SUBSET_ROWS], y_train[:SUBSET_ROWS]
t_X_test, t_y_test = X_test[:SUBSET_ROWS], y_test[:SUBSET_ROWS]

# Class predictions from the voting ensemble.
t_y_pred = eclf.fit(t_X_train, t_y_train).predict(t_X_test)
print(classification_report(t_y_test, t_y_pred))

# The ROC curve is built from the probabilities in column 1 of predict_proba.
probs = eclf.predict_proba(t_X_test)
preds = probs[:, 1]
fpr, tpr, threshold = roc_curve(t_y_test, preds)
roc_auc = auc(fpr, tpr)
roc_plot(fpr, tpr, roc_auc)

In [None]:
# eclf.fit(X_train,y_train)
# y_pred = eclf.predict(X_test)
# print(classification_report(y_test, y_pred))

# probs = eclf.predict_proba(X_test)
# preds = probs[:,1]
# fpr, tpr, threshold = roc_curve(y_test, preds)
# roc_auc = auc(fpr, tpr)
# roc_plot(fpr,tpr,roc_auc)

#### RF

In [None]:
#param tune RFC
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search space for the random forest. randint(1, 9) draws integers from 1 to 8
# (the upper bound is exclusive).
param_dist = {'max_depth': [3,None],
              'max_features': randint(1,9),
              'min_samples_leaf': randint(1,9),
              'criterion': ['gini','entropy']
         }

# Both the forest and the parameter sampler are random; seeding them makes a
# re-run of the notebook reproduce the same search and the same best score.
tree = RandomForestClassifier(n_estimators=50, random_state=42)

tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, random_state=42)

# 5-fold cross-validated search over the full X, y.
tree_cv.fit(X,y)

# The tuned estimator is a random forest, so the label says so.
print('Tuned random forest parameters: {}'.format(tree_cv.best_params_))
print('Best score is: {}'.format(tree_cv.best_score_))

#### NN

In [None]:
from sklearn.linear_model import Perceptron

# `data` has no 'encoded' column (the only code that would create it is
# commented out), so the integer labels are built here: one code per complaint
# type, numbered in order of first appearance. `data` itself is not modified.
encoded = pd.Series(pd.factorize(data['complaint_type'])[0],
                    index=data.index,
                    name='encoded')

# Features: every column listed in `col` except the 'Address' join key.
X,y = normalize(data.loc[:,[x for x in col if x != 'Address']]),encoded
# NOTE: this rebinds X, y and the train/test splits used by the earlier cells
# (now an 80/20 split on the multi-class target).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Seeded so that a re-run gives the same model.
perceptron = Perceptron(random_state=42)
perceptron.fit(X_train,y_train)

y_pred = perceptron.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 100 units. The seed fixes the random weight
# initialisation so that a re-run of the notebook gives the same report.
mlp = MLPClassifier(hidden_layer_sizes=(100,), random_state=42)
mlp.fit(X_train,y_train)

# Evaluate on the held-out split.
y_pred = mlp.predict(X_test)
print(classification_report(y_test,y_pred))