## Cincinnati Blight: Retrain Models
____

In [None]:
import pandas as pd
import numpy as np
import sklearn
import os

# Config & database
from sqlalchemy import create_engine
import yaml
import pymongo
from pymongo import MongoClient

from lib_cinci.train_and_predict import main, predict_on_date
from lib_cinci import dataset


In [None]:
# Configuration and DB connection
from sklearn_evaluation.Logger import Logger


def load(name):
    """Parse a YAML file that lives under $ROOT_FOLDER.

    :param name: file name relative to the ROOT_FOLDER environment variable
    :type name: str
    :returns: the parsed YAML document
    :raises KeyError: if ROOT_FOLDER is not set in the environment
    """
    folder = os.environ['ROOT_FOLDER']
    path = os.path.join(folder, name)
    # The context manager closes the handle; safe_load never builds arbitrary
    # Python objects and works on PyYAML versions where load() needs a Loader.
    with open(path, 'r') as f:
        return yaml.safe_load(f)


# Keep the parsed configuration under its own name: binding it to `main`
# would overwrite the `main` function imported from
# lib_cinci.train_and_predict, which the retraining loop calls.
config = load('config.yaml')

connparams = config['db']
uri = '{dialect}://{user}:{password}@{host}:{port}/{database}'.format(**connparams)
libpq_uri = 'dbname={database} user={user} host={host} password={password} port={port}'.format(**connparams)


engine = create_engine(uri)
logger = Logger(host=config['logger']['uri'], db=config['logger']['db'],
                collection=config['logger']['collection'])

In [None]:
# Model group numbers to retrain; the order here is the order they are processed in.
model_groups = [
    18711, 1120, 14613, 5716, 27039, 7111,
    28879, 26523, 1309, 12547, 10062, 28108,
    11814, 7068, 29230, 25683, 7520,
]

In [None]:
# Length of each per-model list of highest-scoring parcels kept below
k = 7500 # top 5% of parcels

In [None]:
# Grouped model results; the model-id and model-name columns are found by prefix.
models = pd.read_csv('model-results-grouped.csv')
model_id_cols = list(filter(lambda column: column.startswith('model_id'), models.columns))
model_name_cols = list(filter(lambda column: column.startswith('name'), models.columns))

In [None]:
# Get neighborhood information to save with predictions.
# Indexed by parcel_id so it can be joined onto the per-parcel predictions.
parcel_info = pd.read_csv('parcels_with_neighborhood_31aug2016.csv', index_col='parcel_id')

In [None]:
# Collects the top-k prediction frames, keyed by model group (as a string)
# plus an optional subset suffix.
all_top_k = dict()

In [None]:
# Retrain every model group, keep its top-k parcels (overall and restricted to
# parcels with low inspection density) and write its feature importances.
for m in model_groups:

    model_group = str(m)
    # NOTE(review): m is used as a positional row number into `models`
    # (.iloc) -- confirm the CSV has one row per model group in that order.
    model_id = str(models[model_id_cols].iloc[m].dropna().values[0])

    # Retrain model and get predictions on all parcels
    trained_model_df, trained_model_dict = main(model_id=model_id,
                                                train_end_date='30Aug2016',
                                                prediction_schema='features_31aug2016',
                                                return_features=True, return_fitted=True)
    trained_model_df.sort_values('prediction', ascending=False, inplace=True)

    # Add neighborhood metrics for each parcel and keep the top k overall.
    # The frame is sorted by prediction, so head(k) is the top k.
    model_predictions = trained_model_df[['prediction']].join(parcel_info)
    all_top_k[model_group] = model_predictions.head(k)

    # Save feature importances to CSV
    # (every column except the last is paired with an importance value)
    feature_importances = pd.DataFrame(data=[trained_model_df.columns[:-1],
                                             trained_model_dict['model'].feature_importances_]).T
    feature_importances.columns = ['feature', 'feature_importance']
    output_path = os.path.join('feature_importances', 'feature_importances_' + model_group + '.csv')
    feature_importances.to_csv(output_path)

    # Get list of top k parcels below median ID (inspection density).
    # The median must be computed before the mask that uses it.
    inspection_density_median = model_predictions['inspection_density'].median()
    median_mask = model_predictions['inspection_density'] < inspection_density_median
    below_median_ID = model_predictions[median_mask].head(k)
    all_top_k[model_group + ' Below Median ID'] = below_median_ID

    # Get list of top k parcels below first quartile ID
    inspection_density_first_quartile = model_predictions['inspection_density'].quantile(0.25)
    first_quartile_mask = model_predictions['inspection_density'] < inspection_density_first_quartile
    below_quartile_ID = model_predictions[first_quartile_mask].head(k)
    all_top_k[model_group + ' Below First Quaritle ID'] = below_quartile_ID
    

In [None]:
# Stack every saved top-k list into one frame and write it out.
# The lists live in `all_top_k`; no `top_k` is defined in this notebook.
all_top5 = pd.concat(list(all_top_k.values()))
all_top5['violations_per_house'] = all_top5['violation_rate'] * all_top5['inspection_density']
# 'Unnamed: 0' is presumably the unnamed index column carried over from the
# parcel CSV -- TODO confirm.
all_top5.drop('Unnamed: 0', axis=1, inplace=True)
all_top5.to_csv('all_top5.csv')

# Make Feature Crosstabs 

In [None]:
# List the names of all tables in the features_31aug2016 schema
query = '''
        SELECT DISTINCT (table_name) 
        FROM information_schema.tables 
        WHERE table_schema = 'features_31aug2016';
        '''
# One row per table, in a single `table_name` column
all_tables = pd.read_sql(query, engine)

In [None]:
# Read every table of the features_31aug2016 schema into a dict of frames,
# keyed by table name and indexed by parcel_id.
all_features = {}

for t in all_tables.table_name.tolist():

    query = 'SELECT * FROM features_31aug2016.{table};'.format(table=t)
    features = pd.read_sql(query, engine, index_col='parcel_id')

    # Prefix each column with its table name so columns stay distinguishable
    # once the tables are combined.
    features.columns = ['.'.join([t, str(col)]) for col in features.columns]
    all_features[t] = features

In [None]:
# Replace the dict of per-table frames with a single frame, concatenated
# along the default (row) axis.
all_features = pd.concat(list(all_features.values()))

In [None]:
# Mean of every feature column over all rows of all_features.
all_features_mean = all_features.mean(axis=0)
# Add a `model_group` entry of 1.0 to the means. Series.append was removed in
# pandas 2.0; pd.concat gives the same result on old and new versions.
all_features_mean = pd.concat([all_features_mean,
                               pd.Series([1.0], index=['model_group'])])

In [None]:
# Reshape the Series of means into a one-row DataFrame (one column per entry)
all_features_mean_df = all_features_mean.to_frame().T

In [None]:
# For each model group, average every feature over the parcels in its top-k
# list, and also express that average as a ratio to the all-parcel mean.
feature_averages = {}

for m in model_groups:
    # all_top_k is keyed by the model group number as a string
    list_name = str(m)

    in_top_k = all_features.index.isin(all_top_k[list_name].index)
    model_features = all_features[in_top_k].mean(axis=0)
    model_features = model_features.to_frame().T

    # Compute the ratio before any label columns are added: the labels are
    # written into `model_features` itself, and string columns must not take
    # part in the division.
    ratio = model_features.divide(all_features_mean_df, axis=1)

    model_features['model_group'] = m
    model_features['subset'] = 'Top 5 Average'
    model_features['list'] = 'All Parcels'
    feature_averages[list_name + ' Top 5'] = model_features

    ratio['model_group'] = m
    ratio['subset'] = 'Ratio'
    ratio['list'] = 'All Parcels'
    feature_averages[list_name + ' Ratio'] = ratio
            

In [None]:
# Stack the per-model rows, add the all-parcel means as a final row, and
# write the table transposed (one row per feature).
crosstabs = pd.concat(list(feature_averages.values()))
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
cts = pd.concat([crosstabs, all_features_mean_df])
# Setting then resetting the index moves the three label columns to the front.
cts.set_index(['model_group','list','subset'], inplace=True)
cts.reset_index(inplace=True)
# NOTE(review): the all-parcel mean row has no 'list'/'subset' value, so its
# label here presumably comes out as NaN -- confirm that is intended.
cts['new_index'] = cts['model_group'].map(int).map(str) + ' ' + cts['list'] + ' ' + cts['subset']
cts.set_index('new_index', inplace=True)
cts.T.to_csv('feature_crosstabs.csv')

# Pairwise Overlap of Top *k* Parcels Between Model Lists

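Compute the Jaccard matrix for all of the models. The *ij*th element of this matrix is the size of the intersection of the top *k* for models *i* and *j* divided by the size of their union. (With its default `percent=True`, `compute_similarity` below instead divides the intersection by *k*, giving the fraction of the top *k* that the two models share.) If models are sorting mostly the same parcels to the top *k*, then they are pretty much equivalent.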

In [None]:
# From Joco batch_evaluator
def compute_similarity(prediction_matrix, percent=True, k=None):
    """Return a symmetric matrix of pairwise overlap between model lists.

    Each column of ``prediction_matrix`` is one model's list of top-ranked
    individuals. For every pair of columns the overlap is either the share
    of the list the two models have in common (``percent=True``) or the
    Jaccard similarity, intersection over union (``percent=False``).

    :param prediction_matrix: lists of top X individuals with highest risk
                              scores according to different models, one
                              model per column
    :type prediction_matrix: pandas DataFrame
    :param percent: if True divide the intersection size by ``k``; if False
                    divide it by the size of the union
    :type percent: bool
    :param k: list length used as the denominator when ``percent`` is True;
              defaults to the number of rows in ``prediction_matrix`` so the
              function does not depend on a notebook-level global
    :type k: int or None
    :returns: similarity matrix indexed and labelled by the column names;
              an entry is NaN when its denominator is zero
    :rtype: pandas DataFrame
    """
    columns = list(prediction_matrix.columns)
    if k is None:
        k = len(prediction_matrix)

    # Build each column's set once rather than once per pair.
    members = {col: set(prediction_matrix[col]) for col in columns}

    similarity = pd.DataFrame(index=columns, columns=columns, dtype=float)
    for position, col_a in enumerate(columns):
        # Only the upper triangle is computed; the value is mirrored below.
        for col_b in columns[position:]:
            shared = len(members[col_a] & members[col_b])
            if percent:
                denominator = k
            else:
                denominator = len(members[col_a] | members[col_b])

            if denominator:
                score = shared / float(denominator)
            else:
                score = float('nan')

            similarity.loc[col_a, col_b] = score
            similarity.loc[col_b, col_a] = score

    return similarity.astype(float)

In [None]:
# Display label for each model group: '<prefix> (<group number>)'.
# Groups are listed under the prefix their label starts with.
top_k_keys = {
    group: '%s (%d)' % (prefix, group)
    for prefix, groups in (
        ('ID', (10062, 12547, 7111, 28108, 11814)),
        ('P@5', (18711, 1120, 14613, 5716)),
        ('All 3', (7520, 1309, 26523, 28879)),
        ('VR', (27039, 7068, 29230, 25683)),
    )
    for group in groups
}

In [None]:
# Parcel ids of each labelled model group's overall top-k list.
# `all_top_k` is keyed by the group number as a string (and also holds the
# 'Below ...' subsets), while `top_k_keys` is keyed by the integer group
# number, so look each labelled group up explicitly.
d = {group: all_top_k[str(group)].index.values for group in top_k_keys}
top_k_dict = {top_k_keys[group]: parcel_ids for group, parcel_ids in d.items()}

In [None]:
# One column of parcel ids per labelled model, then the pairwise overlap
# between those columns.
prediction_matrix = pd.DataFrame(top_k_dict)
df = compute_similarity(prediction_matrix=prediction_matrix)

In [None]:
# Draw the similarity matrix `df` as a heatmap and save the figure.
# NOTE(review): `sns` and `plt` are not imported in the visible cells --
# presumably seaborn and matplotlib.pyplot; confirm they are imported.
cmap = sns.cubehelix_palette(dark=0, light=1, start=.5, rot=-.75, as_cmap=True)

f,ax = plt.subplots(figsize=(20, 15))

# Colour scale is fixed to [0, 1]
ax = sns.heatmap(df,linewidths=0.5, vmin=0, vmax=1, cmap=cmap)
plt.xticks(rotation=45, ha='right')
# NOTE(review): `p` and `date_tag` are not defined in the visible cells --
# they must be set before this cell runs.
ax.figure.savefig('percent_similarity_top' + str(p) + '_' + date_tag)