# Index baseline

In [None]:
%config IPCompleter.greedy=True

import yaml
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psycopg2
#from os import path
import os
import os.path
import sys
import numpy as np
from sqlalchemy import create_engine

# Pandas display: print at most 50 rows, never truncate columns
pd.options.display.max_rows = 50
pd.options.display.max_columns = None

# Paths: directory holding the DB credential files, and the current working directory
credential_dir = os.path.join(
    '/mnt/data/projects/el_salvador_mined_education',
    'ana',
    'db_credentials',
)
home_dir = os.getcwd()

In [None]:
def create_pgconn(credentials_yaml):
    """Open a PostgreSQL connection described by a YAML credentials file.

    The YAML file must provide the keys ``database``, ``user``, ``host``,
    ``password`` and ``role``. After connecting, the session role is switched
    with ``SET ROLE <role>``.

    Args:
        credentials_yaml: Path to the YAML credentials file.

    Returns:
        The open ``psycopg2`` connection.

    Raises:
        psycopg2.Error: If connecting or switching role fails. The message
            "Error connecting to db." is still printed on connection failure,
            but the error is re-raised instead of falling through to an
            unbound ``conn``.
    """
    with open(credentials_yaml) as f:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary objects on older versions.
        configs = yaml.safe_load(f)

    try:
        # Keyword arguments avoid hand-building a DSN string, which breaks on
        # values containing quotes or spaces (e.g. in the password).
        conn = psycopg2.connect(
            dbname=configs['database'],
            user=configs['user'],
            host=configs['host'],
            password=configs['password'],
        )
    except psycopg2.Error:
        print("Error connecting to db.")
        raise

    try:
        # The cursor context manager closes the cursor once the role is set.
        # NOTE(review): the role name is concatenated into the statement; this
        # is fine only as long as the credentials file is trusted.
        with conn.cursor() as cur:
            cur.execute("SET ROLE " + configs['role'])
    except psycopg2.Error:
        # Do not leak a half-configured connection.
        conn.close()
        raise
    return conn
# Credentials file (example file on hitchhikers repo)
credentials_yaml = os.path.join(
    credential_dir,
    'avaldivia_elsalvador.yaml',
)
# Module-level connection, used as the default by the sql() helper
conn = create_pgconn(credentials_yaml)
def sql(query, conn=conn):
    """Run ``query`` and return the result as a pandas DataFrame.

    Args:
        query: SQL text to execute.
        conn: Database connection to use; defaults to the module-level
            ``conn`` as it was when this function was defined.

    Returns:
        The DataFrame produced by ``pd.read_sql``.
    """
    frame = pd.read_sql(query, con=conn)
    return frame

In [None]:
# Index data
# Load student, year_range, school, school_index_avg and label by joining
# results.features_aggregate (a) to staging.labels (b) on student and
# year_range, restricted to year ranges whose lower bound is in 2016.
# NOTE(review): the "a.school_index_avg IS NOT NULL" predicate appears twice
# in the WHERE clause; the second occurrence is redundant.
# NOTE(review): because of "b.label IS NOT NULL", rows without a matching
# label are dropped, so the LEFT JOIN effectively behaves like an inner join.
df = sql("""
    SELECT a.student, a. year_range, a.school, a.school_index_avg, b.label from results.features_aggregate a LEFT JOIN staging.labels b
    ON a.student = b.student AND a.year_range = b.year_range
    WHERE a.school_index_avg IS NOT NULL 
    AND b.label IS NOT NULL
    AND  a.school_index_avg IS NOT NULL
    AND extract(year from lower(a.year_range)) = '2016'; 
""")



In [None]:
# Ground-truth labels and the baseline score (school_index_avg) to evaluate
y_test = df["label"]
y_score = df["school_index_avg"]

In [None]:
# Function for ploting prec and recall at k
from sklearn.metrics import precision_recall_curve
def plot_precision_recall_n(y_true, y_prob, model_name, output_type):
    """Plot precision and recall against the fraction of the population scored.

    For every threshold returned by ``precision_recall_curve`` the x-value is
    the share of observations whose score is at or above that threshold.
    Precision is drawn in blue on the left axis and recall in red on the
    right axis.

    Args:
        y_true: Binary ground-truth labels.
        y_prob: Scores, one per observation (higher means more likely
            positive). Values are converted to a flat float array.
        model_name: Used as the output file name when ``output_type`` is
            ``'save'``.
        output_type: ``'save'`` writes the figure to ``model_name``; any
            other value displays the figure with ``plt.show()``.

    Returns:
        None.
    """
    y_score = np.asarray(y_prob, dtype=float).ravel()
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    # The curves carry one trailing point with no matching threshold; drop it
    # so they line up with pr_thresholds.
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]

    # Share of scores >= each threshold. With the scores sorted, the count of
    # scores strictly below a threshold is its left insertion point, so one
    # sort + searchsorted replaces a full scan of the scores per threshold.
    sorted_scores = np.sort(y_score)
    number_scored = len(sorted_scores)
    num_above_thresh = number_scored - np.searchsorted(
        sorted_scores, pr_thresholds, side='left')
    pct_above_per_thresh = num_above_thresh / float(number_scored)

    # plt.subplots() already creates a fresh figure, so no plt.clf() here
    # (it would only clear or create an unrelated, empty figure).
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    # Both y-axes share the [0, 1] range so the two curves are comparable.
    ax1.set_ylim([0, 1])
    ax2.set_ylim([0, 1])
    ax2.set_xlim([0, 1])

    if output_type == 'save':
        # The file name comes from model_name (previously an undefined
        # variable `name` was used, which raised NameError).
        fig.savefig(model_name)
    else:
        plt.show()

In [None]:
# Draw the precision/recall-at-k curves for the baseline score
plot_precision_recall_n(
    y_test,
    y_score,
    model_name="Baseline_DT",
    output_type='show',
)

In [None]:
# Area under the ROC curve of the baseline score against the labels
from sklearn.metrics import roc_auc_score
print('AUC:', roc_auc_score(y_true=y_test, y_score=y_score))