# EDA

### An exploratory data analysis of Salvadoran students

In [None]:
%config IPCompleter.greedy=True

import yaml
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psycopg2
import numpy as np
#from os import path
import os
import os.path
import sys
from sqlalchemy import create_engine

# Pandas display: print at most 50 rows, never hide columns.
pd.options.display.max_rows = 50
pd.options.display.max_columns = None

# Paths: the notebook's working directory, and the folder on the shared
# project drive that holds the database credential files.
home_dir = os.getcwd()
project_root = '/mnt/data/projects/el_salvador_mined_education'
credential_dir = os.path.join(project_root, 'ana', 'db_credentials')

In [None]:
def create_pgconn(credentials_yaml):
    """Open a PostgreSQL connection described by a YAML credentials file.

    The file must contain the keys ``database``, ``user``, ``host``,
    ``password`` and ``role``. After connecting, the session role is
    switched with ``SET ROLE <role>``.

    Parameters
    ----------
    credentials_yaml : str
        Path to the YAML credentials file.

    Returns
    -------
    psycopg2 connection
        An open connection whose session role has been set.

    Raises
    ------
    psycopg2.Error
        If the connection cannot be established (a short message is printed
        first) or if ``SET ROLE`` fails (the connection is closed first).
    KeyError
        If a required key is missing from the credentials file.
    """
    with open(credentials_yaml) as f:
        # safe_load only builds plain Python objects. yaml.load() without an
        # explicit Loader is deprecated since PyYAML 5.1 and fails on 6.x.
        configs = yaml.safe_load(f)

    try:
        # Keyword arguments instead of a hand-quoted DSN string, so quotes
        # or backslashes in a credential cannot corrupt the connection string.
        conn = psycopg2.connect(
            dbname=configs['database'],
            user=configs['user'],
            host=configs['host'],
            password=configs['password'])
    except psycopg2.Error:
        # Re-raise the real error; swallowing it would only defer the
        # failure to the first use of the undefined connection.
        print("Error connecting to db.")
        raise

    try:
        with conn.cursor() as cur:
            # The role comes from the trusted credentials file, not user input.
            cur.execute("SET ROLE " + configs['role'])
        # Commit so the role change survives a later rollback (e.g. after a
        # failed query) and the session is not left idle in a transaction.
        conn.commit()
    except Exception:
        conn.close()
        raise
    return conn
# Example credentials file (see the hitchhikers repo), looked up inside credential_dir.
credentials_filename = 'avaldivia_elsalvador.yaml'
credentials_yaml = os.path.join(credential_dir, credentials_filename)
# Shared connection; the query helper defined next uses it as its default.
conn = create_pgconn(credentials_yaml)
def sql(query, conn=conn):
    """Run ``query`` on ``conn`` and return the result as a pandas DataFrame.

    ``conn`` defaults to the module-level connection as it exists when this
    function is defined; re-run this definition after reconnecting.
    """
    result = pd.read_sql(sql=query, con=conn)
    return result

Let's get all the columns from all tables in schema 'cleaned':

### Take a look at student_registration

### Take a look at the column variables

In [None]:
# Share of students whose own department code differs from their school's.
# Each query returns a single count of distinct `nie` values (presumably the
# student identifier - confirm), so the ratio below is a 1x1 DataFrame.
diff_dpto_query = """
    select count(distinct(nie))
    from cleaned.students_labelled
    where dpto_code_student != dpto_code_ce;
"""
total_students_query = """
    select count(distinct(nie))
    from cleaned.students_labelled;
"""

count_students_diff_dpto = sql(diff_dpto_query)
count_total_students = sql(total_students_query)

count_students_diff_dpto.div(count_total_students)

In [None]:
# Render figures locally inside the notebook. `plotly.plotly` was removed from
# the plotly package in version 4 (it moved to the separate chart-studio
# package) and its iplot() uploads the figure to a hosted service.
# plotly.offline offers the same iplot(fig, validate=...) call without any
# upload, so the `py` alias used by the plotting cells keeps working.
import plotly.offline as py

# Load plotly.js into the notebook so offline figures can be displayed.
py.init_notebook_mode()

In [None]:
# Number of rows per (student department, school department) pair, excluding
# students whose department is recorded as 'None' or 'EXTRANJERO'.
# The filters only involve plain columns, so they belong in WHERE (applied
# before grouping) rather than HAVING (meant for conditions on aggregates).
# The result set is unchanged.
count_students_dpto = sql("""
    select dpto_name_student, dpto_name_ce, count(*)
    from cleaned.students_labelled
    where (dpto_name_student != 'None') and (dpto_name_student != 'EXTRANJERO')
    group by dpto_name_student, dpto_name_ce;
""")

In [None]:
# create data table for diagram
df = count_students_dpto.copy()

# Department name -> 1-based position; this order defines the node order.
dptos = {
    'AHUACHAPAN': 1, 'SANTA ANA': 2, 'SONSONATE': 3, 'CHALATENANGO': 4,
    'LA LIBERTAD': 5, 'SAN SALVADOR': 6, 'CUSCATLAN': 7, 'LA PAZ': 8,
    'CABAÑAS': 9, 'SAN VICENTE': 10, 'USULUTAN': 11, 'SAN MIGUEL': 12,
    'MORAZAN': 13, 'LA UNION': 14,
}

# Student departments become node ids 0-13, school departments 14-27.
# Names missing from `dptos` map to NaN.
student_node = df["dpto_name_student"].map(dptos) - 1
school_node = df["dpto_name_ce"].map(dptos) - 1
df["source"] = student_node
df["target"] = school_node + 14

# One link colour per student department, listed in `dptos` order. It is used
# when the school is in a different department; same-department links (and
# anything unmatched) get the neutral grey.
link_palette = [
    '#e818a5', '#854cef', '#3710f1', '#109df1', '#0faaa6', '#23cf70', '#42a62d',
    '#97cc31', '#e6ee00', '#f9be00', '#f99200', '#f93a00', '#f900df', '#a800ff',
]
neutral_link = '#e4e4e4'

conditions = []
choices = []
for dpto_name, link_color in zip(sorted(dptos, key=dptos.get), link_palette):
    lives_here = df['dpto_name_student'] == dpto_name
    # per department: first "studies elsewhere", then "studies at home"
    conditions.append(lives_here & (df['dpto_name_ce'] != dpto_name))
    conditions.append(lives_here & (df['dpto_name_ce'] == dpto_name))
    choices.extend([link_color, neutral_link])

df['link colors'] = np.select(conditions, choices, default=neutral_link)

df.head()

In [None]:
# Sankey diagram of student department (left) -> school department (right).

# Drop incomplete links row by row. Calling .dropna() on each column
# separately removes different rows from each one, which shifts source,
# target and value out of alignment as soon as a department name is unmapped.
links = df.dropna(subset=['source', 'target', 'count'])

# The 14 departments in node order. Each appears twice in the diagram:
# node ids 0-13 on the student side and 14-27 on the school side.
dept_labels = ['AHUACHAPAN', 'SANTA ANA', 'SONSONATE', 'CHALATENANGO',
               'LA LIBERTAD', 'SAN SALVADOR', 'CUSCATLAN', 'LA PAZ',
               'CABAÑAS', 'SAN VICENTE', 'USULUTAN', 'SAN MIGUEL',
               'MORAZAN', 'LA UNION']
node_colors = ['#ff791c', '#854cef', '#3710f1', '#109df1',
               '#fbec00', '#c21e86', '#42a62d', '#97cc31',
               '#e6ee00', '#f9be00', '#f99200', '#f93a00',
               '#f900df', '#a800ff']

# NOTE(review): df['link colors'] is computed earlier but is not passed as
# link.color here - confirm whether that is intended.
data_trace = dict(
    type = 'sankey',
    domain = dict(
      x =  [0,1],
      y =  [0,1]
    ),
    orientation = "h",
    valueformat = ".0f",
    node = dict(
      pad = 10,
      thickness = 30,
      line = dict(
        color = "black",
        width = 0
      ),
      label = dept_labels * 2,
      color = node_colors * 2
    ),
    link = dict(
      # node ids are whole numbers once the incomplete rows are gone
      source = links['source'].astype(int),
      target = links['target'].astype(int),
      value = links['count']
  )
)

layout =  dict(
    title = "Departments of students and schools",
    height = 772,
    font = dict(
      size = 10
    ),
)

fig = dict(data=[data_trace], layout=layout)
py.iplot(fig, validate=False)

In [None]:
# Same counts per (student department, school department) pair, restricted to
# pairs where the two departments differ.
# The filters only involve plain columns, so they belong in WHERE (applied
# before grouping) rather than HAVING (meant for conditions on aggregates).
# The result set is unchanged.
count_students_dpto_diff = sql("""
    select dpto_name_student, dpto_name_ce, count(*)
    from cleaned.students_labelled
    where (dpto_name_student != 'None') and (dpto_name_student != 'EXTRANJERO') and (dpto_name_student <> dpto_name_ce)
    group by dpto_name_student, dpto_name_ce;
""")

In [None]:
# Link table for the second diagram (students studying outside their own department).
df = count_students_dpto_diff.copy()

# Department name -> 1-based position; this order defines the node order.
dptos = {'AHUACHAPAN': 1, 'SANTA ANA': 2, 'SONSONATE': 3, 'CHALATENANGO' : 4, 
           'LA LIBERTAD' : 5, 'SAN SALVADOR' : 6, 'CUSCATLAN' : 7, 'LA PAZ' : 8,
          'CABAÑAS' : 9, 'SAN VICENTE' : 10, 'USULUTAN' : 11, 'SAN MIGUEL' : 12,
          'MORAZAN' : 13, 'LA UNION' : 14}

# Student departments become node ids 0-13, school departments 14-27.
df["source"] = df["dpto_name_student"].map(dptos) - 1
df["target"] = (df["dpto_name_ce"].map(dptos) - 1) + 14

# Only the last expression of a cell is rendered, so a bare `df.head()` here
# would be silently discarded; display the preview explicitly.
display(df.head())
df.loc[df["dpto_name_student"] == 'AHUACHAPAN']

In [None]:
# Sankey diagram of student department (left) -> school department (right).

# Drop incomplete links row by row. Calling .dropna() on each column
# separately removes different rows from each one, which shifts source,
# target and value out of alignment as soon as a department name is unmapped.
links = df.dropna(subset=['source', 'target', 'count'])

# The 14 departments in node order. Each appears twice in the diagram:
# node ids 0-13 on the student side and 14-27 on the school side.
dept_labels = ['AHUACHAPAN', 'SANTA ANA', 'SONSONATE', 'CHALATENANGO',
               'LA LIBERTAD', 'SAN SALVADOR', 'CUSCATLAN', 'LA PAZ',
               'CABAÑAS', 'SAN VICENTE', 'USULUTAN', 'SAN MIGUEL',
               'MORAZAN', 'LA UNION']
node_colors = ['#ff791c', '#854cef', '#3710f1', '#109df1',
               '#fbec00', '#c21e86', '#42a62d', '#97cc31',
               '#e6ee00', '#f9be00', '#f99200', '#f93a00',
               '#f900df', '#a800ff']

data_trace = dict(
    type = 'sankey',
    domain = dict(
      x =  [0,1],
      y =  [0,1]
    ),
    orientation = "h",
    valueformat = ".0f",
    node = dict(
      pad = 10,
      thickness = 30,
      line = dict(
        color = "black",
        width = 0
      ),
      label = dept_labels * 2,
      color = node_colors * 2
    ),
    link = dict(
      # node ids are whole numbers once the incomplete rows are gone
      source = links['source'].astype(int),
      target = links['target'].astype(int),
      value = links['count']
  )
)

layout =  dict(
    title = "Departments of students and schools",
    height = 772,
    font = dict(
      size = 10
    ),
)

fig = dict(data=[data_trace], layout=layout)
py.iplot(fig, validate=False)

In [None]:
# Analyze the students who travel far away: those registered in AHUACHAPAN whose school is in LA UNION

In [None]:
# Every row for students registered in AHUACHAPAN whose school is in LA UNION.
far_students_query = """
    select *
    from cleaned.students_labelled
    where (dpto_name_student = 'AHUACHAPAN') and (dpto_name_ce = 'LA UNION');
"""
students_far = sql(far_students_query)
students_far

In [None]:
# All rows recorded for one `nie` value, to inspect a single case in detail.
single_student_query = """
    select *
    from cleaned.students_labelled
    where nie = 1998058;
"""
sql(single_student_query)
