# Data Exploration

### An exploratory data analysis of Salvadoran students

In [None]:
%config IPCompleter.greedy=True

import yaml
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psycopg2
#from os import path
import os
import os.path
import sys
from sqlalchemy import create_engine

# Notebook display settings: show at most 50 rows and never truncate columns.
pd.options.display.max_rows = 50
pd.options.display.max_columns = None

# Locations used below: the directory the notebook runs from, and the
# directory the database credentials file is read from.
home_dir = os.getcwd()
credential_dir = os.path.join(
    '/mnt/data/projects/el_salvador_mined_education',
    'ana',
    'db_credentials',
)

In [None]:
def create_pgconn(credentials_yaml):
    """Open a psycopg2 connection using settings read from a YAML file.

    Args:
        credentials_yaml: Path to a YAML file providing the keys
            ``database``, ``user``, ``host``, ``password`` and ``role``.

    Returns:
        An open psycopg2 connection on which ``SET ROLE <role>`` has been
        executed and committed.

    Raises:
        psycopg2.Error: If the connection cannot be established or the role
            cannot be set.
        KeyError: If a required key is missing from the YAML file.
    """
    with open(credentials_yaml) as f:
        # safe_load builds only plain Python objects; yaml.load without an
        # explicit Loader is unsafe and is rejected by PyYAML >= 6.
        configs = yaml.safe_load(f)

    # The parsed credentials are deliberately not printed: they include the
    # database password, and notebook output is easily shared.

    # Keyword arguments avoid hand-building a DSN string, which breaks when
    # a value contains a quote or a space.
    connect_kwargs = {
        'dbname': configs['database'],
        'user': configs['user'],
        'host': configs['host'],
        'password': configs['password'],
    }

    try:
        conn = psycopg2.connect(**connect_kwargs)
    except psycopg2.Error:
        print("Error connecting to db.")
        # Re-raise: without a connection there is nothing to continue with,
        # and the real cause should reach the caller.
        raise

    with conn.cursor() as cur:
        # NOTE: the role name is concatenated into the statement, so it must
        # come from a trusted credentials file.
        cur.execute("SET ROLE " + configs['role'])
    # Commit so that a later rollback() (e.g. after a failed query) does not
    # undo the role change.
    conn.commit()
    return conn
# Path to the credentials file (example file on the hitchhiker's repo) and
# the module-level connection that `sql` uses by default.
credentials_yaml = os.path.join(credential_dir, 'avaldivia_elsalvador.yaml')
conn = create_pgconn(credentials_yaml)
def sql(query, conn=conn):
    """Run ``query`` over ``conn`` and return the result as a DataFrame.

    ``conn`` defaults to the module-level connection that exists at the
    time this function is defined.
    """
    return pd.read_sql(sql=query, con=conn)

Let's get all the columns from all tables in schema 'cleaned':

### Take a look at student_registration_all

In [None]:
# Preview the first three rows of the student registration table.
total_students = sql(query="""
    select * 
    from raw.student_registration_all limit 3;
""")
total_students

### Students registration per year

In [None]:
# Count the distinct `nie` values per `anio` (year).
# ORDER BY keeps the rows, and the line plot built from them, in
# chronological order; GROUP BY alone guarantees no particular row order.
students_year = sql("""
    select anio, count(distinct nie)
    from raw.student_registration_all
    group by anio
    order by anio;
""")
students_year

In [None]:
# Pass figsize to DataFrame.plot so it sizes the figure pandas actually draws
# on. A separate plt.figure() call only leaves an empty figure behind,
# because .plot() without `ax` creates its own figure.
students_year_plot = students_year.plot(
    x='anio',
    y='count',
    title='Total students per year',
    sharex=False,
    figsize=(5, 5),
)
students_year_plot.set_xlabel("year")
students_year_plot.grid(True)

### Fixing x axis...

First, check the data types of the columns:

In [None]:
# Show the dtype of each column in the per-year counts.
students_year.dtypes

I work on a copy of the dataframe so the original data stays untouched and does not have to be reloaded from the database if something goes wrong.

In [None]:
# Work on a copy so that `students_year` itself is left unchanged.
testdf = students_year.copy()

Convert the `anio` column to datetime, then check the column dtypes again:

In [None]:
# Parse the year explicitly. Without a format, pd.to_datetime treats integers
# as nanoseconds since 1970-01-01, so 2009 would become
# 1970-01-01 00:00:00.000002009 instead of 2009-01-01.
# Assumes `anio` holds whole years as integers or text — confirm with the
# dtypes check; float years such as 2009.0 would need casting to int first.
testdf.anio = pd.to_datetime(testdf.anio.astype(str), format='%Y')

In [None]:
# Show the column dtypes of the copy after the conversion.
testdf.dtypes

Replot:

In [None]:
# Replot from the converted copy. figsize is passed to DataFrame.plot because
# .plot() without `ax` creates its own figure, so a preceding plt.figure()
# call would only leave an empty figure behind.
students_year_plot = testdf.plot(
    x='anio',
    y='count',
    title='Total students per year',
    sharex=False,
    figsize=(5, 5),
)
students_year_plot.set_xlabel("year")
students_year_plot.grid(True)

In [None]:
# Rename a column in raw.student_registration_media.
# DDL returns no result set, so it is run through a cursor instead of
# pd.read_sql (which expects rows), then committed so the rename persists
# and the transaction does not stay open holding a lock on the table.
# NOTE: this changes the database schema and fails if run a second time,
# because the column will already have been renamed.
with conn.cursor() as cur:
    cur.execute("""
    alter table raw.student_registration_media
    rename column id_departamento to id_departamento_1;
""")
conn.commit()