# Exploration of School Data

## Import csv into pandas dataframe

In [61]:
import pandas as pd
import numpy as np

# Raw results file (Rendimiento 2015); fields are separated by ';'.
# The path is relative to the notebook's working directory.
RENDIMIENTO_CSV = '../raw/schools/rendimiento/20160212_Rendimiento_2015_20160131_PUBL.csv'

df = pd.read_csv(RENDIMIENTO_CSV, sep=';')
# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(df.head())

   AGNO  RBD  DGV_RBD                  NOM_RBD  COD_REG_RBD  COD_PRO_RBD  \
0  2015    1        9  LICEO POLITECNICO ARICA           15          151   
1  2015    1        9  LICEO POLITECNICO ARICA           15          151   
2  2015    1        9  LICEO POLITECNICO ARICA           15          151   
3  2015    1        9  LICEO POLITECNICO ARICA           15          151   
4  2015    1        9  LICEO POLITECNICO ARICA           15          151   

   COD_COM_RBD NOM_COM_RBD  COD_DEPROV_RBD NOM_DEPROV_RBD    ...      \
0        15101       ARICA             151          ARICA    ...       
1        15101       ARICA             151          ARICA    ...       
2        15101       ARICA             151          ARICA    ...       
3        15101       ARICA             151          ARICA    ...       
4        15101       ARICA             151          ARICA    ...       

   COD_REG_ALU  COD_COM_ALU  NOM_COM_ALU  COD_RAMA  COD_SEC  COD_ESPE  \
0           15        15101        AR

## Basic Cleaning
Since we're aggregating to the school level we can disregard many of the columns in the raw data.

| Column ID        | Description              |
|------------------|--------------------------|
| MRUN             | Unique ID of student     |
| NOM_RBD          | Name of School           |
| NOM_COM_RBD      | Name of Comuna of school |
| RURAL_RBD        | Rural or urban           |
| GEN_ALU          | Gender of student        |
| PROM_GRAL        | GPA                      |
| ASISTENCIA       | Attendance               |

In [62]:
# Columns kept for the school-level aggregation (see the table above).
cols_for_school_agg = [
    'MRUN',
    'NOM_RBD',
    'NOM_COM_RBD',
    'RURAL_RBD',
    'GEN_ALU',
    'PROM_GRAL',
    'ASISTENCIA',
]
# Take an explicit copy: later cells add and overwrite columns on df_schools,
# and doing that on a selection of `df` triggers pandas' SettingWithCopyWarning
# and leaves it ambiguous whether `df` itself is affected.
df_schools = df[cols_for_school_agg].copy()
print(df_schools.head())

      MRUN                  NOM_RBD NOM_COM_RBD  RURAL_RBD  GEN_ALU PROM_GRAL  \
0  1073085  LICEO POLITECNICO ARICA       ARICA          0        2       5,3   
1  3940367  LICEO POLITECNICO ARICA       ARICA          0        1       5,2   
2  5555531  LICEO POLITECNICO ARICA       ARICA          0        1         0   
3  7745740  LICEO POLITECNICO ARICA       ARICA          0        2         0   
4  8948160  LICEO POLITECNICO ARICA       ARICA          0        2         0   

   ASISTENCIA  
0          85  
1          85  
2           0  
3           0  
4           0  


## Calculate columns

### Gender
Once we aggregate to the school level, we'll want columns holding the number of students of each gender.

| Gender               | Description |
|----------------------|-------------|
| 0                    | unknown     |
| 1                    | male        |
| 2                    | female      |

In [63]:
# Drop students of unknown gender (raw code 0), then derive one 0/1 indicator
# column per gender so that summing a column per school gives a head count:
#   female_num: raw code 2 (female) -> 1, raw code 1 (male) -> 0
#   male_num:   raw code 1 (male)   -> 1, raw code 2 (female) -> 0
# The raw GEN_ALU column is removed once the indicators exist.
#
# Written as a single chained expression so that no column is assigned onto a
# filtered slice (which raises SettingWithCopyWarning) and nothing is mutated
# in place.
df_schools = (
    df_schools[df_schools['GEN_ALU'] != 0]
    .assign(
        female_num=lambda d: d['GEN_ALU'] - 1,
        male_num=lambda d: 2 - d['GEN_ALU'],
    )
    .drop('GEN_ALU', axis=1)
)

### GPA
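The raw GPA values are text that uses a comma as the decimal separator (e.g. `5,3`), so we replace the comma with a point and convert the column to a numeric type. Values that cannot be parsed are filled with the column mean.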

In [64]:
# PROM_GRAL arrives as text with a decimal comma (e.g. "5,3"): swap the comma
# for a point, then parse to float. Anything that still fails to parse becomes
# NaN because of errors='coerce'.
df_schools['PROM_GRAL'] = pd.to_numeric(
    df_schools['PROM_GRAL'].str.replace(',', '.'),
    errors='coerce',
)
# Fill the NaNs with the mean of the *parsed* column. This must read from
# df_schools, not from the raw `df`: the raw column still holds the unparsed
# strings, so using it would discard the conversion above and take a "mean"
# over text.
df_schools['PROM_GRAL'] = df_schools['PROM_GRAL'].fillna(df_schools['PROM_GRAL'].mean())

## Perform aggregations
Now we'll do the aggregations, setting a specific function per column.

In [65]:
# Aggregation applied to each column when collapsing the student rows into one
# row per (comuna, school name) pair. The functions are given by name: passing
# the NumPy callables (np.mean / np.sum) to groupby().agg is deprecated in
# recent pandas, while the string names give the same result on every version.
aggs = {
    'MRUN': 'count',        # non-null student IDs per school
    'RURAL_RBD': 'mean',
    'PROM_GRAL': 'mean',
    'ASISTENCIA': 'mean',
    'female_num': 'sum',    # 0/1 indicators, so the sum is a head count
    'male_num': 'sum',
}
df_schools = df_schools.groupby(['NOM_COM_RBD', 'NOM_RBD']).agg(aggs)
# Rename the aggregated columns to describe what they now hold.
df_schools = df_schools.rename(columns={
    'MRUN': 'num_students',
    'PROM_GRAL': 'gpa',
    'ASISTENCIA': 'avg_attendance',
    'RURAL_RBD': 'urban_rural',
})
# print(...) with a single argument works on both Python 2 and Python 3.
print(df_schools.head())

                                                  male_num       gpa  \
NOM_COM_RBD NOM_RBD                                                    
AISEN       CENTRODE ESTUDIOS PATAGONIA DE AYSEN       188  2.815663   
            COLEGIO KALEM                              361  5.621914   
            COLEGIO SAGRADA FAMILIA                    309  5.295611   
            COLEGIO SANTA TERESA DE LOS ANDES          183  5.514921   
            COLEGIO SINAI                               84  5.385417   

                                                  female_num  num_students  \
NOM_COM_RBD NOM_RBD                                                          
AISEN       CENTRODE ESTUDIOS PATAGONIA DE AYSEN         227           415   
            COLEGIO KALEM                                360           721   
            COLEGIO SAGRADA FAMILIA                      329           638   
            COLEGIO SANTA TERESA DE LOS ANDES            199           382   
            COLEGIO SINAI  

Calculate the share of male and female students per school, expressed as a fraction of `num_students`.

In [66]:
# Per-school gender shares: each head count divided by the school's student
# count. The results are fractions of num_students (not multiplied by 100).
df_schools['male_pct'] = df_schools['male_num'].div(df_schools['num_students'])
df_schools['female_pct'] = df_schools['female_num'].div(df_schools['num_students'])

## Determine correlations

The closer to 0 the correlation, the weaker the connection. The closer to 1, the stronger the positive correlation, and the closer to -1, the stronger the negative correlation. This will help us determine which factors to investigate further.

In [67]:
# Correlation of every school-level column with GPA, using DataFrame.corr's
# default method. print(...) with a single argument works on both Python 2
# and Python 3, unlike the bare `print x` statement.
print(df_schools.corr()['gpa'])

male_num         -0.035287
gpa               1.000000
female_num        0.030368
num_students     -0.003157
avg_attendance    0.939214
urban_rural       0.172320
male_pct         -0.131191
female_pct        0.131191
Name: gpa, dtype: float64
