In [827]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer

### Amount of rows

In [828]:
education = pd.read_csv('education.csv')

education

Unnamed: 0,CODIGO,DISTRITO,DEPARTAMENTO,MUNICIPIO,ESTABLECIMIENTO,DIRECCION,TELEFONO,SUPERVISOR,DIRECTOR,NIVEL,SECTOR,AREA,STATUS,MODALIDAD,JORNADA,PLAN,DEPARTAMENTAL
0,16-01-0138-46,16-031,ALTA VERAPAZ,COBAN,COLEGIO COBAN,KM.2 SALIDA A SAN JUAN CHAMELCO ZONA 8,77945104,MERCEDES JOSEFINA TORRES GALVEZ,GUSTAVO ADOLFO SIERRA POP,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
1,16-01-0139-46,16-031,ALTA VERAPAZ,COBAN,COLEGIO PARTICULAR MIXTO VERAPAZ,KM 209.5 ENTRADA A LA CIUDAD,77367402,MERCEDES JOSEFINA TORRES GALVEZ,GILMA DOLORES GUAY PAZ DE LEAL,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
2,16-01-0140-46,16-031,ALTA VERAPAZ,COBAN,"COLEGIO ""LA INMACULADA""",7A. AVENIDA 11-109 ZONA 6,78232301,MERCEDES JOSEFINA TORRES GALVEZ,VIRGINIA SOLANO SERRANO,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
3,16-01-0141-46,16-005,ALTA VERAPAZ,COBAN,ESCUELA NACIONAL DE CIENCIAS COMERCIALES,2A CALLE 11-10 ZONA 2,79514215,RUDY ADOLFO TOT OCH,HɃTOR ROLANDO CHUN POOU,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
4,16-01-0142-46,16-005,ALTA VERAPAZ,COBAN,INSTITUTO NORMAL MIXTO DEL NORTE 'EMILIO ROSAL...,3A AVE 6-23 ZONA 11,79521468,RUDY ADOLFO TOT OCH,VICTOR HUGO DOM͎GUEZ REYES,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,BILINGUE,VESPERTINA,DIARIO(REGULAR),ALTA VERAPAZ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,19-09-0040-46,19-021,ZACAPA,LA UNION,"LICEO PARTICULAR MIXTO ""JIREH""",BARRIO NUEVO,79418369,ASBEL IVAN SUCHITE ARROYO,ANA MAŔ CUELLAR GUERRA,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ZACAPA
9327,19-09-0048-46,19-021,ZACAPA,LA UNION,"LICEO PARTICULAR MIXTO "" JIREH""",BARRIO NUEVO,79418369,ASBEL IVAN SUCHITE ARROYO,ANA MAŔ CUELLAR GUERRA,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,SIN JORNADA,SEMIPRESENCIAL (UN D́ A LA SEMANA),ZACAPA
9328,19-10-0013-46,19-015,ZACAPA,HUITE,INSTITUTO DIVERSIFICADO,BARRIO BUENOS AIRES,47097386,SILDY MARIELA PEREZ FRANCO,MARLON JOSUɠARCHILA LORENZO,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,MONOLINGUE,NOCTURNA,DIARIO(REGULAR),ZACAPA
9329,19-10-1009-46,19-015,ZACAPA,HUITE,INSTITUTO MIXTO DE EDUCACION DIVERSIFICADA POR...,BARRIO EL CAMPO,55958103,SILDY MARIELA PEREZ FRANCO,ROBIDIO PORTILLO SALGUERO,DIVERSIFICADO,COOPERATIVA,URBANA,ABIERTA,MONOLINGUE,VESPERTINA,DIARIO(REGULAR),ZACAPA


### Amount of columns

In [829]:
education['DISTRITO'] = education['DISTRITO']

## Dataset Transformations

#### DISTRITO

Change: Replace every null value with the first 3 characters of the district to identify in the department

Reason: To keep the dataset as consistent as posible across all values

In [830]:
# Function to replace NaN with first 3 chars of previous entry
def replace_missing_values(df, column_name):
    for i in range(1, len(df)):
        if pd.isna(df.loc[i, column_name]):
            if not pd.isna(df.loc[i-1, column_name]):
                df.loc[i, column_name] = df.loc[i-1, column_name][:3]
    return df

# Replace missing values
df = replace_missing_values(education, 'DISTRITO')

#### DEPARTAMENTO

Change: Replace every space for '_' symbol and replace every lowercase value with uppercase value.

Reason: To avoid issues when accessing values

In [831]:
# Reemplazamos
education['DEPARTAMENTO'] = education['DEPARTAMENTO'].replace(' ', '_', regex=True)
education['DEPARTAMENTO'] = education['DEPARTAMENTO'].str.upper()

#### MUNICIPIO

Change: Replace every space with '_' symbol.

Reason: To avoid issues when accessing values and keep a consistent style across all columns

In [832]:
education['MUNICIPIO'] = education['MUNICIPIO'].replace(' ', '_', regex=True)

#### ESTABLECIMIENTO: 

Replace unknown symbols and writing mistakes

Reason: To avoid misunderstandings with names and avoid same values names to be different

In [833]:
education['ESTABLISHMENT'] = education['ESTABLECIMIENTO']
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('я', 'ÑO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ӎ', 'ON', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('Ŕ', 'RIA', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ɖ', 'RE', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ɓ', 'E', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('F́', 'FIA', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('Ǵ', 'GICO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ɒ', 'ER', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ɇ', 'EG', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('Ӈ', 'OG', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('Ƀ', 'EC', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ɘ', '', regex=True)#######
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('R͓', 'RIS', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('L͔', 'LIT', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('U͎', 'UIN', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('T͆', 'TIF', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ӌ', 'OL', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ڂ', 'UB', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ڍ', 'UM', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('R͑', 'RIQ', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ړ', 'US', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('D͎', 'DIN', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('T͎', 'TIN', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('Ӂ', 'OB', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('Ɏ', 'LO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('J̓', 'PAJ', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('A͓', 'AIS', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('/td>', '', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('Ɍ', 'EM', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('TECPN', 'TECPAN', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ENSEсNZA', 'ENSEÑANZA', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('EDUCACIҎ', 'EDUCACION', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('BSICA', 'BASICA', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('TECNӌOGICO', 'TECNOLOGICO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('EVANGȌICO', 'EVANGELICO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('EVANGɌICO', 'EVANGELICO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('EVAANGELICO', 'EVANGELICO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('BILING܅', 'BILINGUE', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('TRILING܅', 'TRILINGUE', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ASOCAICION', 'ASOCIACION', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('ASOCIACIɎ', 'ASOCIACION', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('INTITUTO', 'INSTITUTO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('INFORMTICA', 'INFORMATICA', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('GETSEMAN͢', 'GETSEMANI', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('SINA͢', 'SINA', regex=True)

education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('BERNABɼ', 'BERNABO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('NGEL', 'ANGEL', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('SARAZځ', 'SARAZO', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('LVAREZ', 'ALVAREZ', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('JOSɠ', 'JOSE', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('JOSɢ', 'JOSE', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.replace('JOSɼ', 'JOSE', regex=True)

Change: Delete quotes, single quotes and hyphen

Reason: To avoid same names with added symbols to create different values

In [834]:
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].replace("'", '', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].replace('-', '', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].replace('"', '', regex=True)

Change: Delete parenthesis and commas

Reason: To avoid same names with added symbols to create different values

In [835]:
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].replace({r'[(),]': ''}, regex=True)

Change: Delete double spaces and change them to underscore

Reason: To avoid issues when accessing data and keep a consistent style across all columns

In [836]:
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].replace('  ', ' ', regex=True)
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].replace(' ', '_', regex=True)

Change: Change everything to uppercase

Reason: Create a consistent style across all columns

In [837]:
education['ESTABLISHMENT'] = education['ESTABLISHMENT'].str.upper()

#### DIRECCIÓN: 

Change: Replace unknown symbols

Reason: To avoid misunderstandings with names and avoid same values names to be different

In [838]:
def replace_dot_with_nan(value):
    if isinstance(value, str) and (value == '.' or value == '-' or value == '--' or value == '---') and len(value) < 4:
        return np.nan
    return value

Change: Fill missing values with 'No Especificado'

Reason: To keep the dataset as consistent as possible across all values

In [839]:
education['DIRECCION'] = education['DIRECCION'].apply(replace_dot_with_nan)
education['DIRECCION'] = education['DIRECCION'].replace(np.nan, 'No Especificado', regex=True)

Change: Replace unknown symbols and writing mistakes

Reason: To avoid misunderstandings with names and avoid same values names to be different

In [840]:
education['DIRECCION'] = education['DIRECCION'].str.replace('я', 'ÑO', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ӎ', 'ON', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('Ŕ', 'RIA', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ɖ', 'RE', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ɓ', 'E', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('F́', 'FIA', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('Ǵ', 'GICO', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ɒ', 'ER', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ɇ', 'EG', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('Ӈ', 'OG', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('Ƀ', 'EC', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ɘ', '', regex=True)#######
education['DIRECCION'] = education['DIRECCION'].str.replace('R͓', 'RIS', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('L͔', 'LIT', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('U͎', 'UIN', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('T͆', 'TIF', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ӌ', 'OL', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ڂ', 'UB', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ڍ', 'UM', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('R͑', 'RIQ', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('ړ', 'US', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('D͎', 'DIN', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('T͎', 'TIN', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('Ӂ', 'OB', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('Ɏ', 'LO', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('J̓', 'PAJ', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('A͓', 'AIS', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('/td>', '', regex=True)
education['DIRECCION'] = education['DIRECCION'].str.replace('Ɍ', 'EM', regex=True)

education['DIRECCION'] = education['DIRECCION'].replace('AVENIDA', 'AV.', regex=True)
education['DIRECCION'] = education['DIRECCION'].replace('AVE.', 'AV.', regex=True)
education['DIRECCION'] = education['DIRECCION'].replace('1RA', '1ERA', regex=True)

In [841]:
# education['DIRECCION'] = education['DIRECCION'].replace("'", '', regex=True)
# education['DIRECCION'] = education['DIRECCION'].replace('-', '', regex=True)
education['DIRECCION'] = education['DIRECCION'].replace('"', '', regex=True)

Change: Delete parenthesis, square brackets and commas

Reason: To avoid same names with added symbols to create different values

In [842]:
education['DIRECCION'] = education['DIRECCION'].replace({r'[(),]': ''}, regex=True)

Change: Delete double spaces and change them to underscore

Reason: To avoid issues when accessing data and keep a consistent style across all columns

In [843]:
education['DIRECCION'] = education['DIRECCION'].replace('  ', ' ', regex=True)
education['DIRECCION'] = education['DIRECCION'].replace(' ', '_', regex=True)

Change: Change everything to uppercase

Reason: Create a consistent style across all columns

In [844]:
education['DIRECCION'] = education['DIRECCION'].str.upper()

#### TELÉFONO

Change: Fill the missing values of numbers that don't get to the standard 8 numbers

Reason: To keep consistency of data across all values

In [845]:
education['TELEFONO'] = education['TELEFONO'].apply(lambda x: np.nan if isinstance(x, str) and len(x) < 8 else x)

Change: Get first and second number in two different columns

Reason: To separate phone numbers and treat them separately

In [846]:
def split_telefono(telefono):
    if pd.isna(telefono) or (isinstance(telefono, str) and len(telefono) < 8):
        return [np.nan, np.nan]
    elif ', ' in telefono:
        parts = telefono.split(', ')
        if len(parts) >= 2:
            if len(parts[0]) == 8 and len(parts[1]) == 8:
                return parts[:2]
            elif len(parts[0]) == 8:
                return [parts[0], np.nan]
            else:
                [telefono, np.nan]
    elif '-' in telefono:
        parts = telefono.split('-')
        if len(parts) >= 2:
            if len(parts[0]) == 8 and len(parts[1]) == 8:
                return parts[:2]
            elif len(parts[0]) == 8:
                return [parts[0], np.nan]
            else:
                [telefono, np.nan]
    elif '/' in telefono:
        parts = telefono.split('/')
        if len(parts) >= 2:
            if len(parts[0]) == 8 and len(parts[1]) == 8:
                return parts[:2]
            elif len(parts[0]) == 8:
                return [parts[0], np.nan]
            else:
                [telefono, np.nan]
    elif ' ' in telefono:
        parts = telefono.split(' ')
        if len(parts) >= 2:
            if len(parts[0]) == 8 and len(parts[1]) == 8:
                return parts[:2]
            elif len(parts[0]) == 8:
                return [parts[0], np.nan]
            else:
                [telefono, np.nan]
    else:
        return [telefono, np.nan]

education[['TELEFONO_PRIMERO', 'TELEFONO_SEGUNDO']] = education['TELEFONO'].apply(split_telefono).apply(pd.Series)


Change: Drop 'TELEFONO' column

Reason: To avoid duplicated values

In [847]:
education.drop(columns=['TELEFONO'], inplace=True)

### SUPERVISOR

Change: Replace null values with 'NO ESPECIFICADO'

Reason: To avoid issues with missing values

In [848]:
education['SUPERVISOR'] = education['SUPERVISOR'].replace(np.nan, 'NO ESPECIFICADO', regex=True)

Change: Replace unknown symbols and writing mistakes

Reason: To avoid misunderstandings with names and avoid same values names to be different

In [849]:
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('я', 'ÑO', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ӎ', 'ON', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('Ŕ', 'RIA', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ɖ', 'RE', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ɓ', 'E', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('F́', 'FIA', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('Ǵ', 'GICO', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ɒ', 'ER', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ɇ', 'EG', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('Ӈ', 'OG', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('Ƀ', 'EC', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ɘ', '', regex=True)#######
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('R͓', 'RIS', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('L͔', 'LIT', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('U͎', 'UIN', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('T͆', 'TIF', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ӌ', 'OL', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ڂ', 'UB', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ڍ', 'UM', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('R͑', 'RIQ', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('ړ', 'US', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('D͎', 'DIN', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('T͎', 'TIN', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('Ӂ', 'OB', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('Ɏ', 'LO', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('J̓', 'PAJ', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('A͓', 'AIS', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('/td>', '', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('Ɍ', 'EM', regex=True)

education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('BERNABɼ', 'BERNABO', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('NGEL', 'ANGEL', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('SARAZځ', 'SARAZO', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('LVAREZ', 'ALVAREZ', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('JOSɠ', 'JOSE', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('JOSɢ', 'JOSE', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].str.replace('JOSɼ', 'JOSE', regex=True)

Change: Delete double spaces and change them to underscore

Reason: To avoid issues when accessing data and avoid same values names to be different

In [850]:
education['SUPERVISOR'] = education['SUPERVISOR'].replace('  ', ' ', regex=True)
education['SUPERVISOR'] = education['SUPERVISOR'].replace(' ', '_', regex=True)

Change: Change everything to uppercase

Reason: Create a consistent style across all columns

In [851]:
education['SUPERVISOR'] = education['SUPERVISOR'].str.upper()

### DIRECTOR

Change: Replace null values with 'NO ESPECIFICADO'

Reason: To avoid issues with missing values

In [852]:
education['DIRECTOR'] = education['DIRECTOR'].replace(np.nan, 'NO ESPECIFICADO', regex=True)

Change: Replace unknown symbols and writing mistakes

Reason: To avoid misunderstandings with names and avoid same values names to be different

In [853]:
education['DIRECTOR'] = education['DIRECTOR'].str.replace('я', 'ÑO', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ӎ', 'ON', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('Ŕ', 'RIA', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ɖ', 'RE', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ɓ', 'E', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('F́', 'FIA', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('Ǵ', 'GICO', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ɒ', 'ER', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ɇ', 'EG', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('Ӈ', 'OG', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('Ƀ', 'EC', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ɘ', '', regex=True)#######
education['DIRECTOR'] = education['DIRECTOR'].str.replace('R͓', 'RIS', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('L͔', 'LIT', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('U͎', 'UIN', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('T͆', 'TIF', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ӌ', 'OL', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ڂ', 'UB', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ڍ', 'UM', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('R͑', 'RIQ', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('ړ', 'US', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('D͎', 'DIN', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('T͎', 'TIN', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('Ӂ', 'OB', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('Ɏ', 'LO', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('J̓', 'PAJ', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('A͓', 'AIS', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('/td>', '', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('Ɍ', 'EM', regex=True)

education['DIRECTOR'] = education['DIRECTOR'].str.replace('BERNABɼ', 'BERNABO', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('NGEL', 'ANGEL', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('SARAZځ', 'SARAZO', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('LVAREZ', 'ALVAREZ', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('JOSɠ', 'JOSE', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('JOSɢ', 'JOSE', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].str.replace('JOSɼ', 'JOSE', regex=True)

Change: Delete double spaces and change them to underscore

Reason: To avoid issues when accessing data and avoid same values names to be different

In [854]:
education['DIRECTOR'] = education['DIRECTOR'].replace('  ', ' ', regex=True)
education['DIRECTOR'] = education['DIRECTOR'].replace(' ', '_', regex=True)

Change: Change everything to uppercase

Reason: Create a consistent style across all columns

In [855]:
education['DIRECTOR'] = education['DIRECTOR'].str.upper()

### PLAN

Change: Replace writing mistakes

Reason: To avoid misunderstandings with names and avoid same values names to be different

In [856]:
education['PLAN'] = education['PLAN'].str.replace('D́ A', 'DIA', regex=True)
education['PLAN'] = education['PLAN'].str.replace('D́S', 'DIAS', regex=True)

Change: Replace unknown symbols and writing mistakes

Reason: Keep consistency of data across all values

In [857]:
education['PLAN'] = education['PLAN'].replace(' ', '_', regex=True)
education['PLAN'] = education['PLAN'].replace('Ɏ', 'EN', regex=True)
education['PLAN'] = education['PLAN'].replace('QUICHɼ/td>', 'QUICHE', regex=True)
education['PLAN'] = education['PLAN'].replace('QUICHɠNORTE', 'QUICHE NORTE', regex=True)
education['PLAN'] = education['PLAN'].replace('SACATEPɑUEZ', 'SACATEPEQUEZ', regex=True)
education['PLAN'] = education['PLAN'].replace('SOLOL|/td>', 'SOLOLA', regex=True)
education['PLAN'] = education['PLAN'].replace('SOLOLA|SOLOLA', 'SOLOLA', regex=True)
education['PLAN'] = education['PLAN'].replace('SOLOLAA|SOLOLAA', 'SOLOLA', regex=True)
education['PLAN'] = education['PLAN'].replace('SUCHITEPɑUEZ', 'SUCHITEPEQUEZ', regex=True)
education['PLAN'] = education['PLAN'].replace('TOTONICAPN', 'TOTONICAPAN', regex=True)

Change: Add 'MADUREZ' column to know if the establishment accepts students by maturity

Reason: To know if the establishment accepts students by maturity easier than looking for it in the name

In [858]:
education['POR_MADUREZ'] = education['ESTABLECIMIENTO'].str.upper().str.contains('MADUREZ')

### Departamental

Change: Replace unknown symbols and writing mistakes

Reason: Keep consistency of data across all values

In [859]:
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace(' ', '_', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('Ɏ', 'EN', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('QUICHɼ/td>', 'QUICHE', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('QUICHɠNORTE', 'QUICHE NORTE', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('SACATEPɑUEZ', 'SACATEPEQUEZ', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('SOLOL|/td>', 'SOLOLA', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('SOLOLA|SOLOLA', 'SOLOLA', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('SOLOLAA|SOLOLAA', 'SOLOLA', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('SUCHITEPɑUEZ', 'SUCHITEPEQUEZ', regex=True)
education['DEPARTAMENTAL'] = education['DEPARTAMENTAL'].replace('TOTONICAPN', 'TOTONICAPAN', regex=True)

### Export data to a clean csv

In [860]:
education.to_csv('education_cleaned.csv', index=False)
education

Unnamed: 0,CODIGO,DISTRITO,DEPARTAMENTO,MUNICIPIO,ESTABLECIMIENTO,DIRECCION,SUPERVISOR,DIRECTOR,NIVEL,SECTOR,AREA,STATUS,MODALIDAD,JORNADA,PLAN,DEPARTAMENTAL,ESTABLISHMENT,TELEFONO_PRIMERO,TELEFONO_SEGUNDO,POR_MADUREZ
0,16-01-0138-46,16-031,ALTA_VERAPAZ,COBAN,COLEGIO COBAN,KM.2_SALIDA_A_SAN_JUAN_CHAMELCO_ZONA_8,MERCEDES_JOSEFINA_TORRES_GALVEZ,GUSTAVO_ADOLFO_SIERRA_POP,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA_VERAPAZ,COLEGIO_COBAN,77945104,,False
1,16-01-0139-46,16-031,ALTA_VERAPAZ,COBAN,COLEGIO PARTICULAR MIXTO VERAPAZ,KM_209.5_ENTRADA_A_LA_CIUDAD,MERCEDES_JOSEFINA_TORRES_GALVEZ,GILMA_DOLORES_GUAY_PAZ_DE_LEAL,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA_VERAPAZ,COLEGIO_PARTICULAR_MIXTO_VERAPAZ,77367402,,False
2,16-01-0140-46,16-031,ALTA_VERAPAZ,COBAN,"COLEGIO ""LA INMACULADA""",7A._AV._11-109_ZONA_6,MERCEDES_JOSEFINA_TORRES_GALVEZ,VIRGINIA_SOLANO_SERRANO,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA_VERAPAZ,COLEGIO_LA_INMACULADA,78232301,,False
3,16-01-0141-46,16-005,ALTA_VERAPAZ,COBAN,ESCUELA NACIONAL DE CIENCIAS COMERCIALES,2A_CALLE_11-10_ZONA_2,RUDY_ADOLFO_TOT_OCH,HECTOR_ROLANDO_CHUN_POOU,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA_VERAPAZ,ESCUELA_NACIONAL_DE_CIENCIAS_COMERCIALES,79514215,,False
4,16-01-0142-46,16-005,ALTA_VERAPAZ,COBAN,INSTITUTO NORMAL MIXTO DEL NORTE 'EMILIO ROSAL...,3A_AV.6-23_ZONA_11,RUDY_ADOLFO_TOT_OCH,VICTOR_HUGO_DOM͎GUEZ_REYES,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,BILINGUE,VESPERTINA,DIARIO(REGULAR),ALTA_VERAPAZ,INSTITUTO_NORMAL_MIXTO_DEL_NORTE_EMILIO_ROSALE...,79521468,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9326,19-09-0040-46,19-021,ZACAPA,LA_UNION,"LICEO PARTICULAR MIXTO ""JIREH""",BARRIO_NUEVO,ASBEL_IVAN_SUCHITE_ARROYO,ANA_MARIA_CUELLAR_GUERRA,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ZACAPA,LICEO_PARTICULAR_MIXTO_JIREH,79418369,,False
9327,19-09-0048-46,19-021,ZACAPA,LA_UNION,"LICEO PARTICULAR MIXTO "" JIREH""",BARRIO_NUEVO,ASBEL_IVAN_SUCHITE_ARROYO,ANA_MARIA_CUELLAR_GUERRA,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,SIN JORNADA,SEMIPRESENCIAL_(UN_DIA_LA_SEMANA),ZACAPA,LICEO_PARTICULAR_MIXTO_JIREH,79418369,,False
9328,19-10-0013-46,19-015,ZACAPA,HUITE,INSTITUTO DIVERSIFICADO,BARRIO_BUENOS_AIRES,SILDY_MARIELA_PEREZ_FRANCO,MARLON_JOSUƓARCHILA_LORENZO,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,MONOLINGUE,NOCTURNA,DIARIO(REGULAR),ZACAPA,INSTITUTO_DIVERSIFICADO,47097386,,False
9329,19-10-1009-46,19-015,ZACAPA,HUITE,INSTITUTO MIXTO DE EDUCACION DIVERSIFICADA POR...,BARRIO_EL_CAMPO,SILDY_MARIELA_PEREZ_FRANCO,ROBIDIO_PORTILLO_SALGUERO,DIVERSIFICADO,COOPERATIVA,URBANA,ABIERTA,MONOLINGUE,VESPERTINA,DIARIO(REGULAR),ZACAPA,INSTITUTO_MIXTO_DE_EDUCACION_DIVERSIFICADA_POR...,55958103,,False
