In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import random
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from ydata_profiling import ProfileReport

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
education = pd.read_csv('education.csv')
education.head()

Unnamed: 0,CODIGO,DISTRITO,DEPARTAMENTO,MUNICIPIO,ESTABLECIMIENTO,DIRECCION,TELEFONO,SUPERVISOR,DIRECTOR,NIVEL,SECTOR,AREA,STATUS,MODALIDAD,JORNADA,PLAN,DEPARTAMENTAL
0,16-01-0138-46,16-031,ALTA VERAPAZ,COBAN,COLEGIO COBAN,KM.2 SALIDA A SAN JUAN CHAMELCO ZONA 8,77945104,MERCEDES JOSEFINA TORRES GALVEZ,GUSTAVO ADOLFO SIERRA POP,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
1,16-01-0139-46,16-031,ALTA VERAPAZ,COBAN,COLEGIO PARTICULAR MIXTO VERAPAZ,KM 209.5 ENTRADA A LA CIUDAD,77367402,MERCEDES JOSEFINA TORRES GALVEZ,GILMA DOLORES GUAY PAZ DE LEAL,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
2,16-01-0140-46,16-031,ALTA VERAPAZ,COBAN,"COLEGIO ""LA INMACULADA""",7A. AVENIDA 11-109 ZONA 6,78232301,MERCEDES JOSEFINA TORRES GALVEZ,VIRGINIA SOLANO SERRANO,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
3,16-01-0141-46,16-005,ALTA VERAPAZ,COBAN,ESCUELA NACIONAL DE CIENCIAS COMERCIALES,2A CALLE 11-10 ZONA 2,79514215,RUDY ADOLFO TOT OCH,HɃTOR ROLANDO CHUN POOU,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
4,16-01-0142-46,16-005,ALTA VERAPAZ,COBAN,INSTITUTO NORMAL MIXTO DEL NORTE 'EMILIO ROSAL...,3A AVE 6-23 ZONA 11,79521468,RUDY ADOLFO TOT OCH,VICTOR HUGO DOM͎GUEZ REYES,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,BILINGUE,VESPERTINA,DIARIO(REGULAR),ALTA VERAPAZ


In [3]:
# replace all ' ' with '_' to make categorical values identifiable for profile report analysis, then convert it to upper case
education['MUNICIPIO'] = education['MUNICIPIO'].str.replace(' ', '_').str.upper()
education['ESTABLECIMIENTO'] = education['ESTABLECIMIENTO'].str.replace(' ', '_').str.upper()
education.head()

Unnamed: 0,CODIGO,DISTRITO,DEPARTAMENTO,MUNICIPIO,ESTABLECIMIENTO,DIRECCION,TELEFONO,SUPERVISOR,DIRECTOR,NIVEL,SECTOR,AREA,STATUS,MODALIDAD,JORNADA,PLAN,DEPARTAMENTAL
0,16-01-0138-46,16-031,ALTA VERAPAZ,COBAN,COLEGIO_COBAN,KM.2 SALIDA A SAN JUAN CHAMELCO ZONA 8,77945104,MERCEDES JOSEFINA TORRES GALVEZ,GUSTAVO ADOLFO SIERRA POP,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
1,16-01-0139-46,16-031,ALTA VERAPAZ,COBAN,COLEGIO_PARTICULAR_MIXTO_VERAPAZ,KM 209.5 ENTRADA A LA CIUDAD,77367402,MERCEDES JOSEFINA TORRES GALVEZ,GILMA DOLORES GUAY PAZ DE LEAL,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
2,16-01-0140-46,16-031,ALTA VERAPAZ,COBAN,"COLEGIO_""LA_INMACULADA""",7A. AVENIDA 11-109 ZONA 6,78232301,MERCEDES JOSEFINA TORRES GALVEZ,VIRGINIA SOLANO SERRANO,DIVERSIFICADO,PRIVADO,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
3,16-01-0141-46,16-005,ALTA VERAPAZ,COBAN,ESCUELA_NACIONAL_DE_CIENCIAS_COMERCIALES,2A CALLE 11-10 ZONA 2,79514215,RUDY ADOLFO TOT OCH,HɃTOR ROLANDO CHUN POOU,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,MONOLINGUE,MATUTINA,DIARIO(REGULAR),ALTA VERAPAZ
4,16-01-0142-46,16-005,ALTA VERAPAZ,COBAN,INSTITUTO_NORMAL_MIXTO_DEL_NORTE_'EMILIO_ROSAL...,3A AVE 6-23 ZONA 11,79521468,RUDY ADOLFO TOT OCH,VICTOR HUGO DOM͎GUEZ REYES,DIVERSIFICADO,OFICIAL,URBANA,ABIERTA,BILINGUE,VESPERTINA,DIARIO(REGULAR),ALTA VERAPAZ


In [4]:
analysis = ProfileReport(education)
analysis.to_file('analysis.html')

Summarize dataset: 100%|██████████| 27/27 [00:03<00:00,  8.54it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.31s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 76.21it/s]


In [5]:
education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9331 entries, 0 to 9330
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CODIGO           9331 non-null   object
 1   DISTRITO         9123 non-null   object
 2   DEPARTAMENTO     9331 non-null   object
 3   MUNICIPIO        9331 non-null   object
 4   ESTABLECIMIENTO  9331 non-null   object
 5   DIRECCION        9280 non-null   object
 6   TELEFONO         8792 non-null   object
 7   SUPERVISOR       9122 non-null   object
 8   DIRECTOR         8434 non-null   object
 9   NIVEL            9331 non-null   object
 10  SECTOR           9331 non-null   object
 11  AREA             9331 non-null   object
 12  STATUS           9331 non-null   object
 13  MODALIDAD        9331 non-null   object
 14  JORNADA          9331 non-null   object
 15  PLAN             9331 non-null   object
 16  DEPARTAMENTAL    9331 non-null   object
dtypes: object(17)
memory usage: 1.2+ 

In [6]:
education.describe

<bound method NDFrame.describe of              CODIGO DISTRITO  DEPARTAMENTO  MUNICIPIO  \
0     16-01-0138-46   16-031  ALTA VERAPAZ      COBAN   
1     16-01-0139-46   16-031  ALTA VERAPAZ      COBAN   
2     16-01-0140-46   16-031  ALTA VERAPAZ      COBAN   
3     16-01-0141-46   16-005  ALTA VERAPAZ      COBAN   
4     16-01-0142-46   16-005  ALTA VERAPAZ      COBAN   
...             ...      ...           ...        ...   
9326  19-09-0040-46   19-021        ZACAPA   LA_UNION   
9327  19-09-0048-46   19-021        ZACAPA   LA_UNION   
9328  19-10-0013-46   19-015        ZACAPA      HUITE   
9329  19-10-1009-46   19-015        ZACAPA      HUITE   
9330  19-11-0018-46   19-020        ZACAPA  SAN_JORGE   

                                        ESTABLECIMIENTO  \
0                                         COLEGIO_COBAN   
1                      COLEGIO_PARTICULAR_MIXTO_VERAPAZ   
2                               COLEGIO_"LA_INMACULADA"   
3              ESCUELA_NACIONAL_DE_CIENCIAS_C