In [13]:
import os
import yaml
import pandas as pd
from pyathena import connect
from pyathena.util import as_pandas
from sqlalchemy import *

%matplotlib inline
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Read the "db" section of the local credentials file.
# The context manager closes the file handle, which a bare open() call never did.
# NOTE(review): yaml.load can build more than plain data; FullLoader is kept from the
# original, but yaml.safe_load would be enough if the file only holds scalars/mappings — confirm.
with open("../conf/local/credentials.yml") as credentials_file:
    pg_cred = yaml.load(credentials_file, Loader=yaml.FullLoader)["db"]

In [15]:
# Build the PostgreSQL URL from the loaded credentials (port 5432 and database "iefp" are fixed here).
# NOTE(review): user/password are interpolated verbatim; a password containing characters
# such as '@' or '/' would need URL-escaping — confirm the stored credentials are safe.
url = 'postgresql://{}:{}@{}:{}/{}'
url = url.format(pg_cred["pg_user"], pg_cred["pg_pass"], pg_cred["pg_host"], 5432, "iefp")

# create_engine() returns the Engine that the pd.read_sql calls use as `con`
con = create_engine(url, client_encoding='utf8')

# Reflect the existing database tables into a MetaData object.
# The `reflect=True` (and `bind=`) constructor arguments are not accepted by newer
# SQLAlchemy releases, so reflection is requested explicitly against the engine.
meta = MetaData()
meta.reflect(bind=con)

  


In [None]:
# Pull a 10-row sample with every column of the `intervencoes` table
sql = """ 
select * from intervencoes
limit 10
;"""

# Run the query through `con` and load the result into a DataFrame
intervencoes = pd.read_sql(sql, con)

In [None]:
# Positional slice: rows 1-9 and columns 30-44 (iloc end bounds are exclusive, row 0 is skipped)
intervencoes.iloc[1:10,30:45]

In [None]:
# Print the array of all column names in the sampled table
print(intervencoes.columns.values)

In [None]:
# Fetch every distinct value of `f_dcurso` found in `intervencoes`
sql = """ 
select distinct(f_dcurso) from intervencoes
;"""
dcurso = pd.read_sql(sql, con)

In [None]:
# (rows, columns) of the result: one row per distinct f_dcurso value
dcurso.shape

In [None]:
# Number of rows per `f_dcurso` value, most frequent first.
# count(f_dcurso) only counts non-NULL values, so a NULL group (if present) reports 0.
sql = """ 
select f_dcurso, count(f_dcurso) as count from intervencoes
group by f_dcurso
order by count DESC
;"""
dcurso_count = pd.read_sql(sql, con)

In [None]:
# (rows, columns) of the per-course counts
dcurso_count.shape

In [None]:
# First 10 rows: the query sorts by count descending, so these are the most frequent courses
dcurso_count.head(10)

In [None]:
# Last 10 rows: the least frequent courses under the descending sort
dcurso_count.tail(10)

In [None]:
# Courses that appear fewer than 50 times.
# Bracket access is required: `dcurso_count.count` resolves to the DataFrame.count
# method rather than the "count" column, so comparing it with 50 raises a TypeError.
dcurso_count[dcurso_count["count"] < 50]

In [None]:
# Fetch every distinct value of `f_dmod_form` found in `intervencoes`
sql = """ 
select distinct(f_dmod_form) from intervencoes
;"""
dmod_form = pd.read_sql(sql, con)

In [None]:
# (rows, columns) of the result: one row per distinct f_dmod_form value
dmod_form.shape

In [None]:
# Preview the first five distinct f_dmod_form values
dmod_form.head()

In [None]:
# Number of rows per `f_dmod_form` value, most frequent first.
# count(f_dmod_form) only counts non-NULL values, so a NULL group (if present) reports 0.
sql = """ 
select f_dmod_form, count(f_dmod_form) as count from intervencoes
group by f_dmod_form
order by count DESC
;"""
dmod_form_count = pd.read_sql(sql, con)

In [None]:
# (rows, columns) of the per-f_dmod_form counts
dmod_form_count.shape

In [None]:
# Label-based slice: .loc includes both endpoints, so this shows index labels 0 through 10
# (up to 11 rows) with all columns
dmod_form_count.loc[0:10,:]

## Load the intervencoes table using the column list from the YAML file

In [16]:
# Read the list stored under the "intervencoes" key of the column config; it is used
# to build the SELECT column list. The context manager closes the file handle,
# which a bare open() call never did.
with open("../conf/base/sigae_columns.yml") as columns_file:
    int_cols = yaml.load(columns_file, Loader=yaml.FullLoader)["intervencoes"]
table = 'intervencoes'

In [17]:
# Select only the configured columns, capped at 10,000 rows to keep exploration fast.
# The table name comes from `table` instead of repeating the literal, so the
# query and the config lookup cannot drift apart.
sql = """
select {}
from {}
limit 10000
""".format(', '.join(int_cols), table)

df = pd.read_sql(sql, con)

In [18]:
# Preview the first five rows of the loaded sample
df.head()

Unnamed: 0,ute_id,tipo_utente,centro,tipo_movimento,data_intervencao,codigo_intervencao,cnp_pretendida,area_intervencao,data_resultado,resultado_intervencao,...,f_dcurso,f_horas,f_cmod_form,f_ccurso,f_dmod_form,f_carea_form,f_darea_form,f_vagas,f_vagas_ocupadas,cest_superior
0,3519582.0,A,383.0,25.0,2013-05-23,208.0,413105.0,,NaT,,...,,,,,,,,,,
1,3519582.0,A,383.0,36.0,2013-05-23,208.0,413105.0,,2013-05-23,14.0,...,,,,,,,,,,
2,3519582.0,A,383.0,35.0,2013-05-23,208.0,413105.0,,2013-05-23,1.0,...,,,,,,,,,,
3,4586377.0,A,383.0,26.0,2013-05-23,208.0,832210.0,,NaT,,...,,,,,,,,,,
4,4586377.0,A,383.0,25.0,2013-05-23,208.0,832210.0,,NaT,,...,,,,,,,,,,


In [27]:
# Look at the first 50 values of f_darea_form
df.f_darea_form.head(50)

0                        None
1                        None
2                        None
3                        None
4                        None
5                        None
6                        None
7                        None
8                        None
9                        None
10                       None
11                       None
12                       None
13                       None
14                       None
15                       None
16                       None
17                       None
18                       None
19                       None
20                       None
21                       None
22                       None
23                       None
24                       None
25                       None
26                       None
27                       None
28                       None
29    DESENVOLVIMENTO PESSOAL
30    DESENVOLVIMENTO PESSOAL
31    DESENVOLVIMENTO PESSOAL
32    DESENVOLVIMENTO PESSOAL
33    DESE

In [12]:
# Inspect the dtype pandas inferred for each selected column
df.dtypes

ute_id                          float64
tipo_utente                      object
centro                          float64
tipo_movimento                  float64
data_intervencao         datetime64[ns]
codigo_intervencao              float64
cnp_pretendida                  float64
area_intervencao                float64
data_resultado           datetime64[ns]
resultado_intervencao           float64
qualificacao                    float64
areas_curso                      object
tempo_pratica_ucnp              float64
tipo                             object
tipo_encaminhamento              object
ccentro                         float64
f_dcurso                         object
f_horas                         float64
f_cmod_form                     float64
f_ccurso                         object
f_dmod_form                      object
f_carea_form                    float64
f_darea_form                     object
f_vagas                         float64
f_vagas_ocupadas                float64


## Create function to clean up string columns

In [None]:
# find string cols
# Find the string columns: text columns are listed with dtype `object`
df.dtypes

In [None]:
# 1. change to lower case

In [None]:
# Take an independent copy: a plain `df_copy = df` only creates a second name for the
# same DataFrame, so in-place edits to df_copy would silently change `df` as well.
df_copy = df.copy()

In [None]:
# Preview the lower-cased course descriptions; the result is only displayed, not assigned back
df_copy.f_dcurso.str.lower()

In [None]:
# 2. remove characters

In [None]:
# Strip parentheses and hyphens from the course descriptions.
# The previous pattern '(|)|-' treated the parentheses as an empty regex group, which
# matches the empty string at every position, so nothing was ever removed.
# A character class matches the literal characters instead.
df_copy.f_dcurso = df_copy.f_dcurso.str.replace(r'[()\-]', "", regex=True)
df_copy.f_dcurso

In [None]:
def clean_string(column, characters=None):
    """Lower-case a string column and strip unwanted characters from it.

    Parameters
    ----------
    column : pandas.Series
        Series of strings; missing values are passed through unchanged.
    characters : list of str, optional
        Literal substrings to delete. Defaults to ``['(', ')', '-', '+']``.

    Returns
    -------
    pandas.Series
        The cleaned column, same index as the input.
    """
    if characters is None:
        characters = ['(', ')', '-', '+']

    cleaned = column.str.lower()
    for char in characters:
        # Literal (non-regex) removal, so '(' and '+' need no escaping
        cleaned = cleaned.str.replace(char, "", regex=False)

    # Deleting a character such as '-' from "a - b" leaves a double space. Collapse runs
    # of spaces to a single one; deleting them outright glued the words together ("ab").
    cleaned = cleaned.str.replace(" {2,}", " ", regex=True)

    return cleaned

In [None]:
# Reset to an independent copy of `df`; plain assignment would only alias the same
# DataFrame, so df_copy and df could never diverge.
df_copy = df.copy()

In [None]:
# Check columns to see if we need more characters added to list 
# Inspect the distinct cleaned values to judge whether clean_string should remove more characters
cleaned = clean_string(df_copy.f_dcurso)
cleaned.drop_duplicates().to_list()

In [None]:
# Preview the first five rows of the working frame
df_copy.head()

In [None]:
# Re-check the column dtypes of the working frame
df_copy.dtypes

## Convert data_resultado to a datetime and then drop the time component

In [None]:
# Preview data_resultado rendered as strings
df_copy["data_resultado"].astype("str").head()

In [None]:
# Keep the string-rendered data_resultado column and preview its first rows
test = df_copy["data_resultado"].astype("str")
test.head()

In [None]:
# Try the strptime format string on a single literal timestamp
from datetime import datetime
datetime_object = datetime.strptime("2013-05-17 13:02:48","%Y-%m-%d %H:%M:%S")
datetime_object

In [None]:
# .date() keeps only the calendar date and drops the time of day
print(datetime_object.date())

In [None]:
# Preview the first values of data_intervencao
df_copy.data_intervencao.head()

In [None]:
def object_to_date(column, format):
    """Convert a column to calendar dates, dropping any time-of-day component.

    The previous version always cast to ``int64`` first. That turned datetime
    columns into epoch nanoseconds (so every value then failed to parse and
    became NaT) and raised on date strings or on missing values.

    Parameters
    ----------
    column : pandas.Series
        Values to convert. Datetime columns are used as-is; numeric columns
        (e.g. ``201305.0``) are rendered as integer text before parsing;
        anything else is parsed from its string form.
    format : str
        ``strptime`` format used to parse non-datetime input, e.g. ``"%Y%m"``.

    Returns
    -------
    pandas.Series
        ``datetime.date`` objects; missing or unparseable values become NaT.
    """
    if pd.api.types.is_datetime64_any_dtype(column):
        # Already parsed: only the time component needs to be dropped
        parsed = column
    else:
        if pd.api.types.is_numeric_dtype(column):
            # 201305.0 -> "201305", so the ".0" of a float does not break the format
            to_text = lambda value: str(int(value))
        else:
            to_text = str
        # na_action="ignore" leaves missing values untouched so they end up as NaT
        as_text = column.map(to_text, na_action="ignore")
        parsed = pd.to_datetime(as_text, errors='coerce', format=format)

    return parsed.dt.date

In [None]:
# Start from an independent copy of `df`; plain assignment would only alias it
df_copy = df.copy()
from datetime import datetime
# Convert data_resultado with the timestamp format and preview the first five results
date_output = object_to_date(df_copy.data_resultado,"%Y-%m-%d %H:%M:%S")
date_output.head(5)

In [None]:
# Test the conversion on the date columns of the pedidos table:

In [None]:
# Load up to 10,000 rows, all columns, from the `pedidos` table
sql = """
select *
from pedidos
limit 10000
"""

ped = pd.read_sql(sql, con)

In [None]:
# Preview ano_mes cast to integers and then rendered as strings
# NOTE(review): astype("int64") presumably fails if ano_mes has missing values — confirm the column is complete
new = ped.ano_mes.astype("int64").astype("str")
new

In [None]:
# Convert ano_mes using the year-month format "%Y%m" and inspect the first 50 results
from datetime import datetime
date_output = object_to_date(ped.ano_mes, "%Y%m")
date_output.head(50)