# Julia

## Manipulação de dados
### DataFrames

O que veremos nesse tutorial?

1. leitura de dados
2. selecionar linhas
3. selecionar colunas
4. filtro de linhas
5. groupby com estatísticas
6. renomear colunas 
7. dados faltantes


## 1. Leitura de dados

In [2]:
# importando biblioteca
# using Pkg
# Pkg.add("DataFrames")
# Pkg.add("CSV")
# Pkg.add("Queryverse")
# Pkg.add("BenchmarkTools")
# Pkg.add("Pipe")
using DataFrames
using CSV
using Queryverse
using BenchmarkTools
using Pipe

In [5]:
# Read data/sp500.csv into a DataFrame bound to `sp500`.
# The trailing semicolon suppresses the notebook's display of the result.
sp500 = CSV.read("data/sp500.csv", DataFrame);

### Mensurando tempo de execução da leitura dos dados

In [7]:
# Time a single load of the CSV file; the semicolon hides the returned DataFrame.
# NOTE: a first call in a fresh session may also include compilation time.
@time CSV.read("data/sp500.csv", DataFrame);

  0.164272 seconds (497.90 k allocations: 45.073 MiB)


In [8]:
# Benchmark the CSV load: the expression is sampled several times and timing/memory
# statistics are reported.
# NOTE(review): the `df` binding is not used afterwards — confirm whether the assignment
# is needed.
@benchmark df = CSV.read("data/sp500.csv", DataFrame)
# Variant passing an explicit task count (left disabled):
# @benchmark CSV.read("data/sp500.csv", DataFrame, tasks=8)

BenchmarkTools.Trial: 
  memory estimate:  45.07 MiB
  allocs estimate:  497896
  --------------
  minimum time:     123.096 ms (0.00% GC)
  median time:      131.615 ms (0.00% GC)
  mean time:        133.438 ms (1.39% GC)
  maximum time:     150.121 ms (0.00% GC)
  --------------
  samples:          38
  evals/sample:     1

#### Mostrando as 10 primeiras linhas

In [9]:
# Display the first 10 rows of `sp500`.
first(sp500, 10)

Unnamed: 0_level_0,symbol,date,open,high,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64?,Float64?,Float64,Int64
1,AAL,2014-01-02,25.07,25.82,25.06,25.36,8998943
2,AAPL,2014-01-02,79.3828,79.5756,78.8601,79.0185,58791957
3,AAP,2014-01-02,110.36,111.88,109.29,109.74,542711
4,ABBV,2014-01-02,52.12,52.33,51.52,51.98,4569061
5,ABC,2014-01-02,70.11,70.23,69.48,69.89,1148391
6,ABT,2014-01-02,38.09,38.4,38.0,38.23,4967472
7,ACN,2014-01-02,81.5,81.92,81.09,81.13,2405384
8,ADBE,2014-01-02,59.06,59.53,58.94,59.29,2746370
9,ADI,2014-01-02,49.52,49.75,49.04,49.28,2799092
10,ADM,2014-01-02,43.22,43.29,42.79,42.99,2753765


## 2. Selecionar linhas

In [10]:
# Pick rows 1, 10 and 100 (all columns) by passing a vector of row indices.
sp500[[1, 10, 100], :]

Unnamed: 0_level_0,symbol,date,open,high,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64?,Float64?,Float64,Int64
1,AAL,2014-01-02,25.07,25.82,25.06,25.36,8998943
2,ADM,2014-01-02,43.22,43.29,42.79,42.99,2753765
3,CME,2014-01-02,77.96,78.75,77.32,77.96,2070876


In [11]:
# Show `sp500` without rows 1, 10 and 100; `Not` inverts the row selection and
# `sp500` itself is left unchanged.
sp500[Not([1, 10, 100]), :]

# In-place alternative that deletes the rows from `sp500` itself (left disabled).
# NOTE(review): newer DataFrames releases name this `deleteat!` — confirm for the
# installed version.
# delete!(sp500, [1, 10, 100])

Unnamed: 0_level_0,symbol,date,open,high,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64?,Float64?,Float64,Int64
1,AAPL,2014-01-02,79.3828,79.5756,78.8601,79.0185,58791957
2,AAP,2014-01-02,110.36,111.88,109.29,109.74,542711
3,ABBV,2014-01-02,52.12,52.33,51.52,51.98,4569061
4,ABC,2014-01-02,70.11,70.23,69.48,69.89,1148391
5,ABT,2014-01-02,38.09,38.4,38.0,38.23,4967472
6,ACN,2014-01-02,81.5,81.92,81.09,81.13,2405384
7,ADBE,2014-01-02,59.06,59.53,58.94,59.29,2746370
8,ADI,2014-01-02,49.52,49.75,49.04,49.28,2799092
9,ADP,2014-01-02,80.17,80.45,79.38,79.86,1965869
10,ADSK,2014-01-02,49.33,49.74,48.88,49.25,2488043


## 3. Selecionando colunas

In [13]:
# Different spellings for selecting the `close` column.
sp500.close
sp500."close"
sp500[!, :close]
sp500[!, "close"]

# To get a copy of the column instead, index the rows with ":" rather than "!".
sp500[:, :close] # use ":" in place of "!"

497472-element Array{Float64,1}:
  25.36
  79.0185
 109.74
  51.98
  69.89
  38.23
  81.13
  59.29
  49.28
  42.99
  79.86
  49.25
 262.34
   ⋮
 122.01
  48.11
  67.42
  35.16
  83.64
  65.83
  29.15
  68.2
  81.61
 120.67
  50.83
  72.04

In [18]:
# Selecting several columns at once — preferred form.
sp500[!, [:symbol, :close, :volume]]

# Alternative using string column names:
# sp500[!, ["symbol", "close", "volume"]]

# Alternative using `select`:
# select(sp500, [:symbol, :close, :volume])

# In-place variant: use the "!" form (`select!`) to modify `sp500` itself.
# select!(sp500, [:symbol, :close, :volume])

Unnamed: 0_level_0,symbol,close,volume
Unnamed: 0_level_1,String,Float64,Int64
1,AAL,25.36,8998943
2,AAPL,79.0185,58791957
3,AAP,109.74,542711
4,ABBV,51.98,4569061
5,ABC,69.89,1148391
6,ABT,38.23,4967472
7,ACN,81.13,2405384
8,ADBE,59.29,2746370
9,ADI,49.28,2799092
10,ADM,42.99,2753765


In [19]:
# Remove specific columns by negating the column selection with `Not`.
sp500[!, Not([:high, :open])]

# The same column removal written with `select`.
select(sp500, Not([:high, :open]))

Unnamed: 0_level_0,symbol,date,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64,Int64
1,AAL,2014-01-02,25.06,25.36,8998943
2,AAPL,2014-01-02,78.8601,79.0185,58791957
3,AAP,2014-01-02,109.29,109.74,542711
4,ABBV,2014-01-02,51.52,51.98,4569061
5,ABC,2014-01-02,69.48,69.89,1148391
6,ABT,2014-01-02,38.0,38.23,4967472
7,ACN,2014-01-02,81.09,81.13,2405384
8,ADBE,2014-01-02,58.94,59.29,2746370
9,ADI,2014-01-02,49.04,49.28,2799092
10,ADM,2014-01-02,42.79,42.99,2753765


## 4. Selecionando linhas a partir de condições

In [23]:
# Row filtering with DataFrames — several ways to write it.
# Boolean-mask indexing: keep the rows whose volume is below 1000.
sp500[sp500.volume .< 1000, :]

# Alternative: `filter` with a `column => predicate` pair.
# filter(:volume => volume -> volume < 1000, sp500)

# Alternative: `filter` with a predicate that receives each row.
# filter(row -> row.volume < 1000, sp500)


Unnamed: 0_level_0,symbol,date,open,high,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64?,Float64?,Float64,Int64
1,WRK,2015-06-26,missing,missing,missing,61.9,100
2,WRK,2015-06-29,60.92,60.92,60.8,60.8,300
3,DHR,2016-01-12,missing,missing,missing,88.55,0
4,O,2016-01-12,missing,missing,missing,52.43,0
5,UA,2016-04-07,missing,missing,missing,41.56,0
6,FTV,2016-07-01,missing,missing,missing,49.54,0
7,BHF,2017-07-26,missing,missing,missing,69.0842,3
8,BHF,2017-07-28,67.26,67.26,67.26,67.26,101


In [25]:
# Combine two element-wise conditions: volume below 1000 AND close above 60.
sp500[(sp500.volume .< 1000) .& (sp500.close .> 60), :]

# Alternative using `filter` over both columns:
# filter([:volume, :close] => (volume, close) -> volume < 1000 && close > 60, sp500)

Unnamed: 0_level_0,symbol,date,open,high,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64?,Float64?,Float64,Int64
1,WRK,2015-06-26,missing,missing,missing,61.9,100
2,WRK,2015-06-29,60.92,60.92,60.8,60.8,300
3,DHR,2016-01-12,missing,missing,missing,88.55,0
4,BHF,2017-07-26,missing,missing,missing,69.0842,3
5,BHF,2017-07-28,67.26,67.26,67.26,67.26,101


#### Usando Queryverse

In [26]:
# Queryverse version of the volume filter; the result is collected into a DataFrame.
sp500 |> @filter(_.volume < 1000) |> DataFrame

Unnamed: 0_level_0,symbol,date,open,high,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64?,Float64?,Float64,Int64
1,WRK,2015-06-26,missing,missing,missing,61.9,100
2,WRK,2015-06-29,60.92,60.92,60.8,60.8,300
3,DHR,2016-01-12,missing,missing,missing,88.55,0
4,O,2016-01-12,missing,missing,missing,52.43,0
5,UA,2016-04-07,missing,missing,missing,41.56,0
6,FTV,2016-07-01,missing,missing,missing,49.54,0
7,BHF,2017-07-26,missing,missing,missing,69.0842,3
8,BHF,2017-07-28,67.26,67.26,67.26,67.26,101


In [27]:
# Queryverse filter with both conditions joined by `&&`.
sp500 |> @filter(_.volume < 1000 && _.close > 60) |> DataFrame

Unnamed: 0_level_0,symbol,date,open,high,low,close,volume
Unnamed: 0_level_1,String,Date…,Float64?,Float64?,Float64?,Float64,Int64
1,WRK,2015-06-26,missing,missing,missing,61.9,100
2,WRK,2015-06-29,60.92,60.92,60.8,60.8,300
3,DHR,2016-01-12,missing,missing,missing,88.55,0
4,BHF,2017-07-26,missing,missing,missing,69.0842,3
5,BHF,2017-07-28,67.26,67.26,67.26,67.26,101


### Select 

In [None]:
# Select the columns whose names start with "v".
sp500 |> @select(startswith("v")) |> DataFrame

In [None]:
# Leave out the columns whose names start with "v".
sp500 |> @select(!startswith("v")) |> DataFrame

In [None]:
# More than one selector: columns starting with "c" and columns starting with "v".
sp500 |> @select(startswith("c"), startswith("v")) |> DataFrame

## 5. Groupby

#### Calculando estatísticas

In [None]:
using Statistics

In [None]:
# Basic descriptive statistics for `sp500`.
describe(sp500)

In [None]:
# Request specific statistics only: the mean and the standard deviation.
describe(sp500, :mean, :std)

In [None]:
# Group the rows by `symbol`, then compute the mean `volume` of each group.
grupos = groupby(sp500, :symbol)
combine(grupos, :volume => mean)

In [None]:
# Same aggregation as a `@pipe` chain; `_` marks where the previous result is inserted.
@pipe sp500 |> groupby(_, :symbol) |> combine(_, :volume => mean)

# Naming the aggregated column `media`.
@pipe sp500 |> groupby(_, :symbol) |> combine(_, :volume => mean => :media)

In [None]:
# Queryverse version: group by `symbol` and compute the mean `volume` per group.
sp500 |>
    @groupby(_.symbol) |>
    @map({Key = key(_), media = mean(_.volume)}) |>
    DataFrame

In [None]:
# Mean volume per symbol, sorted in ascending order.
@pipe sp500 |>
    groupby(_, :symbol) |>
    combine(_, :volume => mean => :media) |>
    sort(_, :media)

# Mean volume per symbol, sorted in descending order.
@pipe sp500 |>
    groupby(_, :symbol) |>
    combine(_, :volume => mean => :media) |>
    sort(_, :media, rev=true)

In [None]:
# Queryverse version: mean volume per symbol in ascending order.
sp500 |>
    @groupby(_.symbol) |>
    @map({symbol = key(_), media = mean(_.volume)}) |>
    @orderby(_.media) |>
    DataFrame

# Queryverse version: mean volume per symbol in descending order.
sp500 |>
    @groupby(_.symbol) |>
    @map({symbol = key(_), media = mean(_.volume)}) |>
    @orderby_descending(_.media) |>
    DataFrame

In [None]:
# Mean and standard deviation of `close` per symbol, with default output names.
@pipe sp500 |>
    groupby(_, :symbol) |>
    combine(_, :close => mean, :close => std)

# Same aggregation with explicit output names `media` and `desvio`.
@pipe sp500 |>
    groupby(_, :symbol) |>
    combine(_, :close => mean => :media, :close => std => :desvio)

In [None]:
# Per-symbol mean and standard deviation of `close`, stored in `estatisticas`.
estatisticas = sp500 |>
    @groupby(_.symbol) |>
    @map({Key = key(_), mean = mean(_.close), std = std(_.close)}) |>
    DataFrame

## 6. Renomear colunas

In [None]:
# Rename the columns of `estatisticas`: Key -> ticker, mean -> media, std -> desvio.
estatisticas |>
    @rename(:Key => :ticker, :mean => :media, :std => :desvio) |>
    DataFrame

## 7. Dados faltantes

In [None]:
"""
    sum_ismiss(col)

Return the number of `missing` entries in the collection `col` (e.g. one column).

`ismissing` yields a `Bool` for each element; `count` tallies the `true` results
directly, so no temporary Boolean array is built.
"""
sum_ismiss(col) = count(ismissing, col)
# Count the missing values of every column with a comprehension.
[sum_ismiss(col) for col in eachcol(sp500)]

# Same tally using `mapcols`.
mapcols(sum_ismiss, sp500)

In [None]:
# Missing data: drop rows with missing values, considering all columns.
dropmissing(sp500) # all columns

In [None]:
# Drop rows with a missing value in `open` or `high` only; other columns are not checked.
dropmissing(sp500, [:open, :high])