# Usando DataFrames

In [71]:
import pandas as pd
import numpy as np

**Criando um dataFrame**<br>
A variável 'df' vai receber um DataFrame de 5 linhas e 4 colunas, preenchido com números aleatórios gerados pelo numpy. Os rótulos das linhas são definidos por 'index' e os das colunas por 'columns'.

In [72]:
# Seed NumPy's global generator so the random values are identical on every
# Restart & Run All, keeping the numbers shown in later cells reproducible.
np.random.seed(42)

# np.random.randn(5, 4) draws a 5x4 array of standard-normal samples;
# rows are labelled A-E and columns W-Z.
df = pd.DataFrame(
    np.random.randn(5, 4),
    index='A B C D E'.split(),
    columns='W X Y Z'.split(),
)

In [73]:
# Display the DataFrame created in the previous cell
df

Unnamed: 0,W,X,Y,Z
A,-0.048097,-0.491038,0.926976,-0.094256
B,0.373779,-0.921765,-0.263152,0.677531
C,-1.174026,0.027018,-0.047241,3.077655
D,0.17559,-0.88865,-1.449698,0.995803
E,-0.535595,1.391895,0.033811,-2.147037


**Seleção**

Selecionando colunas

In [74]:
# Select the single column 'X'; the result is a Series indexed by the row labels
df['X']

A   -0.491038
B   -0.921765
C    0.027018
D   -0.888650
E    1.391895
Name: X, dtype: float64

In [75]:
# Passing a list of labels selects several columns at once and returns a DataFrame
df[['W', 'Z']]

Unnamed: 0,W,Z
A,-0.048097,-0.094256
B,0.373779,0.677531
C,-1.174026,3.077655
D,0.17559,0.995803
E,-0.535595,-2.147037


Selecionando linhas

In [76]:
# Select the row labelled 'A' (label-based lookup); the result is a Series
df.loc['A']

W   -0.048097
X   -0.491038
Y    0.926976
Z   -0.094256
Name: A, dtype: float64

In [77]:
# Select the same row, this time by integer position (0 = first row)
df.iloc[0]

W   -0.048097
X   -0.491038
Y    0.926976
Z   -0.094256
Name: A, dtype: float64

**Seleção condicional**

In [78]:
# Element-wise comparison: yields a boolean DataFrame that is True where the value is greater than 0
df > 0

Unnamed: 0,W,X,Y,Z
A,False,False,True,False
B,True,False,False,True
C,False,True,False,True
D,True,False,False,True
E,False,True,True,False


In [79]:
# Indexing with the boolean mask keeps the values greater than 0; every other cell becomes NaN
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,,,0.926976,
B,0.373779,,,0.677531
C,,0.027018,,3.077655
D,0.17559,,,0.995803
E,,1.391895,0.033811,


**Criando uma nova coluna**

In [80]:
# The new column 'Nova' is the element-wise sum of columns 'X' and 'Y'
df['Nova'] = df['X'] + df['Y']

In [81]:
# Display df again to confirm that the 'Nova' column was added
df

Unnamed: 0,W,X,Y,Z,Nova
A,-0.048097,-0.491038,0.926976,-0.094256,0.435938
B,0.373779,-0.921765,-0.263152,0.677531,-1.184918
C,-1.174026,0.027018,-0.047241,3.077655,-0.020224
D,0.17559,-0.88865,-1.449698,0.995803,-2.338348
E,-0.535595,1.391895,0.033811,-2.147037,1.425706


**Excluindo a coluna**

In [82]:
# Remove the 'Nova' column by rebinding df to the frame returned by drop(),
# instead of mutating the existing object with inplace=True.
# columns= names the axis explicitly, replacing the less readable axis=1.
df = df.drop(columns='Nova')

In [83]:
# Display df again to confirm that the 'Nova' column is gone
df

Unnamed: 0,W,X,Y,Z
A,-0.048097,-0.491038,0.926976,-0.094256
B,0.373779,-0.921765,-0.263152,0.677531
C,-1.174026,0.027018,-0.047241,3.077655
D,0.17559,-0.88865,-1.449698,0.995803
E,-0.535595,1.391895,0.033811,-2.147037


**Lendo arquivos** <br>
Agora vamos ler um arquivo como base de dados para este dataframe

In [84]:
# Load 'iris.csv' into a new DataFrame; the relative path is resolved
# against the kernel's current working directory
df2 = pd.read_csv('iris.csv')

In [85]:
# Display df2; pandas abbreviates long frames, showing only the first and last rows
df2

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [86]:
# Data type of each column
df2.dtypes

sepal length    float64
sepal width     float64
petal length    float64
petal width     float64
class            object
dtype: object

Unnamed: 0,W,X,Y,Z
count,5.0,5.0,5.0,5.0
mean,-0.24167,-0.176508,-0.159861,0.501939
std,0.621698,0.956921,0.852314,1.889548
min,-1.174026,-0.921765,-1.449698,-2.147037
25%,-0.535595,-0.88865,-0.263152,-0.094256
50%,-0.048097,-0.491038,-0.047241,0.677531
75%,0.17559,0.027018,0.033811,0.995803
max,0.373779,1.391895,0.926976,3.077655
