# Usando DataFrames

In [1]:
import pandas as pd
import numpy as np

**Criando um dataFrame**<br>
A variável 'df' vai receber um DataFrame de 5 linhas e 4 colunas, preenchido com números aleatórios gerados pelo NumPy. Os rótulos das linhas são definidos por 'index' e os das colunas por 'columns'.

In [2]:
# Build a 5x4 DataFrame of standard-normal samples (rows A-E, columns W-Z).
# A dedicated, seeded generator makes "Restart Kernel -> Run All" rebuild
# exactly the same table every time.
# NOTE: outputs stored in the notebook before the seed was introduced show
# different numbers; re-run the notebook to refresh them.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.standard_normal((5, 4)),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [3]:
# Show the newly created DataFrame as the cell's output.
df

Unnamed: 0,W,X,Y,Z
A,-0.418667,-0.559939,-0.728118,-0.044367
B,0.095032,-0.60094,-0.55204,-0.551755
C,0.918204,1.398687,1.165339,0.587494
D,-0.822955,0.331823,0.288379,-0.562696
E,0.044282,-1.562635,-0.624833,-0.375206


**Seleção**

Selecionando colunas

In [4]:
# Pick out the single column 'X' (all rows); the result is a Series
# indexed by the row labels.
df.loc[:, 'X']

A   -0.559939
B   -0.600940
C    1.398687
D    0.331823
E   -1.562635
Name: X, dtype: float64

In [5]:
# Pick out several columns at once by passing a list of labels;
# the result is a DataFrame containing only 'W' and 'Z'.
df.loc[:, ['W', 'Z']]

Unnamed: 0,W,Z
A,-0.418667,-0.044367
B,0.095032,-0.551755
C,0.918204,0.587494
D,-0.822955,-0.562696
E,0.044282,-0.375206


Selecionando linhas

In [6]:
# Label-based row selection: row 'A' across every column, returned as a Series.
df.loc['A', :]

W   -0.418667
X   -0.559939
Y   -0.728118
Z   -0.044367
Name: A, dtype: float64

In [7]:
# Select the same row ('A'), this time by integer position rather than by label.
df.iloc[0, :]

W   -0.418667
X   -0.559939
Y   -0.728118
Z   -0.044367
Name: A, dtype: float64

**Seleção condicional**

In [8]:
# Element-wise test for values greater than 0; the result is a DataFrame
# of booleans with the same shape as `df`.
df.gt(0)

Unnamed: 0,W,X,Y,Z
A,False,False,False,False
B,True,False,False,False
C,True,True,True,True
D,False,True,True,False
E,True,False,False,False


In [9]:
# Keep only the values greater than 0; every other cell becomes NaN.
df.where(df.gt(0))

Unnamed: 0,W,X,Y,Z
A,,,,
B,0.095032,,,
C,0.918204,1.398687,1.165339,0.587494
D,,0.331823,0.288379,
E,0.044282,,,


**Criando uma nova coluna**

In [10]:
# The new column 'Nova' holds the element-wise sum of columns 'X' and 'Y'.
df['Nova'] = df['X'].add(df['Y'])

In [11]:
# Show the DataFrame again to confirm that the 'Nova' column was added.
df

Unnamed: 0,W,X,Y,Z,Nova
A,-0.418667,-0.559939,-0.728118,-0.044367,-1.288057
B,0.095032,-0.60094,-0.55204,-0.551755,-1.15298
C,0.918204,1.398687,1.165339,0.587494,2.564025
D,-0.822955,0.331823,0.288379,-0.562696,0.620202
E,0.044282,-1.562635,-0.624833,-0.375206,-2.187468


**Excluindo a coluna**

In [12]:
# Remove the 'Nova' column by rebinding `df` to the frame returned by drop().
# This avoids `inplace=True` (no benefit, prevents chaining), and
# `columns=` names the axis explicitly instead of relying on `axis=1`.
df = df.drop(columns='Nova')

In [13]:
# Show the DataFrame again to confirm that the 'Nova' column is gone.
df

Unnamed: 0,W,X,Y,Z
A,-0.418667,-0.559939,-0.728118,-0.044367
B,0.095032,-0.60094,-0.55204,-0.551755
C,0.918204,1.398687,1.165339,0.587494
D,-0.822955,0.331823,0.288379,-0.562696
E,0.044282,-1.562635,-0.624833,-0.375206


**Lendo arquivos** <br>
Agora vamos ler um arquivo CSV como base de dados para um novo DataFrame.

In [14]:
# Load the Iris data from a CSV file; the relative path is resolved
# against the current working directory.
df2 = pd.read_csv(filepath_or_buffer='iris.csv')

In [15]:
# Show the DataFrame loaded from the CSV file as the cell's output.
df2

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [16]:
# Data type of each column
df2.dtypes

sepal length    float64
sepal width     float64
petal length    float64
petal width     float64
class            object
dtype: object