# Pandas

## Estrutura de Dados

### Series

In [1]:
from pandas import Series, DataFrame
import pandas as pd
obj = Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [2]:
print(obj.values)
print(obj.index) #obj.index.values

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [3]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

Comparado com o numpy array, você pode usar rótulos no índice quando seleciona um único valor ou um conjunto de valores:

In [4]:
obj2["a"]

-5

In [5]:
obj2["d"] = 6 #atribuição

In [6]:
obj2[["c", "a", "d"]] #uma lista de índices

c    3
a   -5
d    6
dtype: int64

In [7]:
import numpy as np
print(obj2[obj2 > 0])
print()
print(obj2 * 2) #OPERAÇÃO vetorizada
print()
print(np.exp(obj2))

d    6
b    7
c    3
dtype: int64

d    12
b    14
a   -10
c     6
dtype: int64

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [8]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

Podemos converter a série num dicionário novamente usando o método `to_dict`:

In [9]:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [10]:
states = ['Oregon', 'Texas','California', 'Ohio']
obj4 = Series(sdata, index=states)
obj4

Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
dtype: float64

In [11]:
print(pd.isna(obj4)) #isnull
print(pd.notna(obj4)) # notnull

Oregon        False
Texas         False
California     True
Ohio          False
dtype: bool
Oregon         True
Texas          True
California    False
Ohio           True
dtype: bool


In [12]:
print(obj3)
print()
print(obj4)
print()
obj3 + obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
dtype: float64



California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [13]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
Name: population, dtype: float64

In [14]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [15]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


Para grandes DataFrames, podemos usar os métodos `head()`e `tail()`para visualizar apenas uma parte dos dados:

In [16]:
df.head() #mostra as 5 primeiras linhas

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [17]:
df.tail(2) #mostra as 2 últimas linhas

Unnamed: 0,state,year,pop
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [18]:
df2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                index=['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [19]:
df2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [20]:
print(df['state'])
print()
print(df.year)

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64


In [21]:
print(df2.loc['four']) #label
print()
print(df.iloc[0]) #int

year       2001
state    Nevada
pop         2.4
debt        NaN
Name: four, dtype: object

state    Ohio
year     2000
pop       1.5
Name: 0, dtype: object


In [22]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data.loc['Colorado'] #seleciona a linha cujo índice é Colorado

one      4
two      5
three    6
four     7
Name: Colorado, dtype: int32

In [23]:
data.loc[["Colorado", "New York"]] #seleciona as linhas Colorado e New York

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15


In [24]:
data.loc["Colorado", ["two", "three"]] #seleciona a linha Colorado e as colunas two e three

two      5
three    6
Name: Colorado, dtype: int32

In [25]:
data.iloc[2]#linha 2

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [26]:
data.iloc[[2, 1]] #linhas 2 e 1, nessa ordem

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,4,5,6,7


In [27]:
data.iloc[2, [3, 0, 1]] #linha 2, colunas 3, 0 e 1, nessa ordem

four    11
one      8
two      9
Name: Utah, dtype: int32

In [28]:
data.iloc[[1, 2], [3, 0, 1]] #linhas 1 e 2, colunas 3,0 e 1, nessa ordem

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [29]:
# todas as linhas, as 3 primeiras colunas, desde que seja maior que 5
data.iloc[:, :3][data.three > 5] 

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [30]:
data.loc[data.three >= 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [31]:
df2['debt'] = np.arange(len(df2))
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [32]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df2['debt'] = val
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [33]:
df2['eastern'] = df2.state == 'Ohio'
df2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [34]:
del df2['eastern']
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [35]:
df2.values # df2.to_numpy()

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

In [36]:
obj = Series(range(3), index=['a', 'b', 'c'])
obj.index.values

array(['a', 'b', 'c'], dtype=object)

In [37]:
print('state' in df2.columns)
print(0 in df.index)

True
True


## Eliminando entradas de um dos eixos

In [38]:
import numpy as np
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [39]:
new_obj = obj.drop("c")
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [40]:
obj.drop(["d", "c"])

a    0.0
b    1.0
e    4.0
dtype: float64

In [41]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [42]:
data.drop(index=["Colorado", "Ohio"]) #linhas

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [43]:
data.drop(columns=["two"]) #colunas

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [44]:
data.drop("two", axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [45]:
data.drop(["two", "four"], axis="columns")

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


## Aplicação de Função e Mapeamento

In [46]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [47]:
list('abc')

['a', 'b', 'c']

In [48]:
df = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df)
print()
np.abs(df) #retorna valor absoluto

               b         d         e
Utah    1.269914 -0.516273  0.640807
Ohio   -1.169284  0.782917 -0.146918
Texas   0.388939  0.078892  0.995161
Oregon -0.273228 -0.197676 -0.325484



Unnamed: 0,b,d,e
Utah,1.269914,0.516273,0.640807
Ohio,1.169284,0.782917,0.146918
Texas,0.388939,0.078892,0.995161
Oregon,0.273228,0.197676,0.325484


In [49]:
f = lambda x: x.max() - x.min()
print(df.apply(f))
print()
print(df.apply(f, axis=1))

b    2.439198
d    1.299190
e    1.320645
dtype: float64

Utah      1.786187
Ohio      1.952201
Texas     0.916269
Oregon    0.127808
dtype: float64


In [50]:
def f2(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

df.apply(f2)

Unnamed: 0,b,d,e
min,-1.169284,-0.516273,-0.325484
max,1.269914,0.782917,0.995161


In [51]:
df

Unnamed: 0,b,d,e
Utah,1.269914,-0.516273,0.640807
Ohio,-1.169284,0.782917,-0.146918
Texas,0.388939,0.078892,0.995161
Oregon,-0.273228,-0.197676,-0.325484


In [52]:
format2 = lambda x: '%.2f' % x
df.applymap(format2)

Unnamed: 0,b,d,e
Utah,1.27,-0.52,0.64
Ohio,-1.17,0.78,-0.15
Texas,0.39,0.08,1.0
Oregon,-0.27,-0.2,-0.33


## Ordenação e Ranking

In [53]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
df2 = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(obj)
print()
print(df2)
print()
print(obj.sort_index())
print()
print(df2.sort_index())
print()
print(df2.sort_index(axis=1))

d    0
a    1
b    2
c    3
dtype: int64

       d  a  b  c
three  0  1  2  3
one    4  5  6  7

a    1
b    2
c    3
d    0
dtype: int64

       d  a  b  c
one    4  5  6  7
three  0  1  2  3

       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [54]:
obj = Series([4, 7, -3, 2])
obj.sort_values(ascending=False) #igual para pandas DataFrame

1    7
0    4
3    2
2   -3
dtype: int64

In [55]:
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'nota': [8, 7, 7.5, 10, 8]}
df4 = DataFrame(data)
print(df4)
print()
df4['rank'] = df4['nota'].rank(ascending=0)
df4.sort_values('rank')

    name  nota
0  Jason   8.0
1  Molly   7.0
2   Tina   7.5
3   Jake  10.0
4    Amy   8.0



Unnamed: 0,name,nota,rank
3,Jake,10.0,1.0
0,Jason,8.0,2.5
4,Amy,8.0,2.5
2,Tina,7.5,4.0
1,Molly,7.0,5.0


## Sumarização e Estatística Descritiva

In [56]:
df5 = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df5)
print()
print(df5.sum())
print()
print(df5.sum(axis=1))
print()
print(df5.count())

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

one    9.25
two   -5.80
dtype: float64

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

one    3
two    2
dtype: int64


In [57]:
df5.size

8

In [58]:
df5.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [59]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [60]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [61]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [62]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

## Manipulação de Valores Faltantes 

In [63]:
string_data = Series(['laranja', 'uva', np.nan, 'abacate'])
print(string_data)
print()
print(string_data.isnull())
string_data[0] = None
print()
print(string_data.isnull())

0    laranja
1        uva
2        NaN
3    abacate
dtype: object

0    False
1    False
2     True
3    False
dtype: bool

0     True
1    False
2     True
3    False
dtype: bool


In [64]:
data = DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                  [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
print(data)
cleaned = data.dropna()
print('\n',cleaned)
print()
data.dropna(how='all')

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0

      0    1    2
0  1.0  6.5  3.0



Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [65]:
print(data.fillna(0))
print()
print(data.fillna(data.mean()))

     0    1    2
0  1.0  6.5  3.0
1  1.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  6.5  3.0

     0    1    2
0  1.0  6.5  3.0
1  1.0  6.5  3.0
2  1.0  6.5  3.0
3  1.0  6.5  3.0


## TODO Section

### Manipulação de DataFrame

        > Crie, a partir do dicionário abaixo, um DataFrame cujo index seja os valores da variável labels
        > encontre a média dos valores da coluna age e preencha os valores faltantes dessa coluna com o valor da média
        > crie uma nova coluna chamada 'rank', que mostre os animais que receberam mais visitas
        > qual o animal que recebeu a maior quantidade de visitas? Use o método max()

In [66]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [67]:
# TODO: sua resposta aqui
df = pd.DataFrame(data,index=labels)
print("--------------------- Média Coluna Age -------------",'\n')
df['age'] = df['age'].fillna(np.mean(df['age']))
print(df['age'])
print("--------------------- Rank dos animais -------------",'\n')
df['rank'] = df['visits'].rank()
df = df.sort_values(by = 'rank', ascending=False)
print(df.head())
print("--------------------- Quem recebeu mais visitas? --------------------",'\n')
df[df['visits']==(df['visits'].max())]

--------------------- Média Coluna Age ------------- 

a    2.5000
b    3.0000
c    0.5000
d    3.4375
e    5.0000
f    2.0000
g    4.5000
h    3.4375
i    7.0000
j    3.0000
Name: age, dtype: float64
--------------------- Rank dos animais ------------- 

  animal     age  visits priority  rank
b    cat  3.0000       3      yes   9.0
d    dog  3.4375       3      yes   9.0
f    cat  2.0000       3       no   9.0
c  snake  0.5000       2       no   6.0
e    dog  5.0000       2       no   6.0
--------------------- Quem recebeu mais visitas? -------------------- 



Unnamed: 0,animal,age,visits,priority,rank
b,cat,3.0,3,yes,9.0
d,dog,3.4375,3,yes,9.0
f,cat,2.0,3,no,9.0


## Carregamento e Armazenamento de Dados

### Arquivo CSV

In [68]:
import pandas as pd
poke = pd.read_csv('bases/Pokemon.csv')
poke.head(n=10)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
9,7,Squirtle,Water,,314,44,48,65,50,64,43,1,False


### Arquivo JSON

In [69]:
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
print(type(obj))
print(obj)

<class 'str'>

{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}



In [70]:
import json
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

In [71]:
type(result)

dict

In [72]:
asjson = json.dumps(result)
print(type(asjson))
asjson

<class 'str'>


'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}]}'

In [73]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


## Combinação de Dados

In [74]:
import pandas as pd

In [75]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})

df2 = pd.DataFrame({'key': ['a', 'b', 'd','b'],
                 'data2': range(4)})

pd.merge(df1,df2) #default inner

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,6,1
5,b,6,3
6,a,2,0
7,a,4,0
8,a,5,0


In [76]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})

df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [77]:
df3

Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [78]:
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


In [79]:
pd.merge(df3, df4, how='outer',left_on='lkey',right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


In [80]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [81]:
import numpy as np
arr = np.arange(12).reshape((3, 4))
print(arr)
print()
np.concatenate([arr, arr], axis=1)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]



array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [82]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [83]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), 
                index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), 
                index=['a', 'c'],
                columns=['three', 'four'])
print(df1)
print()
print(df2)
print()
pd.concat([df1, df2], axis=1)

   one  two
a    0    1
b    2    3
c    4    5

   three  four
a      5     6
c      7     8



Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


## TODO Section

### Manipulação de Dados usando Pandas

Usando o dataset Pokemon.csv, faça:

    1) Verifique em qual(is) coluna(s) existem valores faltantes
    2) Preencha os valores faltantes da coluna Type 2 com os valores correspondentes da coluna Type 1
    3) Crie um DataFrame a partir dos dados originais contendo apenas pokemons lendários. Imprima os 5 primeiros
    4) Use apply/applymap para passar todos os valores das colunas Name, Type 1 e Type 2 para minúscula

# Resolver os Exercícios até as 20:00h

In [84]:
poke = pd.read_csv('bases/Pokemon.csv')
poke.head(n=10)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
9,7,Squirtle,Water,,314,44,48,65,50,64,43,1,False


In [85]:
# Resposta 1
print(poke.info()) # forma 1
poke.isnull().sum() # forma 2

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    int64 
 12  Legendary   800 non-null    bool  
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB
None


#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [86]:
# Resposta 2
poke['Type 2'].fillna(poke['Type 1'],inplace=True)
#poke['Type 2'] = poke['Type 2'].fillna(poke['Type 1'])
poke.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,Fire,309,39,52,43,60,50,65,1,False


In [87]:
# Resposta 3
poke_legendary = poke[poke['Legendary'] == True ]
poke_legendary.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
156,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
157,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
158,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
162,150,Mewtwo,Psychic,Psychic,680,106,110,90,154,90,130,1,True
163,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True


In [90]:
poke_legendary['Legendary'].unique()

array([ True])

In [89]:
# Resposta 4
#format_str = lambda x: x.lower()
poke[['Name','Type 1','Type 2']] = poke[['Name','Type 1','Type 2']].applymap(lambda x: x.lower())
poke.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,bulbasaur,grass,poison,318,45,49,49,65,65,45,1,False
1,2,ivysaur,grass,poison,405,60,62,63,80,80,60,1,False
2,3,venusaur,grass,poison,525,80,82,83,100,100,80,1,False
3,3,venusaurmega venusaur,grass,poison,625,80,100,123,122,120,80,1,False
4,4,charmander,fire,fire,309,39,52,43,60,50,65,1,False
