In [9]:
import pandas as pd
import random

In [2]:
# Load the Titanic passenger table from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
# Preview the first five rows.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Funções úteis

Imagine que você tem uma coluna que possui diferentes valores e você gostaria de saber quantas vezes cada valor aparece. Uma solução seria usar um filtro para cada valor e, aí então, contar a quantidade de linhas. Contudo, para muitos valores isso fica inviável. A solução para isso é usar o `value_counts`!

### value_counts

Quantos homens e mulheres estavam no titanic?

In [4]:
# Manual approach: keep only the rows where 'Sex' is 'female' and count them.
len(df[df['Sex'].eq('female')])

314

In [5]:
# Same manual approach for 'male': one filter per distinct value does not scale.
len(df[df['Sex'].eq('male')])

577

In [6]:
# One call returns the number of rows for every distinct value of 'Sex'.
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
# normalize=True returns each value's share of the rows instead of raw counts.
df['Sex'].value_counts(normalize=True)

male      0.647587
female    0.352413
Name: Sex, dtype: float64

### cumsum

Muitas vezes queremos saber o valor da soma cumulativa, como por exemplo em problemas que envolvem datas. Imagine que você queira saber o total que você gastou em um mês, mas queira visualizar isso ao longo dos dias daquele mês. O que você pode fazer é ir somando o quanto você gastou cada dia e ir gerando resultados parciais.

In [10]:
# Simulate one month of spending: 30 random integer amounts, each in [10, 100].
# NOTE: no random seed is set, so the amounts differ on every run.
gastos = dict(meus_gastos=[random.randint(10, 100) for _ in range(30)])
gastos

{'meus_gastos': [22,
  37,
  54,
  19,
  37,
  56,
  83,
  36,
  21,
  84,
  100,
  50,
  34,
  63,
  100,
  89,
  57,
  21,
  91,
  28,
  48,
  99,
  59,
  100,
  16,
  98,
  45,
  43,
  84,
  29]}

In [12]:
# Turn the dict of daily amounts into a single-column DataFrame.
df_meus_gastos = pd.DataFrame(data=gastos)
# Show the first five days.
df_meus_gastos.head(5)

Unnamed: 0,meus_gastos
0,22
1,37
2,54
3,19
4,37


In [13]:
# Running total: each position holds the sum of all amounts up to that row.
df_meus_gastos['meus_gastos'].cumsum()

0       22
1       59
2      113
3      132
4      169
5      225
6      308
7      344
8      365
9      449
10     549
11     599
12     633
13     696
14     796
15     885
16     942
17     963
18    1054
19    1082
20    1130
21    1229
22    1288
23    1388
24    1404
25    1502
26    1547
27    1590
28    1674
29    1703
Name: meus_gastos, dtype: int64

## Manipulação de DataFrames

### Concat

Permite que você concatene dois DataFrames, mas não permite que você selecione uma coluna para usar como chave na hora de juntar, por exemplo. É muito usado para empilhar ou enfileirar DataFrames.

In [14]:
# Look at the last rows to see where a new passenger would be appended.
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [15]:
# A single new passenger, written as column -> one-element list so that
# pd.DataFrame() builds a one-row frame with the same columns as `df`.
novo_passageiro = {
    'PassengerId': [892],
    'Survived': [1],
    'Pclass': [3],
    'Name': ['Carrara, Agostinho'],
    'Sex': ['male'],
    'Age': [39],
    'SibSp': [0],
    'Parch': [0],
    # Ticket is kept as text: the existing Ticket column holds strings such as
    # 'A/5 21171', and an int here would leave a mixed str/int column after concat.
    'Ticket': ['56784'],
    'Fare': [7.75],
    'Cabin': ['B12'],
    'Embarked': ['Q']
}
novo_passageiro

{'PassengerId': [892],
 'Survived': [1],
 'Pclass': [3],
 'Name': ['Carrara, Agostinho'],
 'Sex': ['male'],
 'Age': [39],
 'SibSp': [0],
 'Parch': [0],
 'Ticket': [56784],
 'Fare': [7.75],
 'Cabin': ['B12'],
 'Embarked': ['Q']}

In [17]:
# Build a one-row DataFrame from the new passenger's dict.
df_novo_passageiro = pd.DataFrame(data=novo_passageiro)
df_novo_passageiro

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,1,3,"Carrara, Agostinho",male,39,0,0,56784,7.75,B12,Q


In [20]:
# Stack the new passenger under the existing rows (row-wise concat).
# ignore_index=True renumbers the result 0..n-1 instead of repeating label 0.
df_novo = pd.concat(
    objs=[df, df_novo_passageiro],
    axis=0,
    ignore_index=True,
)

In [22]:
# The original frame is untouched by concat: same (rows, columns) as before.
df.shape

(891, 12)

In [21]:
# The concatenated frame has one more row and the same number of columns.
df_novo.shape

(892, 12)

In [23]:
# Two synthetic columns with one value per row of `df`:
# coluna1 counts 0..n-1 and coluna2 is the same sequence shifted by 50.
novas_colunas = {
    'coluna1': list(range(len(df))),
    'coluna2': list(range(50, len(df) + 50)),
}

df_novas_colunas = pd.DataFrame(data=novas_colunas)
df_novas_colunas.head(5)

Unnamed: 0,coluna1,coluna2
0,0,50
1,1,51
2,2,52
3,3,53
4,4,54


In [25]:
# Reference shape of `df` before placing the new columns next to it.
df.shape

(891, 12)

In [26]:
# Column-wise concat: put the new columns side by side with `df`,
# aligning the rows on the index labels.
pd.concat(objs=[df, df_novas_colunas], axis='columns')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,coluna1,coluna2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0,50
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,51
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,2,52
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,3,53
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,4,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,886,936
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,887,937
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,888,938
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,889,939


### Merge

É como se fosse uma operação de join em banco de dados onde você especifica os DataFrames, quais são as colunas que serão usadas como chave e o tipo de join.

![](./imgs/merge.png)

Imagine que temos uma outra base do titanic, com informações novas.

In [37]:
# Second Titanic table: one height ('Altura') per passenger, keyed by PassengerId.
df_alturas = pd.read_csv(filepath_or_buffer='titanic_altura.csv')
df_alturas.head(5)

Unnamed: 0,PassengerId,Name,Altura
0,1,"Braund, Mr. Owen Harris",162
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",160
2,3,"Heikkinen, Miss. Laina",171
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",188
4,5,"Allen, Mr. William Henry",161


In [42]:
# Sampling as many rows as the frame has (without replacement) shuffles it.
df_alturas.sample(n=len(df_alturas))

Unnamed: 0,PassengerId,Name,Altura
719,22,"Beesley, Mr. Lawrence",193
584,812,"Lester, Mr. James",171
277,79,"Caldwell, Master. Alden Gates",185
732,338,"Burns, Miss. Elizabeth Margaret",175
409,874,"Vander Cruyssen, Mr. Victor",160
...,...,...,...
245,711,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",183
565,177,"Lefebre, Master. Henry Forbes",160
13,856,"Aks, Mrs. Sam (Leah Rosen)",195
649,391,"Carter, Mr. William Ernest",176


In [38]:
# Shuffle the rows, then discard the old index labels so the frame is
# numbered 0..n-1 again and no longer lines up row-for-row with `df`.
# NOTE: no random_state is given, so the order differs on every run.
df_alturas = (
    df_alturas
    .sample(n=len(df_alturas))
    .reset_index(drop=True)
)
df_alturas.head(5)

Unnamed: 0,PassengerId,Name,Altura
0,62,"Icard, Miss. Amelie",184
1,311,"Hays, Miss. Margaret Bechstein",179
2,310,"Francatelli, Miss. Laura Mabel",187
3,600,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",170
4,607,"Karaic, Mr. Milan",176


In [45]:
# Overwrite the first row's PassengerId (the first column, per the preview above)
# with 1, creating a duplicated key on purpose for the merge demo.
df_alturas.iloc[0, df_alturas.columns.get_loc('PassengerId')] = 1

In [46]:
# Confirm that the first row now carries PassengerId 1.
df_alturas.head()

Unnamed: 0,PassengerId,Name,Altura
0,1,"Icard, Miss. Amelie",184
1,311,"Hays, Miss. Margaret Bechstein",179
2,310,"Francatelli, Miss. Laura Mabel",187
3,600,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",170
4,607,"Karaic, Mr. Milan",176


In [47]:
# Left-hand table of the upcoming merge, for comparison.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [49]:
# Every height row whose key is 1: the genuine one plus the one overwritten earlier.
df_alturas.loc[df_alturas['PassengerId'].eq(1)]

Unnamed: 0,PassengerId,Name,Altura
0,1,"Icard, Miss. Amelie",184
779,1,"Braund, Mr. Owen Harris",162


In [48]:
# Inner join on PassengerId: only keys present in both tables are kept, and a
# key that appears twice on the right yields two rows in the result.
df.merge(right=df_alturas, how='inner', on='PassengerId')

Unnamed: 0,PassengerId,Survived,Pclass,Name_x,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_y,Altura
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,"Icard, Miss. Amelie",184
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,"Braund, Mr. Owen Harris",162
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,"Cumings, Mrs. John Bradley (Florence Briggs Th...",160
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,"Heikkinen, Miss. Laina",171
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,"Montvila, Rev. Juozas",184
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,"Graham, Miss. Margaret Edith",194
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,"Johnston, Miss. Catherine Helen ""Carrie""",161
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,"Behr, Mr. Karl Howell",189


In [51]:
# Shapes of both tables, shown together as a tuple, before rows are removed.
df_alturas.shape, df.shape

((891, 3), (891, 12))

In [52]:
# Drop the first row and the last nine rows (by position) so that some
# passengers in `df` no longer have a matching height record.
# .copy() makes the result an independent DataFrame: assigning into a bare
# iloc slice later would trigger SettingWithCopyWarning and leave it unclear
# whether the write reaches the original data.
df_alturas = df_alturas.iloc[1:-9].copy()
df_alturas.shape

(881, 3)

In [56]:
# Left join: keep every row of `df`; passengers without a height record get
# missing values in the columns coming from `df_alturas`.
df_merged = df.merge(right=df_alturas, how='left', on='PassengerId')

In [59]:
# Rows of the left join that found no match on the right (Altura is missing).
df_merged.loc[df_merged['Altura'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name_x,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_y,Altura
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,,
138,139,0,3,"Osen, Mr. Olaf Elon",male,16.0,0,0,7534,9.2167,,S,,
141,142,1,3,"Nysten, Miss. Anna Sofia",female,22.0,0,0,347081,7.75,,S,,
288,289,1,2,"Hosono, Mr. Masabumi",male,42.0,0,0,237798,13.0,,S,,
376,377,1,3,"Landergren, Miss. Aurora Adelia",female,22.0,0,0,C 7077,7.25,,S,,
404,405,0,3,"Oreskovic, Miss. Marija",female,20.0,0,0,315096,8.6625,,S,,
449,450,1,1,"Peuchen, Major. Arthur Godfrey",male,52.0,0,0,113786,30.5,C104,S,,
687,688,0,3,"Dakic, Mr. Branko",male,19.0,0,0,349228,10.1708,,S,,
797,798,1,3,"Osman, Mrs. Mara",female,31.0,0,0,349244,8.6833,,S,,
799,800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,,


In [61]:
# Right join: keep every row of `df_alturas` and attach the matching `df` columns.
df_merged = df.merge(right=df_alturas, how='right', on='PassengerId')
df_merged.shape

(881, 14)

In [62]:
# After the trim the index labels start at 1 (label 0 was dropped).
df_alturas.head()

Unnamed: 0,PassengerId,Name,Altura
1,311,"Hays, Miss. Margaret Bechstein",179
2,310,"Francatelli, Miss. Laura Mabel",187
3,600,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",170
4,607,"Karaic, Mr. Milan",176
5,506,"Penasco y Castellana, Mr. Victor de Satode",189


In [63]:
# Give the first remaining row a PassengerId (the first column, per the preview
# above) of 999, a key the preview shows is not the one it had, so the
# outer join has a right-only row.
df_alturas.iloc[0, df_alturas.columns.get_loc('PassengerId')] = 999
df_alturas.head(5)

Unnamed: 0,PassengerId,Name,Altura
1,999,"Hays, Miss. Margaret Bechstein",179
2,310,"Francatelli, Miss. Laura Mabel",187
3,600,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",170
4,607,"Karaic, Mr. Milan",176
5,506,"Penasco y Castellana, Mr. Victor de Satode",189


In [68]:
# Outer join: keep the keys from both tables; whichever side has no match is
# filled with missing values.
df.merge(right=df_alturas, how='outer', on='PassengerId')

Unnamed: 0,PassengerId,Survived,Pclass,Name_x,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_y,Altura
0,1,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.2500,,S,"Braund, Mr. Owen Harris",162.0
1,2,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,"Cumings, Mrs. John Bradley (Florence Briggs Th...",160.0
2,3,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.9250,,S,"Heikkinen, Miss. Laina",171.0
3,4,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1000,C123,S,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",188.0
4,5,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.0500,,S,"Allen, Mr. William Henry",161.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,888,1.0,1.0,"Graham, Miss. Margaret Edith",female,19.0,0.0,0.0,112053,30.0000,B42,S,"Graham, Miss. Margaret Edith",194.0
888,889,0.0,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1.0,2.0,W./C. 6607,23.4500,,S,"Johnston, Miss. Catherine Helen ""Carrie""",161.0
889,890,1.0,1.0,"Behr, Mr. Karl Howell",male,26.0,0.0,0.0,111369,30.0000,C148,C,"Behr, Mr. Karl Howell",189.0
890,891,0.0,3.0,"Dooley, Mr. Patrick",male,32.0,0.0,0.0,370376,7.7500,,Q,"Dooley, Mr. Patrick",166.0


### Groupby

Usado quando queremos extrair informações baseadas em determinados grupos, por exemplo: queremos saber a média de idade por sexo, ou, quero saber quantos sobreviveram por sexo, etc.

In [69]:
# Reminder of the columns available for grouping.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [70]:
# groupby on its own displays only a DataFrameGroupBy object; an aggregation
# has to be applied to it to get a table of results.
df.groupby(by=['Sex'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B6FFE963D0>

In [None]:
# Conceptual illustration only: grouping by 'Sex' can be pictured as a mapping
# from each group key to the sub-DataFrame holding that group's rows.
# Empty frames stand in for the real groups so the cell runs on a fresh kernel.
grupos_por_sexo = {
    'female': pd.DataFrame(),
    'male': pd.DataFrame()
}
grupos_por_sexo

In [78]:
# Survivors per sex: 'Survived' is 0/1 in the preview, so summing it inside
# each group counts the survivors.
df.groupby(by=['Sex'])['Survived'].agg('sum')

Sex
female    233
male      109
Name: Survived, dtype: int64

### Pivot

Pivot tables são formas de agregar nossos dados para facilitar a visualização.

![](./imgs/reshaping_pivot.png)

In [79]:
# Toy sales log: weekday, product sold, and a random sale value in [50, 1000].
# NOTE: no random seed is set, so the values differ on every run.
inflacao = dict(
    dia_semana=['segunda', 'terca', 'segunda', 'quarta', 'quinta', 'segunda', 'terca', 'sexta'],
    produto=['radio', 'barril', 'iphone11', 'playstation 7', 'airfryer', 'barril', 'freezer', 'caneca'],
    valor_vendido=[random.randint(50, 1000) for _ in range(8)],
)

In [80]:
# Long-format table: one row per sale.
df_inflacao = pd.DataFrame(data=inflacao)
df_inflacao

Unnamed: 0,dia_semana,produto,valor_vendido
0,segunda,radio,482
1,terca,barril,539
2,segunda,iphone11,595
3,quarta,playstation 7,625
4,quinta,airfryer,837
5,segunda,barril,761
6,terca,freezer,534
7,sexta,caneca,341


In [83]:
# Reshape long -> wide: one row per weekday, one column per product.
# With no `values` given, the remaining column (valor_vendido) fills the cells
# and appears as the top level of the column header.
pd.pivot(df_inflacao, index='dia_semana', columns='produto')

Unnamed: 0_level_0,valor_vendido,valor_vendido,valor_vendido,valor_vendido,valor_vendido,valor_vendido,valor_vendido
produto,airfryer,barril,caneca,freezer,iphone11,playstation 7,radio
dia_semana,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
quarta,,,,,,625.0,
quinta,837.0,,,,,,
segunda,,761.0,,,595.0,,482.0
sexta,,,341.0,,,,
terca,,539.0,,534.0,,,


### Pivot_table

In [84]:
# Country-level dataset fetched over the network through a shortened URL;
# this cell needs internet access to run.
data_url = 'http://bit.ly/2cLzoxH'
df_countries = pd.read_csv(filepath_or_buffer=data_url)
df_countries.head(5)

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
0,Afghanistan,1952,8425333.0,Asia,28.801,779.445314
1,Afghanistan,1957,9240934.0,Asia,30.332,820.85303
2,Afghanistan,1962,10267083.0,Asia,31.997,853.10071
3,Afghanistan,1967,11537966.0,Asia,34.02,836.197138
4,Afghanistan,1972,13079460.0,Asia,36.088,739.981106


In [85]:
# Keep only the three columns needed for the pivot_table example.
df1 = df_countries.loc[:, ['year', 'continent', 'lifeExp']]
df1.head(5)

Unnamed: 0,year,continent,lifeExp
0,1952,Asia,28.801
1,1957,Asia,30.332
2,1962,Asia,31.997
3,1967,Asia,34.02
4,1972,Asia,36.088


In [91]:
# Mean life expectancy per year (rows) and continent (columns).
# aggfunc='mean' is pivot_table's default, spelled out here for clarity.
df1.pivot_table(
    values='lifeExp',
    index=['year'],
    columns='continent',
    aggfunc='mean',
)

continent,Africa,Americas,Asia,Europe,Oceania
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1952,39.1355,53.27984,46.314394,64.4085,69.255
1957,41.266346,55.96028,49.318544,66.703067,70.295
1962,43.319442,58.39876,51.563223,68.539233,71.085
1967,45.334538,60.41092,54.66364,69.7376,71.31
1972,47.450942,62.39492,57.319269,70.775033,71.91
1977,49.580423,64.39156,59.610556,71.937767,72.855
1982,51.592865,66.22884,62.617939,72.8064,74.29
1987,53.344788,68.09072,64.851182,73.642167,75.32
1992,53.629577,69.56836,66.537212,74.4401,76.945
1997,53.598269,71.15048,68.020515,75.505167,78.19


In [87]:
# The same means as the pivot_table above, in long form: a Series indexed by
# (year, continent).
df1.groupby(by=['year', 'continent'])['lifeExp'].agg('mean')

year  continent
1952  Africa       39.135500
      Americas     53.279840
      Asia         46.314394
      Europe       64.408500
      Oceania      69.255000
1957  Africa       41.266346
      Americas     55.960280
      Asia         49.318544
      Europe       66.703067
      Oceania      70.295000
1962  Africa       43.319442
      Americas     58.398760
      Asia         51.563223
      Europe       68.539233
      Oceania      71.085000
1967  Africa       45.334538
      Americas     60.410920
      Asia         54.663640
      Europe       69.737600
      Oceania      71.310000
1972  Africa       47.450942
      Americas     62.394920
      Asia         57.319269
      Europe       70.775033
      Oceania      71.910000
1977  Africa       49.580423
      Americas     64.391560
      Asia         59.610556
      Europe       71.937767
      Oceania      72.855000
1982  Africa       51.592865
      Americas     66.228840
      Asia         62.617939
      Europe       72.80640

### Melt

![](./imgs/reshaping_melt.png)

In [92]:
# Small wide-format table used to demonstrate melt.
to_melt = dict(
    Atendimentos=[60, 10, 100, 50],
    Nome=['Jose', 'Maria', 'Felipe', 'Joaquina'],
    Taxa=[0.7, 0.8, 0.4, 0.3],
)
df_to_melt = pd.DataFrame(data=to_melt)
df_to_melt

Unnamed: 0,Atendimentos,Nome,Taxa
0,60,Jose,0.7
1,10,Maria,0.8
2,100,Felipe,0.4
3,50,Joaquina,0.3


In [93]:
# Wide -> long: with no id_vars, every column is unpivoted into
# (variable, value) pairs.
pd.melt(df_to_melt)

Unnamed: 0,variable,value
0,Atendimentos,60
1,Atendimentos,10
2,Atendimentos,100
3,Atendimentos,50
4,Nome,Jose
5,Nome,Maria
6,Nome,Felipe
7,Nome,Joaquina
8,Taxa,0.7
9,Taxa,0.8


## Transformação de dados

In [None]:
# Examples of how individual values are classified: text labels are
# categorical, numbers are numeric. Written as a dict so the cell is valid
# Python and displays the same mapping.
exemplos_de_tipos = {
    "crianca": "categorico",
    2: "numerico",
    "adulto": "categorico",
    9: "numerico",
    10: "numerico",
}
exemplos_de_tipos

In [94]:
# The numeric column that will be turned into categories (it contains NaN).
df['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

### cut

Divide o intervalo total (max - min) em n subintervalos de mesma largura, sem se preocupar com quantos elementos estarão em cada subintervalo.

In [102]:
# Labels for the three age bins, listed from the youngest bin to the oldest.
grupos_idade = ['crianca', 'adulto', 'idoso']

In [103]:
# Split the Age range into 3 bins and label them; the result is stored as a
# new column on `df`.
df['Age categorico'] = pd.cut(
    x=df['Age'],
    bins=3,
    labels=grupos_idade,
)

In [105]:
# Passengers per bin: the bins from cut end up with very different sizes.
df['Age categorico'].value_counts()

adulto     345
crianca    319
idoso       50
Name: Age categorico, dtype: int64

### qcut

Gera n intervalos de forma a tentar alocar a mesma quantidade de elementos em cada intervalo.

In [108]:
# Quantile-based binning into 3 groups, reusing the same labels; the result is
# stored as a new column on `df`.
df['qcut'] = pd.qcut(
    x=df['Age'],
    q=3,
    labels=grupos_idade,
)

In [109]:
# Passengers per bin: with qcut the three groups come out close in size.
df['qcut'].value_counts()

crianca    246
idoso      236
adulto     232
Name: qcut, dtype: int64