In [1]:
import pandas as pd
import numpy as np

## Introdução ao pandas

### Séries

In [8]:
# A Series is a one-dimensional array holding a sequence of values
# together with an array of associated labels - called the index!
x = pd.Series(data=[12, 13, 1])

In [9]:
# Echo the Series: each value is shown next to its index label, then the dtype.
x

0    12
1    13
2     1
dtype: int64

In [10]:
# `.values` exposes the data held by the Series; printing its type shows which
# container is used, and the bare expression then displays the contents.
print(type(x.values))
x.values

<class 'numpy.ndarray'>


array([12, 13,  1])

In [11]:
# `.index` holds the labels associated with the values of the Series.
x.index

RangeIndex(start=0, stop=3, step=1)

In [13]:
# Explicit labels can be supplied through the `index` argument
# instead of relying on the default integer positions.
x = pd.Series(data=[1, 2, 3], index=list("abc"))
x

a    1
b    2
c    3
dtype: int64

In [14]:
# Boolean mask selection: keep only the entries whose value is greater than 1.
x[x>1]

b    2
c    3
dtype: int64

### DataFrame

In [16]:
# DataFrames represent rectangular tables.
# They are collections of Series that share the **same** index.
df = pd.DataFrame(data=dict(hello=[1, 2, 3], bye=[4, 5, 6]))


In [17]:
# Display the DataFrame built in the previous cell.
df

Unnamed: 0,hello,bye
0,1,4
1,2,5
2,3,6


In [18]:
# Two Series whose index labels do not overlap at all.
x = pd.Series(data=[1, 2, 3], index=list("abc"))
y = pd.Series(data=[1, 2, 3], index=list("xyz"))
# Each Series becomes a column; rows are matched by index label.
df = pd.DataFrame(data={"col1": x, "col2": y})

In [19]:
# This is somewhat unexpected when coming from R - tidyverse functions
# generally discard the 'indexes' or names of vectors.
df 

Unnamed: 0,col1,col2
a,1.0,
b,2.0,
c,3.0,
x,,1.0
y,,2.0
z,,3.0


### Importação dos dados

#### Lendo arquivos

In [20]:
# Read the CSV file into a DataFrame (the path is relative to the working directory).
df = pd.read_csv("../dados/imdb.csv")

In [23]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28490 entries, 0 to 28489
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_filme              28490 non-null  object 
 1   titulo                28490 non-null  object 
 2   ano                   28489 non-null  float64
 3   data_lancamento       28490 non-null  object 
 4   generos               28490 non-null  object 
 5   duracao               28490 non-null  int64  
 6   pais                  28490 non-null  object 
 7   idioma                28146 non-null  object 
 8   orcamento             10470 non-null  float64
 9   receita               7698 non-null   float64
 10  receita_eua           7556 non-null   float64
 11  nota_imdb             28490 non-null  float64
 12  num_avaliacoes        28490 non-null  int64  
 13  direcao               28457 non-null  object 
 14  roteiro               28289 non-null  object 
 15  producao           

In [25]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.1/242.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10


In [26]:
# Read an Excel workbook into a DataFrame; pandas uses the openpyxl engine
# for .xlsx files, hence the install in the previous cell.
df = pd.read_excel("../dados/imdb.xlsx")

In [27]:
# Inspect the structure of the frame loaded from the Excel file.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28490 entries, 0 to 28489
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_filme              28490 non-null  object 
 1   titulo                28490 non-null  object 
 2   ano                   28489 non-null  float64
 3   data_lancamento       28490 non-null  object 
 4   generos               28490 non-null  object 
 5   duracao               28490 non-null  int64  
 6   pais                  28490 non-null  object 
 7   idioma                28146 non-null  object 
 8   orcamento             10470 non-null  float64
 9   receita               7698 non-null   float64
 10  receita_eua           7556 non-null   float64
 11  nota_imdb             28490 non-null  float64
 12  num_avaliacoes        28490 non-null  int64  
 13  direcao               28457 non-null  object 
 14  roteiro               28289 non-null  object 
 15  producao           

In [None]:
# Many other formats can be read as well. See, for example, the table here:
# https://wesmckinney.com/book/accessing-data.html#io_flat_files

In [28]:
# Exercise:
# Load the data from the file dados/imdb.sas7bdat


#### Escrevendo

In [31]:
# Write the DataFrame to CSV; index=False leaves the row index out of the file.
df.to_csv("../dados/x.csv", index=False)

In [33]:
# Serialise the DataFrame to a pickle file.
# NOTE: only unpickle files from sources you trust - loading a pickle can run arbitrary code.
df.to_pickle("../dados/x.pickle")

### Manipulação de dados

In [None]:
# Let's approach data manipulation by drawing a parallel with dplyr.
# dplyr: 6 main verbs
# select()    # selects columns of the data.frame
# arrange()   # reorders the rows of the data.frame
# filter()    # filters rows of the data.frame
# mutate()    # creates new columns in the data.frame (or updates existing columns)
# summarise() + group_by() # summarises the data.frame
# left_join   # joins two data.frames

#### select

In [34]:
# Read the CSV file into `df`, the DataFrame used by the examples in this section.
df = pd.read_csv("../dados/imdb.csv")

In [37]:
# Indexing with a list of column names returns a DataFrame with just those columns.
df[["titulo"]]

Unnamed: 0,titulo
0,Prestige
1,Nob Hill
2,The Shade
3,Viewer Discretion Advised
4,Broadcast News
...,...
28485,Jumanji: The Next Level
28486,Monster on the Campus
28487,The Ward
28488,A Game of Death


In [39]:
# `filter` operates on the labels of the DataFrame, which can be either the
# row index or the columns. By default it selects among the columns.
df.filter(["titulo"])

Unnamed: 0,titulo
0,Prestige
1,Nob Hill
2,The Shade
3,Viewer Discretion Advised
4,Broadcast News
...,...
28485,Jumanji: The Next Level
28486,Monster on the Campus
28487,The Ward
28488,A Game of Death


In [47]:
# Several columns can be selected at once by listing their names.
df.filter(["titulo", "ano"])

Unnamed: 0,titulo,ano
0,Prestige,1931.0
1,Nob Hill,1945.0
2,The Shade,1999.0
3,Viewer Discretion Advised,1998.0
4,Broadcast News,1987.0
...,...,...
28485,Jumanji: The Next Level,2019.0
28486,Monster on the Campus,1958.0
28487,The Ward,2010.0
28488,A Game of Death,1945.0


In [50]:
# Unlike in dplyr, columns cannot be removed with `filter`.
# Use `drop` for that; here it returns the frame without the listed column.
df.drop(columns=["ano"])

Unnamed: 0,id_filme,titulo,data_lancamento,generos,duracao,pais,idioma,orcamento,receita,receita_eua,nota_imdb,num_avaliacoes,direcao,roteiro,producao,elenco,descricao,num_criticas_publico,num_criticas_critica
0,tt0023352,Prestige,1932-01-22,"Adventure, Drama",71,USA,English,,,,5.7,240,Tay Garnett,"Harry Hervey, Tay Garnett",RKO Pathé Pictures,"Ann Harding, Adolphe Menjou, Melvyn Douglas, I...",A woman travels to a French penal colony in In...,12.0,2.0
1,tt0037946,Nob Hill,1945-11-15,"Drama, Musical",95,USA,English,,,,6.3,246,Henry Hathaway,"Wanda Tuchock, Norman Reilly Raine",Twentieth Century Fox,"George Raft, Joan Bennett, Vivian Blaine, Pegg...",The owner of a San Francisco saloon yearns to ...,11.0,2.0
2,tt0216204,The Shade,2000-03-01,Drama,83,USA,English,400000.0,,,7.1,102,Raphaël Nadjari,"Fyodor Dostoevsky, Raphaël Nadjari",Filmaker,"Richard Edson, Lorie Marino, Jeff Ware, Barbar...",,1.0,1.0
3,tt0171889,Viewer Discretion Advised,2012-05-01,"Comedy, Horror",105,USA,English,,,,3.4,111,"Eddie Beverly Jr., Tommy Blaze","Tommy Blaze, Philip Morton",Troma Entertainment,"Ken Donovan, Philip Morton, Caroline Jett, Tom...",Ted Smith becomes trapped in the TV shows he w...,5.0,3.0
4,tt0092699,Broadcast News,1988-04-01,"Comedy, Drama, Romance",133,USA,"English, Spanish, French, German",20000000.0,67331309.0,51249404.0,7.2,26257,James L. Brooks,James L. Brooks,Amercent Films,"William Hurt, Albert Brooks, Holly Hunter, Rob...",Take two rival television reporters: one hands...,142.0,62.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28485,tt7975244,Jumanji: The Next Level,2019-12-25,"Action, Adventure, Comedy",123,USA,English,125000000.0,796575993.0,316831246.0,6.7,168698,Jake Kasdan,"Jake Kasdan, Jeff Pinkner",Matt Tolmach Productions,"Dwayne Johnson, Kevin Hart, Jack Black, Karen ...","In Jumanji: The Next Level, the gang is back b...",940.0,241.0
28486,tt0051948,Monster on the Campus,1958-12-17,"Horror, Sci-Fi",77,USA,English,,,,5.8,1517,Jack Arnold,David Duncan,Universal International Pictures (UI),"Arthur Franz, Joanna Moore, Judson Pratt, Nanc...",The blood of a primitive fish exposed to gamma...,49.0,38.0
28487,tt1369706,The Ward,2011-04-01,"Horror, Mystery, Thriller",89,USA,English,10000000.0,5343820.0,,5.6,39789,John Carpenter,"Michael Rasmussen, Shawn Rasmussen",FilmNation Entertainment,"Amber Heard, Mamie Gummer, Danielle Panabaker,...",An institutionalized young woman becomes terro...,190.0,263.0
28488,tt0038549,A Game of Death,1946-06-07,Adventure,72,USA,English,,,,5.8,302,Robert Wise,"Richard Connell, Norman Houston",RKO Radio Pictures,"John Loder, Audrey Long, Edgar Barrier, Russel...",A remake of Richard Connell's famous short sto...,11.0,10.0


In [51]:
# Since there is no non-standard evaluation, you can always put
# the names in a list and filter with it afterwards.
colunas = ["titulo", "ano"]
df.filter(colunas)

Unnamed: 0,titulo,ano
0,Prestige,1931.0
1,Nob Hill,1945.0
2,The Shade,1999.0
3,Viewer Discretion Advised,1998.0
4,Broadcast News,1987.0
...,...,...
28485,Jumanji: The Next Level,2019.0
28486,Monster on the Campus,1958.0
28487,The Ward,2010.0
28488,A Game of Death,1945.0


#### arrange

In [53]:
# Order the rows by the `ano` column (ascending is the default),
# then keep only that column for display.
(
    df
    .sort_values(by=["ano"])
    .filter(items=["ano"])
)

Unnamed: 0,ano
12371,1894.0
8008,1912.0
18843,1912.0
20242,1913.0
3723,1914.0
...,...
26345,2020.0
14065,2020.0
7813,2020.0
5659,2020.0


In [54]:
# Same ordering by `ano`, but from the largest value to the smallest,
# again keeping only that column for display.
(
    df
    .sort_values(by=["ano"], ascending=False)
    .filter(items=["ano"])
)

Unnamed: 0,ano
28006,2020.0
18413,2020.0
15465,2020.0
19622,2020.0
3430,2020.0
...,...
20242,1913.0
18843,1912.0
8008,1912.0
12371,1894.0


#### filter

In [58]:
# Keep only the rows where `ano` equals 2020.
df.query("ano == 2020") # a bit odd because the condition is written inside a string...

Unnamed: 0,id_filme,titulo,ano,data_lancamento,generos,duracao,pais,idioma,orcamento,receita,receita_eua,nota_imdb,num_avaliacoes,direcao,roteiro,producao,elenco,descricao,num_criticas_publico,num_criticas_critica
238,tt8430598,Shirley,2020.0,2020-06-05,"Biography, Drama, Thriller",107,USA,English,,75911.0,,6.2,4612,Josephine Decker,"Sarah Gubbins, Susan Scarf Merrell",Los Angeles Media Fund (LAMF),"Elisabeth Moss, Odessa Young, Michael Stuhlbar...",A famous horror writer finds inspiration for h...,75.0,98.0
301,tt7713068,Birds of Prey: And the Fantabulous Emancipatio...,2020.0,2020-02-06,"Action, Adventure, Crime",109,USA,"English, Chinese",84500000.0,201858461.0,84158461.0,6.1,137373,Cathy Yan,"Christina Hodson, Paul Dini",Clubhouse Pictures (II),"Margot Robbie, Rosie Perez, Mary Elizabeth Win...","After splitting with the Joker, Harley Quinn j...",2222.0,372.0
491,tt12158538,Cry Havoc,2020.0,2020-05-05,Horror,85,USA,English,4000000.0,,,3.2,136,Rene Perez,Rene Perez,Samera Entertainment,"J.D. Angstadt, Robert Bronzi, Spring Inés Peña...",A rogue police officer takes on a serial kille...,9.0,23.0


In [60]:
# Same row selection, expressed with a boolean mask.
df[df["ano"] == 2020] # very similar to base R

Unnamed: 0,id_filme,titulo,ano,data_lancamento,generos,duracao,pais,idioma,orcamento,receita,receita_eua,nota_imdb,num_avaliacoes,direcao,roteiro,producao,elenco,descricao,num_criticas_publico,num_criticas_critica
238,tt8430598,Shirley,2020.0,2020-06-05,"Biography, Drama, Thriller",107,USA,English,,75911.0,,6.2,4612,Josephine Decker,"Sarah Gubbins, Susan Scarf Merrell",Los Angeles Media Fund (LAMF),"Elisabeth Moss, Odessa Young, Michael Stuhlbar...",A famous horror writer finds inspiration for h...,75.0,98.0
301,tt7713068,Birds of Prey: And the Fantabulous Emancipatio...,2020.0,2020-02-06,"Action, Adventure, Crime",109,USA,"English, Chinese",84500000.0,201858461.0,84158461.0,6.1,137373,Cathy Yan,"Christina Hodson, Paul Dini",Clubhouse Pictures (II),"Margot Robbie, Rosie Perez, Mary Elizabeth Win...","After splitting with the Joker, Harley Quinn j...",2222.0,372.0
491,tt12158538,Cry Havoc,2020.0,2020-05-05,Horror,85,USA,English,4000000.0,,,3.2,136,Rene Perez,Rene Perez,Samera Entertainment,"J.D. Angstadt, Robert Bronzi, Spring Inés Peña...",A rogue police officer takes on a serial kille...,9.0,23.0
550,tt5761986,The Orchard,2020.0,2020-03-24,"Crime, Horror, Thriller",81,USA,English,2500000.0,,,3.3,330,Michael Caissie,Michael Caissie,BondIt Media Capital,"Katrina Bowden, Jay Mohr, Will Carlson, Spence...",A sheriff makes a strange discovery when he ge...,16.0,11.0
841,tt8171000,Equal Standard,2020.0,2020-05-14,"Action, Crime, Drama",101,USA,English,,,,5.6,104,Brendan Kyle Cochrane,Taheim Bryan,Digital Seven,"Ice-T, Fredro Starr, Jules Willcox, Robert Clo...",New York City police officers lives collide in...,12.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27983,tt1964995,Penance Lane,2020.0,2020-04-21,Horror,84,USA,,,,,4.8,828,Péter Engert,"Renae Geerlings, Matt Granger",Mane Entertainment,"Tyler Mane, Scout Taylor-Compton, John Schneid...",A hardened criminal fresh out of the joint tak...,33.0,19.0
28006,tt2573372,Becoming,2020.0,2020-03-06,"Drama, Horror, Sci-Fi",98,USA,English,,14285.0,,4.6,650,Omar Naim,Omar Naim,Traverse Media,"Toby Kebbell, Penelope Mitchell, Jeff Daniel P...",A young woman learns her fiance has become pos...,14.0,15.0
28126,tt11766318,Homeward,2020.0,2020-02-25,"Animation, Adventure, Family",81,USA,English,,,,2.5,213,Michael Johnson,"Aaron Witlin, David Michael Latt",The Asylum,"Joey Lawrence, James Cullen Bressack, Kim Litt...",A boastful elf and his prank-happy orc brother...,13.0,1.0
28175,tt11127256,Choke,2020.0,2020-05-10,"Horror, Thriller",73,USA,,,,,5.7,2179,Gregory Hatanaka,Gregory Hatanaka,CineRidge Entertainment,"Shane Ryan, Scott Butler, Sarah Brine, Lisa Lo...","The lines between reality and fiction, and goo...",30.0,21.0
