# Data Manipulation - Pandas and Numpy
### Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
numeros = pd.Series([1,2,3,4,5,6,7,8,9,10,11,12])

In [3]:
numeros.index

RangeIndex(start=0, stop=12, step=1)

In [4]:
meses = ['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho', 'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro']

In [5]:
meses_rs = pd.Series(np.arange(1, 13), index=meses)

In [6]:
meses_rs

Janeiro       1
Fevereiro     2
Março         3
Abril         4
Maio          5
Junho         6
Julho         7
Agosto        8
Setembro      9
Outubro      10
Novembro     11
Dezembro     12
dtype: int64

In [7]:
meses_rs.index

Index(['Janeiro', 'Fevereiro', 'Março', 'Abril', 'Maio', 'Junho', 'Julho',
       'Agosto', 'Setembro', 'Outubro', 'Novembro', 'Dezembro'],
      dtype='object')

In [8]:
new_meses = pd.Series([meses[x] for x in range(len(meses))], index=np.arange(1, 13))

In [9]:
new_meses

1       Janeiro
2     Fevereiro
3         Março
4         Abril
5          Maio
6         Junho
7         Julho
8        Agosto
9      Setembro
10      Outubro
11     Novembro
12     Dezembro
dtype: object

In [10]:
world_cup = {'Brasil': 5, 'Alemanha': 4, 'Itália': 4, 'Argentina': 2, 'Uruguai': 2, 'Inglaterra': 1, 'Franca': 1}

In [11]:
world_cup

{'Brasil': 5,
 'Alemanha': 4,
 'Itália': 4,
 'Argentina': 2,
 'Uruguai': 2,
 'Inglaterra': 1,
 'Franca': 1}

In [12]:
pd.Series(world_cup)

Brasil        5
Alemanha      4
Itália        4
Argentina     2
Uruguai       2
Inglaterra    1
Franca        1
dtype: int64

In [13]:
world_cup_sr = pd.Series(
    world_cup,
    index = ['Brasil', 'Alemanha', 'Itália', 'Argentina', 'Inglaterra', 'Espanha'],
    name = 'wcSeries')

In [14]:
world_cup_sr

Brasil        5.0
Alemanha      4.0
Itália        4.0
Argentina     2.0
Inglaterra    1.0
Espanha       NaN
Name: wcSeries, dtype: float64

In [15]:
world_cup_sr.Espanha = '1'

In [16]:
world_cup_sr

Brasil        5.0
Alemanha      4.0
Itália        4.0
Argentina     2.0
Inglaterra    1.0
Espanha       1.0
Name: wcSeries, dtype: float64

In [17]:
world_cup_sr['Paraguai'] = 0

In [18]:
world_cup_sr

Brasil        5.0
Alemanha      4.0
Itália        4.0
Argentina     2.0
Inglaterra    1.0
Espanha       1.0
Paraguai      0.0
Name: wcSeries, dtype: float64

In [19]:
path = 'data/Alunos.xlsx'

In [20]:
xls = pd.ExcelFile('dataset/Alunos.xlsx')

In [21]:
xls.sheet_names

['Alunos', 'Notas', 'Trimestre']

In [22]:
alunos = xls.parse('Alunos')
notas = xls.parse('Notas')
trimestre = xls.parse('Trimestre')
alunos.head()

Unnamed: 0,Alunos
0,Ana
1,Clara
2,Célia
3,João
4,Carlos


In [23]:
notas.head()

Unnamed: 0,Alunos,Notas
0,Ana,5.7
1,Clara,9.9
2,Célia,8.0
3,João,9.9
4,Carlos,7.0


In [24]:
trimestre.head()

Unnamed: 0,Alunos,Sobrenome,Mês 01,Mês 02,Mês 03
0,Ana,Santos,5.7,6.8,4.6
1,Clara,Mafra,9.9,7.0,9.0
2,Célia,Tavares,8.0,8.0,9.0
3,João,Nunes,9.9,9.0,8.0
4,Carlos,Guilherme,7.0,7.5,6.0


In [25]:
alunos.Alunos

0        Ana
1      Clara
2      Célia
3       João
4     Carlos
5      Jorge
6     Lilian
7    Antônio
Name: Alunos, dtype: object

In [26]:
trimestre['Mês 01'][0]

5.7

In [27]:
trimestre.index

RangeIndex(start=0, stop=8, step=1)

In [28]:
trimestre.columns

Index(['Alunos', 'Sobrenome', 'Mês 01', 'Mês 02', 'Mês 03'], dtype='object')

In [29]:
'joão' + ' ' + 'Numes'

'joão Numes'

In [30]:
trimestre['Nome Completo'] = trimestre.Alunos + ' ' + trimestre.Sobrenome

In [31]:
trimestre

Unnamed: 0,Alunos,Sobrenome,Mês 01,Mês 02,Mês 03,Nome Completo
0,Ana,Santos,5.7,6.8,4.6,Ana Santos
1,Clara,Mafra,9.9,7.0,9.0,Clara Mafra
2,Célia,Tavares,8.0,8.0,9.0,Célia Tavares
3,João,Nunes,9.9,9.0,8.0,João Nunes
4,Carlos,Guilherme,7.0,7.5,6.0,Carlos Guilherme
5,Jorge,Silva,6.8,7.0,5.0,Jorge Silva
6,Lilian,Vianna,7.8,8.0,8.5,Lilian Vianna
7,Antônio,Costa,7.0,6.5,6.0,Antônio Costa


In [32]:
diarias = pd.read_csv('dataset/Diarias.csv', sep='\t', encoding='latin')

In [33]:
diarias.describe(include=['object'])

Unnamed: 0,Nome Órgão Superior,Nome Órgão Subordinado,Nome Unidade Gestora,Nome Função,Nome Subunção,Nome Programa,Código Ação,Nome Ação,Linguagem Cidadã,CPF Favorecido,Nome Favorecido,Documento Pagamento,Data Pagamento,Valor Pagamento
count,38745,38745,38745,38745,38745,38745,38745,38745,11831,38745,38745,38745,38745,38745
unique,22,224,1454,25,69,76,308,308,26,22994,26160,3034,18,6969
top,MINISTERIO DA EDUCACAO,INSTITUTO NACIONAL DO SEGURO SOCIAL,DIVISAO DE DIARIAS E PASSAGENS DA AGU,Educação,Administração Geral,Educação de qualidade para todos,2000,Administração da Unidade,Administração de unidade,***.018.44*-**,SILVIA OLINDA BIAGI FERRARI,2017OB800072,16/02/2017,6768
freq,10048,3763,663,10045,7900,9511,6668,6668,6668,39,38,147,3946,5005


In [34]:
type(diarias)

pandas.core.frame.DataFrame

In [35]:
diarias.shape

(38745, 21)

In [36]:
diarias.dtypes

Código Órgão Superior        int64
Nome Órgão Superior         object
Código Órgão Subordinado     int64
Nome Órgão Subordinado      object
Código Unidade Gestora       int64
Nome Unidade Gestora        object
Código Função                int64
Nome Função                 object
Código Subfunção             int64
Nome Subunção               object
Código Programa              int64
Nome Programa               object
Código Ação                 object
Nome Ação                   object
Linguagem Cidadã            object
CPF Favorecido              object
Nome Favorecido             object
Documento Pagamento         object
Gestão Pagamento             int64
Data Pagamento              object
Valor Pagamento             object
dtype: object

In [37]:
filmes = pd.read_csv('dataset/movies.csv')

In [38]:
filmes.head()

Unnamed: 0,star rating,title,content rating,genre,duration,actors list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [39]:
filmes.describe()

Unnamed: 0,star rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [40]:
filmes.shape

(979, 6)

In [41]:
filmes.dtypes

star rating       float64
title              object
content rating     object
genre              object
duration            int64
actors list        object
dtype: object

In [42]:
filmes.describe(include=[object])

Unnamed: 0,title,content rating,genre,actors list
count,979,976,979,979
unique,975,12,16,969
top,Les Miserables,R,Drama,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,2,460,278,6


In [43]:
filmes.describe()

Unnamed: 0,star rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [44]:
filmes.columns

Index(['star rating', 'title', 'content rating', 'genre', 'duration',
       'actors list'],
      dtype='object')

In [45]:
filmes.columns = 'start_rating', 'title', 'content_rating', 'genre', 'duration', 'actor_list'

In [46]:
filmes.columns

Index(['start_rating', 'title', 'content_rating', 'genre', 'duration',
       'actor_list'],
      dtype='object')

In [47]:
filmes.columns = filmes.columns.str.replace('_', ' ')

In [48]:
filmes.columns

Index(['start rating', 'title', 'content rating', 'genre', 'duration',
       'actor list'],
      dtype='object')

In [49]:
filmes.columns = filmes.columns.str.replace(' ', '_')

In [50]:
filmes.columns

Index(['start_rating', 'title', 'content_rating', 'genre', 'duration',
       'actor_list'],
      dtype='object')

In [51]:
filmes01 = filmes.drop('title', axis=1)

In [52]:
filmes01.head()

Unnamed: 0,start_rating,content_rating,genre,duration,actor_list
0,9.3,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [53]:
filmes01 = filmes.drop(['start_rating', 'content_rating'], axis=1)

In [54]:
filmes01.head()

Unnamed: 0,title,genre,duration,actor_list
0,The Shawshank Redemption,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,The Godfather,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,The Godfather: Part II,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,The Dark Knight,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,Pulp Fiction,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [55]:
filmes.sort_values('start_rating', ascending=False).head()

Unnamed: 0,start_rating,title,content_rating,genre,duration,actor_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [56]:
filmes.sort_values('duration', ascending=False).head()

Unnamed: 0,start_rating,title,content_rating,genre,duration,actor_list
476,7.8,Hamlet,PG-13,Drama,242,"[u'Kenneth Branagh', u'Julie Christie', u'Dere..."
157,8.2,Gone with the Wind,G,Drama,238,"[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit..."
78,8.4,Once Upon a Time in America,R,Crime,229,"[u'Robert De Niro', u'James Woods', u'Elizabet..."
142,8.3,Lagaan: Once Upon a Time in India,PG,Adventure,224,"[u'Aamir Khan', u'Gracy Singh', u'Rachel Shell..."
445,7.9,The Ten Commandments,APPROVED,Adventure,220,"[u'Charlton Heston', u'Yul Brynner', u'Anne Ba..."


In [57]:
film_180 = filmes.duration >= 180

In [58]:
filmes[film_180].head()

Unnamed: 0,start_rating,title,content_rating,genre,duration,actor_list
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."
8,8.9,Schindler's List,R,Biography,195,"[u'Liam Neeson', u'Ralph Fiennes', u'Ben Kings..."
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
40,8.5,The Green Mile,R,Crime,189,"[u'Tom Hanks', u'Michael Clarke Duncan', u'Dav..."


In [59]:
filmes[filmes.duration >= 200]['genre']

2          Crime
7      Adventure
17         Drama
78         Crime
85     Adventure
142    Adventure
157        Drama
204    Adventure
445    Adventure
476        Drama
630    Biography
767       Action
Name: genre, dtype: object

In [60]:
filmes.loc[filmes.duration >= 200, 'genre']

2          Crime
7      Adventure
17         Drama
78         Crime
85     Adventure
142    Adventure
157        Drama
204    Adventure
445    Adventure
476        Drama
630    Biography
767       Action
Name: genre, dtype: object

In [61]:
for item in filmes.title:
    print(item)

The Shawshank Redemption
The Godfather
The Godfather: Part II
The Dark Knight
Pulp Fiction
12 Angry Men
The Good, the Bad and the Ugly
The Lord of the Rings: The Return of the King
Schindler's List
Fight Club
The Lord of the Rings: The Fellowship of the Ring
Inception
Star Wars: Episode V - The Empire Strikes Back
Forrest Gump
The Lord of the Rings: The Two Towers
Interstellar
One Flew Over the Cuckoo's Nest
Seven Samurai
Goodfellas
Star Wars
The Matrix
City of God
It's a Wonderful Life
The Usual Suspects
Se7en
Life Is Beautiful
Once Upon a Time in the West
The Silence of the Lambs
Leon: The Professional
City Lights
Spirited Away
The Intouchables
Casablanca
Whiplash
American History X
Modern Times
Saving Private Ryan
Raiders of the Lost Ark
Rear Window
Psycho
The Green Mile
Sunset Blvd.
The Pianist
The Dark Knight Rises
Gladiator
Terminator 2: Judgment Day
Memento
Taare Zameen Par
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb
The Departed
Cinema Paradiso
Apocalyp

In [62]:
filmes.title

0                               The Shawshank Redemption
1                                          The Godfather
2                                 The Godfather: Part II
3                                        The Dark Knight
4                                           Pulp Fiction
5                                           12 Angry Men
6                         The Good, the Bad and the Ugly
7          The Lord of the Rings: The Return of the King
8                                       Schindler's List
9                                             Fight Club
10     The Lord of the Rings: The Fellowship of the Ring
11                                             Inception
12        Star Wars: Episode V - The Empire Strikes Back
13                                          Forrest Gump
14                 The Lord of the Rings: The Two Towers
15                                          Interstellar
16                       One Flew Over the Cuckoo's Nest
17                             

In [63]:
for indice, linha in filmes.iterrows():
    print(indice, linha.title, " - ", linha.genre, " - ", linha.start_rating)

0 The Shawshank Redemption  -  Crime  -  9.3
1 The Godfather  -  Crime  -  9.2
2 The Godfather: Part II  -  Crime  -  9.1
3 The Dark Knight  -  Action  -  9.0
4 Pulp Fiction  -  Crime  -  8.9
5 12 Angry Men  -  Drama  -  8.9
6 The Good, the Bad and the Ugly  -  Western  -  8.9
7 The Lord of the Rings: The Return of the King  -  Adventure  -  8.9
8 Schindler's List  -  Biography  -  8.9
9 Fight Club  -  Drama  -  8.9
10 The Lord of the Rings: The Fellowship of the Ring  -  Adventure  -  8.8
11 Inception  -  Action  -  8.8
12 Star Wars: Episode V - The Empire Strikes Back  -  Action  -  8.8
13 Forrest Gump  -  Drama  -  8.8
14 The Lord of the Rings: The Two Towers  -  Adventure  -  8.8
15 Interstellar  -  Adventure  -  8.7
16 One Flew Over the Cuckoo's Nest  -  Drama  -  8.7
17 Seven Samurai  -  Drama  -  8.7
18 Goodfellas  -  Biography  -  8.7
19 Star Wars  -  Action  -  8.7
20 The Matrix  -  Action  -  8.7
21 City of God  -  Crime  -  8.7
22 It's a Wonderful Life  -  Drama  -  8.7
23 T

212 Infernal Affairs  -  Crime  -  8.1
213 There Will Be Blood  -  Drama  -  8.1
214 The Grand Budapest Hotel  -  Adventure  -  8.1
215 La Strada  -  Drama  -  8.1
216 Sin City  -  Crime  -  8.1
217 Memories of Murder  -  Crime  -  8.1
218 Donnie Darko  -  Drama  -  8.1
219 Who's Afraid of Virginia Woolf?  -  Drama  -  8.1
220 Gandhi  -  Biography  -  8.1
221 Solaris  -  Drama  -  8.1
222 Harry Potter and the Deathly Hallows: Part 2  -  Adventure  -  8.1
223 Paris, Texas  -  Drama  -  8.1
224 The Wizard of Oz  -  Adventure  -  8.1
225 3-Iron  -  Crime  -  8.1
226 Boyhood  -  Drama  -  8.1
227 Million Dollar Baby  -  Drama  -  8.1
228 The Last Picture Show  -  Drama  -  8.1
229 Strangers on a Train  -  Crime  -  8.1
230 Cat on a Hot Tin Roof  -  Drama  -  8.1
231 La Dolce Vita  -  Comedy  -  8.1
232 Chungking Express  -  Drama  -  8.1
233 The Night of the Hunter  -  Crime  -  8.1
234 La Haine  -  Crime  -  8.1
235 Yip Man  -  Action  -  8.1
236 High Noon  -  Western  -  8.1
237 Notoriou

432 Crash  -  Drama  -  7.9
433 Avatar  -  Action  -  7.9
434 Iron Man  -  Action  -  7.9
435 Do the Right Thing  -  Comedy  -  7.9
436 Carlito's Way  -  Crime  -  7.9
437 Crouching Tiger, Hidden Dragon  -  Action  -  7.9
438 Lilya 4-Ever  -  Crime  -  7.9
439 The Chorus  -  Drama  -  7.9
440 The Boondock Saints  -  Action  -  7.9
441 Miller's Crossing  -  Crime  -  7.9
442 Walk the Line  -  Biography  -  7.9
443 Shrek  -  Animation  -  7.9
444 My Fair Lady  -  Drama  -  7.9
445 The Ten Commandments  -  Adventure  -  7.9
446 The Fighter  -  Biography  -  7.9
447 Captain Phillips  -  Biography  -  7.9
448 Cabaret  -  Drama  -  7.9
449 Halloween  -  Drama  -  7.9
450 My Left Foot  -  Biography  -  7.9
451 Miracle on 34th Street  -  Comedy  -  7.9
452 The Man from Nowhere  -  Action  -  7.9
453 Toki o kakeru shojo  -  Animation  -  7.9
454 Little Miss Sunshine  -  Adventure  -  7.9
455 Taken  -  Action  -  7.9
456 Blue Is the Warmest Color  -  Drama  -  7.9
457 Boogie Nights  -  Drama  - 

670 Sympathy for Lady Vengeance  -  Crime  -  7.7
671 Grindhouse  -  Action  -  7.7
672 In a Better World  -  Drama  -  7.7
673 Blow-Up  -  Drama  -  7.7
674 The Secret World of Arrietty  -  Animation  -  7.7
675 Kiss Kiss Bang Bang  -  Action  -  7.7
676 Lost Highway  -  Drama  -  7.7
677 Zodiac  -  Crime  -  7.7
678 Les Miserables  -  Drama  -  7.7
679 Moulin Rouge!  -  Drama  -  7.7
680 Whale Rider  -  Drama  -  7.7
681 End of Watch  -  Crime  -  7.7
682 Philomena  -  Biography  -  7.7
683 Fury  -  Action  -  7.7
684 The Big Blue  -  Action  -  7.7
685 First Blood  -  Action  -  7.7
686 Minority Report  -  Action  -  7.7
687 Spellbound  -  Film-Noir  -  7.7
688 The Wicker Man  -  Horror  -  7.7
689 Seven Pounds  -  Drama  -  7.7
690 Midnight Express  -  Biography  -  7.7
691 Kelly's Heroes  -  Action  -  7.7
692 MASH  -  Comedy  -  7.7
693 The Player  -  Comedy  -  7.7
694 Traffic  -  Crime  -  7.7
695 Y Tu Mama Tambien  -  Drama  -  7.7
696 Stranger Than Fiction  -  Comedy  -  7.7


897 Calvary  -  Drama  -  7.5
898 The Mission  -  Adventure  -  7.5
899 In the Loop  -  Comedy  -  7.5
900 The Three Burials of Melquiades Estrada  -  Adventure  -  7.5
901 Babel  -  Drama  -  7.5
902 Frenzy  -  Thriller  -  7.5
903 Heavenly Creatures  -  Biography  -  7.5
904 Sweeney Todd: The Demon Barber of Fleet Street  -  Drama  -  7.5
905 Dracula  -  Horror  -  7.5
906 Looper  -  Action  -  7.5
907 The Proposition  -  Crime  -  7.5
908 Bullitt  -  Action  -  7.5
909 Harry Potter and the Sorcerer's Stone  -  Adventure  -  7.5
910 2046  -  Drama  -  7.5
911 Transamerica  -  Adventure  -  7.5
912 Smoke  -  Comedy  -  7.5
913 Suspiria  -  Horror  -  7.5
914 The Judge  -  Drama  -  7.5
915 Bad Education  -  Crime  -  7.5
916 Up in the Air  -  Drama  -  7.5
917 Begin Again  -  Drama  -  7.5
918 Running Scared  -  Action  -  7.5
919 Ghost World  -  Comedy  -  7.5
920 Witness  -  Crime  -  7.5
921 Trading Places  -  Comedy  -  7.5
922 Mud  -  Drama  -  7.5
923 Across the Universe  -  Dra

In [64]:
filmes.head()

Unnamed: 0,start_rating,title,content_rating,genre,duration,actor_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [65]:
filmes.describe()

Unnamed: 0,start_rating,duration
count,979.0,979.0
mean,7.889785,120.979571
std,0.336069,26.21801
min,7.4,64.0
25%,7.6,102.0
50%,7.8,117.0
75%,8.1,134.0
max,9.3,242.0


In [66]:
filmes.describe(include='all')

Unnamed: 0,start_rating,title,content_rating,genre,duration,actor_list
count,979.0,979,976,979,979.0,979
unique,,975,12,16,,969
top,,Les Miserables,R,Drama,,"[u'Daniel Radcliffe', u'Emma Watson', u'Rupert..."
freq,,2,460,278,,6
mean,7.889785,,,,120.979571,
std,0.336069,,,,26.21801,
min,7.4,,,,64.0,
25%,7.6,,,,102.0,
50%,7.8,,,,117.0,
75%,8.1,,,,134.0,


In [67]:
filmes.head()

Unnamed: 0,start_rating,title,content_rating,genre,duration,actor_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [68]:
filmes.columns = filmes.columns.str.replace('actor_list', 'actors_list')

In [69]:
dramas_longos = filmes[(filmes.duration >= 200) & (filmes.genre == 'Drama')]

In [70]:
dramas_longos

Unnamed: 0,start_rating,title,content_rating,genre,duration,actors_list
17,8.7,Seven Samurai,UNRATED,Drama,207,"[u'Toshir\xf4 Mifune', u'Takashi Shimura', u'K..."
157,8.2,Gone with the Wind,G,Drama,238,"[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit..."
476,7.8,Hamlet,PG-13,Drama,242,"[u'Kenneth Branagh', u'Julie Christie', u'Dere..."


In [71]:
file = 'dataset/drinks.csv'

In [72]:
drinks = pd.read_csv(file)

In [73]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

In [74]:
drinks['beer_servings'] = drinks.beer_servings.astype(float)

In [75]:
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
population                       object
continent                        object
dtype: object

## others types 

In [76]:
vendas = pd.read_csv('dataset/Sanduiches.txt', sep='\t')

In [77]:
vendas.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [78]:
vendas.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

In [79]:
sale = vendas

In [80]:
sale.item_price.str.replace('$', '').astype(float).mean()

7.464335785374297

In [81]:
sale.item_name.str.contains('Chicken').head()

0    False
1    False
2    False
3    False
4     True
Name: item_name, dtype: bool

In [82]:
sale.item_name.str.contains('Chicken').head().astype(int)

0    0
1    0
2    0
3    0
4    1
Name: item_name, dtype: int64

In [83]:
sale[sale.item_name.str.contains('Chicken')].head(5)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
11,6,1,Chicken Crispy Tacos,"[Roasted Chili Corn Salsa, [Fajita Vegetables,...",$8.75
12,6,1,Chicken Soft Tacos,"[Roasted Chili Corn Salsa, [Rice, Black Beans,...",$8.75
13,7,1,Chicken Bowl,"[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...",$11.25


In [84]:
sale.item_name.str.contains('Tomato').astype(int).sum()

111

In [85]:
sale.item_name.str.contains('Coke').astype(int).sum()

0

In [None]:
#sale.choice_description.str.contains('Coke').astype(int).sum()

# gera um erro, pois não permite converter NaN "none a number" que é um valor nulo em inteiro
# com isso deve-se tratar os dados NaN

# primeiro vamos entender o que é NaN e coomo fazer em diversos exemplos, sepois voltamos aqui


# Trabalhando com valores nulos NaN

In [87]:
ufo = pd.read_csv('dataset/ufo.csv')

In [88]:
ufo.tail()      #  os dados de 'Colors Reported' e 'Shape Reported' estão Nulos, o que fazer?

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45
18240,Ybor,,OVAL,FL,12/31/2000 23:59


In [89]:
ufo.isnull().tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,False,True,False,False,False
18237,False,True,False,False,False
18238,False,True,True,False,False
18239,False,False,False,False,False
18240,False,True,False,False,False


In [91]:
ufo.isnull().sum()

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [92]:
ufo[ufo.City.isnull()].head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,,LA,8/15/1943 0:00
22,,,LIGHT,LA,8/15/1943 0:00
204,,,DISK,CA,7/15/1952 12:30
241,,BLUE,DISK,MT,7/4/1953 14:00
613,,,DISK,NV,7/1/1960 12:00


In [97]:
a = ufo.shape

In [98]:
a

(18241, 5)

In [99]:
b = ufo.dropna(how='any').shape # any se qualquer coluna for nula

In [100]:
b

(2486, 5)

In [101]:
ufo.dropna(how='all').shape # se todas as colunas forem nulas

(18241, 5)

In [103]:
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape

(15576, 5)

In [104]:
ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape

(18237, 5)

In [107]:
ufo['Shape Reported'].value_counts().head() # não conta NaN

LIGHT       2803
DISK        2122
TRIANGLE    1889
OTHER       1402
CIRCLE      1365
Name: Shape Reported, dtype: int64

In [108]:
ufo['Shape Reported'].value_counts(dropna=False).head() # dropna inclui NaN

LIGHT       2803
NaN         2644
DISK        2122
TRIANGLE    1889
OTHER       1402
Name: Shape Reported, dtype: int64

In [110]:
ufo['Shape Reported'].fillna(value='Diversos', inplace=True)

In [114]:
ufo['Shape Reported'].value_counts(dropna=False)

LIGHT        2803
Diversos     2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
ROUND           2
CRESCENT        2
PYRAMID         1
FLARE           1
HEXAGON         1
DOME            1
Name: Shape Reported, dtype: int64

# voltando para o sanduiche.txt aonde tem valores nulos

In [115]:
sale.item_name.str.contains('Coke').astype(int).sum()

0

In [119]:
sale.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [139]:
sale.shape

(4622, 5)

In [135]:
order = sale.dropna(subset=['choice_description'], how='any')

In [136]:
order.shape

(3376, 5)

In [137]:
order.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75


In [140]:
coke = order['choice_description'].str.contains('Coke').sum()

In [143]:
print(f'Existem {coke} pedidos que tem Coke')

Existem 257 pedidos que tem Coke


## quantos pratos tem Guacamole ?

In [158]:
order.choice_description.str.contains('guacamole').sum()

0

In [159]:
order.choice_description.str.contains('Guacamole').sum()

1037

In [164]:
order.choice_description.str.contains('Rice').sum()

2402