## Pandas Implementation on DataFrame - 3

In [139]:
# importing libraries

import pandas as pd

### The `.drop_duplicates()` method
It drops a row from the DataFrame when every cell value in that row is identical to the values of another row; rows that are not exact duplicates are kept unchanged.

In [6]:
# importing dataset

# Read the CSV (parsing the two date columns), convert the dtypes of
# 'Senior Management' and 'Gender', and order the rows by first name,
# all as a single chained expression.
employees = (
    pd.read_csv(
        'pandas datasets/employees.csv',
        parse_dates=['Start Date', 'Last Login Time'],
    )
    .astype({'Senior Management': "bool", 'Gender': "category"})
    .sort_values('First Name')
)

In [7]:
# checking head of dataset

employees.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-28 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-04-28 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-04-28 14:53:00,52119,11.343,True,Client Services


In [8]:
# number of rows in the DataFrame

len(employees)

1000

In [9]:
# let's try to drop duplicates from data frame.
# the length is same because no rows have all the identical values.

len(employees.drop_duplicates())

1000

In [13]:
# now let's drop duplicates based on the subset parameter of the drop_duplicates method.

employees = employees.drop_duplicates(subset=['First Name'],keep = "first")
employees.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-28 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2021-04-28 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2021-04-28 03:54:00,111786,3.592,True,Engineering


In [14]:
# NOTE: `employees` was already reduced to one row per 'First Name' by the
# previous cell (subset=['First Name'], keep="first"), so keep=False finds no
# remaining duplicates here and the result is identical to the one above.
# On a frame that still contains repeated names, keep=False drops every row
# whose 'First Name' occurs more than once.
employees = employees.drop_duplicates(subset=['First Name'],keep = False)
employees.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-28 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2021-04-28 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2021-04-28 03:54:00,111786,3.592,True,Engineering


### The `.unique()` and `.nunique()` functions
`.unique()` returns the distinct values of a Series, and `.nunique()` returns how many distinct values there are.

In [18]:
# Show the distinct values of the Gender and Team columns.
# Both lookups read from `employees`, the frame loaded at the top of this section.
print("-----Gender-----")
print(employees['Gender'].unique())
print("-----Team-------")
print(employees['Team'].unique())

-----Gender-----
['Male', 'Female', NaN]
Categories (2, object): ['Male', 'Female']
-----Team-------
['Marketing' 'Distribution' 'Engineering' 'Human Resources'
 'Client Services' 'Business Development' 'Sales' 'Product' 'Legal'
 'Finance' nan]


In [19]:
# If we want the number of unique values present in a column we can use the nunique method.
# Notice that it returns 2 whereas unique() returned 3 values; this is because nunique does not count NaN by default (dropna=True).

employees['Gender'].nunique()

2

### The `.set_index()` and `.reset_index()` method

In [24]:
# importing dataset

james_bond = pd.read_csv('pandas datasets/jamesbond.csv')

In [21]:
# Dataset top values

james_bond.head()

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
3,Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [23]:
# if we want to make Film as our index column we can do this by using index_col parameter

james_bond = pd.read_csv('pandas datasets/jamesbond.csv',index_col = 'Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [29]:
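# if the data was loaded without the index_col parameter, set_index can do the same job.
# Note that set_index returns a new DataFrame: without assigning the result (or passing inplace=True) james_bond is unchanged, as the output shows.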

james_bond = pd.read_csv('pandas datasets/jamesbond.csv')
james_bond.set_index(keys = 'Film')
james_bond.head(3)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [30]:
# Let's save this by using inplace parameter

james_bond.set_index(keys = 'Film',inplace = True)
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [34]:
# To revert this and get back the original layout of the DataFrame, we can use reset_index.

james_bond.reset_index(inplace = True)

In [37]:
# Once the Film column is set as the index, setting another column as the index of that DataFrame
# discards the Film index. Let's see this in an implementation.

james_bond = pd.read_csv('pandas datasets/jamesbond.csv')
james_bond.set_index('Film',inplace = True)
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [39]:
# let's try to make Year the index instead.
# Notice in the output that we lost the Film column, because set_index replaces (and by default discards) the current index.
# To avoid this, first reset the index and then set the Year column as the index.

james_bond.set_index(keys = "Year").head()

Unnamed: 0_level_0,Actor,Director,Box Office,Budget,Bond Actor Salary
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1962,Sean Connery,Terence Young,448.8,7.0,0.6
1963,Sean Connery,Terence Young,543.8,12.6,1.6
1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
1965,Sean Connery,Terence Young,848.1,41.9,4.7
1967,David Niven,Ken Hughes,315.0,85.0,


### The `.loc[]` method

In [42]:
james_bond = pd.read_csv('pandas datasets/jamesbond.csv',index_col = 'Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [49]:
# Let's sort the DataFrame using the sort_index method

james_bond.sort_index(inplace=True)
james_bond.head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9


In [48]:
# we'll use loc to extract values from row as series

james_bond.loc['A View to a Kill']

Year                        1985
Actor                Roger Moore
Director               John Glen
Box Office                 275.2
Budget                      54.5
Bond Actor Salary            9.1
Name: A View to a Kill, dtype: object

In [50]:
# If we want to select values from A View to a kill to Casino Royale

james_bond.loc['A View to a Kill':'Casino Royale']

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [51]:
james_bond.loc['A View to a Kill':'Casino Royale':2]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [52]:
# If we want to extract everything from Casino Royale to the last row we can use

james_bond.loc['Casino Royale':]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9


In [53]:
# If we want to extract everything from the beginning up to Spectre

james_bond.loc[:'Spectre']

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


### The `iloc[]` accessor
Extract the row values by index position

In [70]:
james_bond = pd.read_csv('pandas datasets/jamesbond.csv')
james_bond.head(3)

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
0,Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
1,From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
2,Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [71]:
james_bond.iloc[0]

Film                        Dr. No
Year                          1962
Actor                 Sean Connery
Director             Terence Young
Box Office                   448.8
Budget                         7.0
Bond Actor Salary              0.6
Name: 0, dtype: object

In [72]:
james_bond.iloc[15]

Film                 A View to a Kill
Year                             1985
Actor                     Roger Moore
Director                    John Glen
Box Office                      275.2
Budget                           54.5
Bond Actor Salary                 9.1
Name: 15, dtype: object

In [73]:
# access the rows at positions 4 through 7
# with integer positions the upper bound of the slice is not included in the result.
# notice that the output stops at position 7 even though 8 is given as the upper limit.
# This differs from the .loc[] accessor, where the upper limit is included.

james_bond.iloc[4:8]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
4,Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
5,You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
6,On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
7,Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8


In [74]:
# 20 till end

james_bond.iloc[20:]

Unnamed: 0,Film,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
20,The World Is Not Enough,1999,Pierce Brosnan,Michael Apted,439.5,158.3,13.5
21,Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
22,Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
23,Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
24,Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
25,Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


### The second argument of `loc` and `iloc` method

In [80]:
james_bond = pd.read_csv('pandas datasets/jamesbond.csv',index_col = 'Film')

In [81]:
james_bond.loc["Moonraker",'Actor']

'Roger Moore'

In [82]:
james_bond.loc["Moonraker",'Director']

'Lewis Gilbert'

In [84]:
james_bond.loc["Moonraker",['Director','Box Office']]

Director      Lewis Gilbert
Box Office            535.0
Name: Moonraker, dtype: object

In [86]:
# iloc

james_bond.iloc[14,2]

'John Glen'

In [87]:
james_bond.iloc[:7,:2]

Unnamed: 0_level_0,Year,Actor
Film,Unnamed: 1_level_1,Unnamed: 2_level_1
Dr. No,1962,Sean Connery
From Russia with Love,1963,Sean Connery
Goldfinger,1964,Sean Connery
Thunderball,1965,Sean Connery
Casino Royale,1967,David Niven
You Only Live Twice,1967,Sean Connery
On Her Majesty's Secret Service,1969,George Lazenby


### Set new value to specific cell

In [90]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [91]:
# Let's change the Actor value for the Goldfinger row

james_bond.loc['Goldfinger','Actor'] = "Legend Sean Connery"

In [92]:
# now let's extract the value

james_bond.loc['Goldfinger','Actor'] 

'Legend Sean Connery'

### Set multiple values in dataframe

In [93]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [95]:
# if we want to set all the values of Sean Connery to Legend Sean Connery at once we can use boolean way to achieve this.

mask = james_bond['Actor'] == 'Sean Connery'

In [98]:
# assigning the value

james_bond.loc[mask,'Actor'] = "Legend Sean Connery"

In [99]:
# return output

james_bond.head()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Legend Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Legend Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Legend Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965,Legend Sean Connery,Terence Young,848.1,41.9,4.7
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


### `Rename index` label or `column` in dataframe

In [100]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [104]:
# Renaming index

james_bond.rename(mapper={'GoldenEye':'Golden Eye'})
james_bond.rename(mapper={'GoldenEye':'Golden Eye'},axis = 0)
james_bond.rename(mapper={'GoldenEye':'Golden Eye'},axis = 'index')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,


In [107]:
# Renaming column label

james_bond.rename(mapper={"Box Office":"Revenue"},axis = 1).head(3)

Unnamed: 0_level_0,Year,Actor,Director,Revenue,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [109]:
# overwrite all column names at once by assigning a new list to .columns

james_bond.columns = ['Year', 'Actor', 'Director', 'Revenue', 'Money','Salary']

In [110]:
# recheck the column names

james_bond.columns

Index(['Year', 'Actor', 'Director', 'Revenue', 'Money', 'Salary'], dtype='object')

### `Delete` `rows` and `column` from dataframe

In [111]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [115]:
# Let's delete values from row
# we can see that Goldfinger row is missing

james_bond.drop('Goldfinger').head(5)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4


In [116]:
# drop multiple rows at a time

james_bond.drop(['Thunderball','Casino Royale','You Only Live Twice'])

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,
The Spy Who Loved Me,1977,Roger Moore,Lewis Gilbert,533.0,45.1,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,


In [118]:
# Dropping column

james_bond.drop("Box Office",axis = 1).head(3)

Unnamed: 0_level_0,Year,Actor,Director,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dr. No,1962,Sean Connery,Terence Young,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,18.6,3.2


In [120]:
# remove multiple column at once

james_bond.drop(['Box Office','Director'],axis = 1).head(3)

Unnamed: 0_level_0,Year,Actor,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dr. No,1962,Sean Connery,7.0,0.6
From Russia with Love,1963,Sean Connery,12.6,1.6
Goldfinger,1964,Sean Connery,18.6,3.2


In [123]:
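# remove a column using pop(); it returns the removed column and mutates james_bond in place,
# so re-running this cell raises KeyError because 'Actor' is already gone (as in the recorded output).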

james_bond.pop('Actor')

KeyError: 'Actor'

In [124]:
# it removed actor column

james_bond.head(3)

Unnamed: 0_level_0,Year,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dr. No,1962,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Guy Hamilton,820.4,18.6,3.2


In [125]:
# removing column using del keyword

del james_bond['Director']
james_bond.head(3)

Unnamed: 0_level_0,Year,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dr. No,1962,448.8,7.0,0.6
From Russia with Love,1963,543.8,12.6,1.6
Goldfinger,1964,820.4,18.6,3.2


### Create `Random` Sample
If we want to draw one or more rows (or columns) at random from a DataFrame, we use the `sample` method.

In [126]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [127]:
# sample method

james_bond.sample()

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6


In [128]:
# sample method

james_bond.sample(n = 3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Die Another Day,2002,Pierce Brosnan,Lee Tamahori,465.4,154.2,17.9
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,


In [129]:
# to extract 25% random rows from data

james_bond.sample(frac= 0.25)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tomorrow Never Dies,1997,Pierce Brosnan,Roger Spottiswoode,463.2,133.9,10.0
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


In [131]:
# Extract random columns using sample

james_bond.sample(axis=1).head(3)

Unnamed: 0_level_0,Bond Actor Salary
Film,Unnamed: 1_level_1
Dr. No,0.6
From Russia with Love,1.6
Goldfinger,3.2


### use of `.nsmallest()` and `.nlargest()` method

In [132]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [133]:
# if we want to check the largest value in Box office column

james_bond.nlargest(n = 1,columns="Box Office")

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5


In [134]:
# if we want to check the smallest value in Box office column

james_bond.nsmallest(n = 1,columns="Box Office")

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9


In [136]:
james_bond['Box Office'].nlargest()

Film
Skyfall          943.5
Thunderball      848.1
Goldfinger       820.4
Spectre          726.7
Casino Royale    581.5
Name: Box Office, dtype: float64

### filtering with `where` Method

In [137]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [138]:
# where() keeps the rows where Actor is Sean Connery and fills every other row with NaN

james_bond.where(james_bond['Actor'] == 'Sean Connery')

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962.0,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963.0,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965.0,Sean Connery,Terence Young,848.1,41.9,4.7
Casino Royale,,,,,,
You Only Live Twice,1967.0,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
On Her Majesty's Secret Service,,,,,,
Diamonds Are Forever,1971.0,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Live and Let Die,,,,,,
The Man with the Golden Gun,,,,,,


### The `.query()` Method
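> Column names that contain spaces cannot be referenced directly in a query expression; either rename them (as done below) or wrap them in backticks, e.g. `` `Box Office` > 600 ``.
<br>The query expression itself must be passed as a string.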

In [140]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [142]:
# first replace spaces with underscore

# Swap every space in the column labels for an underscore using the
# vectorised string accessor on the column Index.
james_bond.columns = james_bond.columns.str.replace(" ", "_")

In [143]:
# recheck head of dataset

james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [144]:
# let's fetch rows with actor Sean connery

james_bond.query('Actor == "Sean Connery"')

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,


In [145]:
# let's query for "not equal to"

james_bond.query("Actor != 'Sean Connery'")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,
The Spy Who Loved Me,1977,Roger Moore,Lewis Gilbert,533.0,45.1,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
The Living Daylights,1987,Timothy Dalton,John Glen,313.5,68.8,5.2


In [146]:
# for greater than

james_bond.query("Box_Office > 600")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5
Spectre,2015,Daniel Craig,Sam Mendes,726.7,206.3,


In [148]:
# using multiple conditions

james_bond.query("Actor == 'Sean Connery' and Director == 'Terence Young'")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7


In [150]:
# using in keyword

james_bond.query("Actor in ['Sean Connery','Daniel Craig']")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Never Say Never Again,1983,Sean Connery,Irvin Kershner,380.0,86.0,
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Quantum of Solace,2008,Daniel Craig,Marc Forster,514.2,181.4,8.1
Skyfall,2012,Daniel Craig,Sam Mendes,943.5,170.2,14.5


In [151]:
# using not in keyword

james_bond.query("Actor not in ['Sean Connery','Daniel Craig']")

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
On Her Majesty's Secret Service,1969,George Lazenby,Peter R. Hunt,291.5,37.3,0.6
Live and Let Die,1973,Roger Moore,Guy Hamilton,460.3,30.8,
The Man with the Golden Gun,1974,Roger Moore,Guy Hamilton,334.0,27.7,
The Spy Who Loved Me,1977,Roger Moore,Lewis Gilbert,533.0,45.1,
Moonraker,1979,Roger Moore,Lewis Gilbert,535.0,91.5,
For Your Eyes Only,1981,Roger Moore,John Glen,449.4,60.2,
Octopussy,1983,Roger Moore,John Glen,373.8,53.9,7.8
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
The Living Daylights,1987,Timothy Dalton,John Glen,313.5,68.8,5.2


### `.apply()` method

In [155]:
# Reload a fresh copy indexed by Film. Rows stay in CSV order: a bare
# sort_index() call returns a new frame and would not change james_bond.
james_bond = pd.read_csv('pandas datasets/jamesbond.csv', index_col='Film')
james_bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6
Goldfinger,1964,Sean Connery,Guy Hamilton,820.4,18.6,3.2


In [158]:
def million(number):
    """Return the string form of ``number`` followed by " Million"."""
    # !s forces str() conversion, matching plain string concatenation.
    return f"{number!s} Million"

james_bond['Box Office'].apply(million)

Film
Dr. No                             448.8 Million
From Russia with Love              543.8 Million
Goldfinger                         820.4 Million
Thunderball                        848.1 Million
Casino Royale                      315.0 Million
You Only Live Twice                514.2 Million
On Her Majesty's Secret Service    291.5 Million
Diamonds Are Forever               442.5 Million
Live and Let Die                   460.3 Million
The Man with the Golden Gun        334.0 Million
The Spy Who Loved Me               533.0 Million
Moonraker                          535.0 Million
For Your Eyes Only                 449.4 Million
Never Say Never Again              380.0 Million
Octopussy                          373.8 Million
A View to a Kill                   275.2 Million
The Living Daylights               313.5 Million
Licence to Kill                    250.9 Million
GoldenEye                          518.5 Million
Tomorrow Never Dies                463.2 Million
The World Is No