# Data Extraction

In [1]:
%load_ext lab_black
import pandas as pd

# Directory prefix that every `pd.read_csv` call in this notebook prepends to the file name.
# NOTE(review): absolute, machine-specific path - consider a location relative to the repository.
file_path = "/Users/fredrikjohannessen/Desktop/repositories/pandas-payground/data/"

### The `set_index()` and `reset_index()` methods

In [2]:
# Variant 1: choose the index column while the CSV is being parsed
df = pd.read_csv(file_path + "jamesbond.csv", index_col="Film")

# Variant 2: parse first, then promote the "Film" column to the index
df = pd.read_csv(file_path + "jamesbond.csv").set_index(keys=["Film"])

# Move the index back into an ordinary column ...
df = df.reset_index()
# ... or throw the current index away instead (result not kept, shown for reference)
df.reset_index(drop=True)

# Swap the index from Film to Year:
# index by Film, turn Film back into a column, then index by Year
df = df.set_index(keys=["Film"]).reset_index().set_index(keys=["Year"])
df.head(3)

Unnamed: 0_level_0,Film,Actor,Director,Box Office,Budget,Bond Actor Salary
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962,Dr. No,Sean Connery,Terence Young,448.8,7.0,0.6
1963,From Russia with Love,Sean Connery,Terence Young,543.8,12.6,1.6
1964,Goldfinger,Sean Connery,Guy Hamilton,820.4,18.6,3.2


### Retrieve rows by their index labels using `.loc[]` Accessor

In [3]:
# Load with the film titles as index labels and order the rows by title
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film").sort_index()
bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [4]:
# Pass a list of index labels to fetch several rows at once.
# Only the last expression of a cell is displayed, so this first lookup is not shown.
bond.loc[["Casino Royale", "Goldfinger", "GoldenEye"]]
# Rows come back in the order requested. A repeated label is returned again, and a label
# shared by several rows ("Casino Royale") yields every matching row each time.
bond.loc[["GoldenEye", "Casino Royale", "Casino Royale"]]

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GoldenEye,1995,Pierce Brosnan,Martin Campbell,518.5,76.9,5.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


In [5]:
# List Slicing syntax
# Label slices run over the sorted index; only the last expression is displayed.
bond.loc["Diamonds Are Forever":"From Russia with Love"]  # Endpoint is included
bond.loc["Diamonds Are Forever":"From Russia with Love":2]  # Same range with a step of 2
bond.loc["Diamonds Are Forever":]  # From this label through the last row
bond.loc[:"Diamonds Are Forever"]  # From the first row up to and including this label

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8


In [6]:
# Check if labelled index exists in dataframe
"Diamonds Are Forever" in bond.index  # Not displayed: only the cell's last expression is shown
"Rocks Are Forever" in bond.index  # No row carries this label, so the result is False

False

### Retrieve Rows by Index Position with `iloc` Accessor

In [7]:
bond.iloc[23]  # Single row at integer position 23
bond.iloc[23:]  # Position 23 through the last row
bond.iloc[23:25]  # Endpoint is excluded
bond.iloc[[25, 23]]  # Specific positions, returned in the order listed

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
You Only Live Twice,1967,Sean Connery,Lewis Gilbert,514.2,59.9,4.4
Thunderball,1965,Sean Connery,Terence Young,848.1,41.9,4.7


### Second arguments to `loc` and `iloc`

In [8]:
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film")
bond = bond.sort_index()
# With `.loc`, the first argument selects rows by label and the second selects columns by label.
bond.loc["Moonraker", "Actor"]  # One row, one column
bond.loc["Moonraker", ["Actor", "Year"]]  # One row, several columns
bond.loc[["Moonraker", "A View to a Kill"], ["Actor", "Year"]]  # Several rows, several columns
bond.loc[["Moonraker", "A View to a Kill"], "Box Office":]  # Column slice by label

# With `.iloc`, both arguments are integer positions.
bond.iloc[14, 1]  # Row position 14, column position 1
bond.iloc[14, 2:]  # Row position 14, columns from position 2 onwards

Director             John Glen
Box Office               373.8
Budget                    53.9
Bond Actor Salary          7.8
Name: Octopussy, dtype: object

### Set new value for a specific cell

In [13]:
# Assigning through `.loc[row_label, column_label]` overwrites one cell of `bond`.
bond.loc["Dr. No", "Actor"] = "Sir Sean Connery"
# A list of column labels paired with a list of values updates several cells of the same row.
bond.loc["Dr. No", ["Box Office", "Budget", "Bond Actor Salary"]] = [1, 2, 3]

### Set Multiple Values in a DataFrame

Note that chained indexing such as `df[mask]["col"] = value` assigns to a temporary copy, while `df.loc[mask, "col"] = value` writes to the existing DataFrame. <br>
`df.loc[]` must therefore be used when changing the values in a DataFrame

In [31]:
# Start again from a fresh, title-sorted copy of the data
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film").sort_index()
bond.head(3)

# Boolean mask: True for every film whose "Actor" value is Sean Connery
actor_is_sean_connery = bond["Actor"] == "Sean Connery"

In [27]:
# Wrong way to do it, because it creates a copy
# `bond[actor_is_sean_connery]` is evaluated first and produces a temporary object, so the
# assignment targets that temporary rather than `bond` (pandas reports it with the warning below).
bond[actor_is_sean_connery]["Actor"] = "Sir Sean Connery"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bond[actor_is_sean_connery]["Actor"] = "Sir Sean Connery"


In [28]:
# Right way to do it
# One `.loc[rows, column]` call selects and assigns in a single step, directly on `bond`.
bond.loc[actor_is_sean_connery, "Actor"] = "Sir Sean Connery"

### Rename Index Labels or Columns in a `DataFrame`

In [69]:
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film")
bond = bond.sort_index()
bond.head(3)

# Renaming Index
# The following two do the same thing
# `rename` produces a new frame; none of the results below are assigned, so `bond` is unchanged.
bond.rename(
    mapper={
        "GoldenEye": "Golden Eye",
        "The World Is Not Enough": "Best Bond Movie Ever",
    },
    axis=0,  # Can also use axis="rows",  or axis="index",
)

# Same mapping, passed through the dedicated `index` keyword instead of `mapper` + `axis`
bond.rename(
    index={
        "GoldenEye": "Golden Eye",
        "The World Is Not Enough": "Best Bond Movie Ever",
    },
)

# Rename Columns
# The following do the same thing
bond.rename(mapper={"Year": "Release Date", "Box Office": "Revenue"}, axis=1)
bond.rename(mapper={"Year": "Release Date", "Box Office": "Revenue"}, axis="columns")
bond.rename(columns={"Year": "Release Date", "Box Office": "Revenue"})

# Renaming all columns
# Assigning to `bond.columns` / `bond.index.name` changes `bond` itself, unlike `rename` above.
col_names = ["Year", "Actor", "Director", "Box Office", "Budget", "Bond Actor Salary"]
col_names = [x.lower() for x in col_names]  # Lower-case every column label
bond.columns = col_names
bond.index.name = "film"
bond.head(3)

Unnamed: 0_level_0,year,actor,director,box office,budget,bond actor salary
film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5,145.3,3.3
Casino Royale,1967,David Niven,Ken Hughes,315.0,85.0,


### Delete Rows or Columns from a DataFrame

In [78]:
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film")
bond = bond.sort_index()
bond.head(3)

# Remove rows by index label
# The results are not assigned, and `bond` still contains these rows afterwards
# (see the final `bond.head(3)` output).
bond.drop("A View to a Kill")
bond.drop(["A View to a Kill", "Die Another Day", "Casino Royale"])

# Remove columns
# Same here: "Box Office" and "Budget" are still present in `bond` afterwards.
bond.drop("Box Office", axis=1)
bond.drop(["Box Office", "Budget"], axis=1)

# pop method
# Unlike the `drop` calls above, this changes `bond` itself.
bond.pop("Actor")  # Removes and returns the "Actor" column

# The del keyword
# Also removes the column from `bond` itself.
del bond["Director"]

bond.head(3)

Unnamed: 0_level_0,Year,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A View to a Kill,1985,275.2,54.5,9.1
Casino Royale,2006,581.5,145.3,3.3
Casino Royale,1967,315.0,85.0,


### Create Random Sample with the `sample` method

In [83]:
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film")
bond = bond.sort_index()
# A fixed `random_state` makes every draw repeatable, so the displayed sample stays the
# same after "Restart & Run All". Change or remove it to get a different draw.
bond.sample(3, random_state=42)  # 3 random rows
bond.sample(frac=0.25, random_state=42)  # Sample 25% of the rows
bond.sample(n=3, axis=1, random_state=42).head(3)  # 3 random columns, first 3 rows

Unnamed: 0_level_0,Year,Director,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A View to a Kill,1985,John Glen,9.1
Casino Royale,2006,Martin Campbell,3.3
Casino Royale,1967,Ken Hughes,


### Use the `nsmallest` / `nlargest` methods

In [89]:
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film")
# Order the rows from the highest to the lowest "Box Office" value
bond = bond.sort_values("Box Office", ascending=False)
# nlargest/nsmallest are faster than sort_values
bond.nlargest(3, columns="Box Office")  # 3 rows with the largest "Box Office"
bond.nsmallest(2, columns="Box Office")  # 2 rows with the smallest "Box Office"

# Called on a single column (a Series), no `columns` argument is needed
bond["Box Office"].nlargest(2)

Film
Skyfall        943.5
Thunderball    848.1
Name: Box Office, dtype: float64

### Filter A `DataFrame` with the `where` method

In [94]:
# The way we previously learned to filter
mask = bond["Actor"] == "Sean Connery"
bond[mask]  # Keeps only the rows where the mask is True

# With the where method, returns the full dataframe but those that do not fit the mask are set to NaN:
# `bond` is still ordered by "Box Office" from the previous cell, hence the row order shown below.
bond.where(mask).head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Skyfall,,,,,,
Thunderball,1965.0,Sean Connery,Terence Young,848.1,41.9,4.7
Goldfinger,1964.0,Sean Connery,Guy Hamilton,820.4,18.6,3.2


### Filter A DataFrame with the `query` method

In [108]:
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film").sort_index()
# Swap spaces for underscores so each column label can be written as a plain name in `query`
bond.columns = [label.replace(" ", "_") for label in bond.columns]

# Single condition
bond.query('Actor == "Sean Connery"')
# Two conditions combined with `and`
bond.query('Actor == "Roger Moore" and Director == "John Glen"')
# Membership test against a list of values
bond.query('Actor in ["Timothy Dalton", "Sean Connery"]').head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box_Office,Budget,Bond_Actor_Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Diamonds Are Forever,1971,Sean Connery,Guy Hamilton,442.5,34.7,5.8
Dr. No,1962,Sean Connery,Terence Young,448.8,7.0,0.6
From Russia with Love,1963,Sean Connery,Terence Young,543.8,12.6,1.6


### A Review of the `apply()` Method on Single Columns

In [115]:
# Fresh, title-sorted copy of the data for this section
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film").sort_index()


def convert_to_string_and_add_millions(number):
    """Return ``str(number)`` followed by the suffix ``" MILLIONS!"``.

    Any value is accepted; a missing value (NaN) is rendered as ``"nan MILLIONS!"``.
    """
    return f"{number!s} MILLIONS!"


# Preview only: `apply` calls the function once per value; the result is not stored.
bond["Box Office"].apply(convert_to_string_and_add_millions).head(3)


columns = ["Box Office", "Budget", "Bond Actor Salary"]

# Overwrite each listed column with its converted, string-valued version.
# Missing values are converted too and show up as "nan MILLIONS!" in the output below.
for col in columns:
    bond[col] = bond[col].apply(convert_to_string_and_add_millions)

bond.head(3)

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A View to a Kill,1985,Roger Moore,John Glen,275.2 MILLIONS!,54.5 MILLIONS!,9.1 MILLIONS!
Casino Royale,2006,Daniel Craig,Martin Campbell,581.5 MILLIONS!,145.3 MILLIONS!,3.3 MILLIONS!
Casino Royale,1967,David Niven,Ken Hughes,315.0 MILLIONS!,85.0 MILLIONS!,nan MILLIONS!


### Apply a Function to every DataFrame Row with the `apply` method

In [131]:
# Fresh, title-sorted copy of the data for this section
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film").sort_index()


def good_movie(row):
    """Rate one film based on its "Actor" and "Budget" entries.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["Actor"]`` and ``row["Budget"]`` (e.g. a DataFrame row).

    Returns
    -------
    str
        "The Best" for Pierce Brosnan, "Okay" for Roger Moore with a budget
        above 40, and "No clue" for everything else.
    """
    # Both fields are read up front, so a missing key fails regardless of the actor.
    actor, budget = row["Actor"], row["Budget"]
    if actor == "Pierce Brosnan":
        return "The Best"
    if actor == "Roger Moore" and budget > 40:
        return "Okay"
    return "No clue"


# axis="columns" (same as axis=1) hands each row to the function, one at a time
bond.apply(good_movie, axis="columns").head(3)

Film
A View to a Kill       Okay
Casino Royale       No clue
Casino Royale       No clue
dtype: object

### The `.copy()` method

In [138]:
bond = pd.read_csv(file_path + "jamesbond.csv", index_col="Film")
bond = bond.sort_index()

# `bond["Director"]` is tied to `bond`: writing into it makes pandas emit the
# "value is trying to be set on a copy of a slice" warning, and depending on the pandas
# version the change may also show up in the original DataFrame.
# The index holds film titles, so the first element is addressed by position with `.iloc`;
# plain `directors[0]` relies on a positional fallback that pandas has deprecated.
directors = bond["Director"]
directors.iloc[0] = "Fredrik"

# Use copy to not affect the original DataFrame
directors = bond["Director"].copy()
directors.iloc[1] = "Fredrik"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  directors[0] = "Fredrik"
