In [1]:
import os
import pandas as pd
import jupyter_black

# Activate the jupyter_black extension for this notebook
# (presumably auto-formats code cells with Black; confirm in the jupyter_black docs).
jupyter_black.load()

# Notebooks do not define `__file__`. The previous code passed the *string*
# "__file__" to os.path.abspath, which merely resolved to a path inside the
# current working directory, so ask for that directory explicitly instead.
# All dataset paths below are built relative to this directory.
current_dir = os.getcwd()

### Set file path to datasets

In [41]:
def _dataset_path(file_name):
    """Return the path of `file_name` inside the "data" folder under `current_dir`."""
    return os.path.join(current_dir, "data", file_name)


# One path per CSV file used in the sections below.
pokemon_path = _dataset_path("pokemon.csv")
google_path = _dataset_path("google_stock_price.csv")
nba_path = _dataset_path("nba.csv")
revenue_path = _dataset_path("revenue.csv")
bond_path = _dataset_path("jamesbond.csv")
chicago_path = _dataset_path("chicago.csv")
salesmen_path = _dataset_path("salesmen.csv")
foods_path = _dataset_path("foods.csv")

#### Go back and forth between `pd.Series` and `pd.DataFrame` with `to_frame()` and `squeeze()`

In [3]:
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday"]
fruits = ["Apple", "Orange", "Banana", "Pear"]

# Start from a Series of fruits labelled by weekday.
s = pd.Series(fruits, index=weekdays)
display(s)

# Series -> single-column DataFrame.
df = s.to_frame()
display(df)

# Single-column DataFrame -> back to a Series.
s = df.squeeze()
display(s)

Monday        Apple
Tuesday      Orange
Wednesday    Banana
Thursday       Pear
dtype: object

Unnamed: 0,0
Monday,Apple
Tuesday,Orange
Wednesday,Banana
Thursday,Pear


Monday        Apple
Tuesday      Orange
Wednesday    Banana
Thursday       Pear
Name: 0, dtype: object

### The `apply()` Method

In [4]:
# Load the CSV with the "Pokemon" column as index and turn the remaining
# single data column into a Series. Squeezing only the "columns" axis keeps
# the result a Series even if the file holds a single row (a bare
# `.squeeze()` would collapse that case to a scalar and break `.apply`).
pokemon = pd.read_csv(pokemon_path, index_col="Pokemon").squeeze("columns")


def rank_pokemon(pokemon_type):
    """Map a Pokemon type to a ranking label.

    Returns "Classic" for the Grass, Fire and Water types, "Boring" for the
    Normal type, and "TBD" for any other value.
    """
    classic_types = ("Grass", "Fire", "Water")
    if pokemon_type in classic_types:
        label = "Classic"
    elif pokemon_type == "Normal":
        label = "Boring"
    else:
        label = "TBD"
    return label


# apply() calls rank_pokemon once per value of the Series; preview the first labels.
pokemon_ranks = pokemon.apply(rank_pokemon)
pokemon_ranks.head(3)

Pokemon
Bulbasaur    Classic
Ivysaur      Classic
Venusaur     Classic
Name: Type, dtype: object

### Insert new column at a specific position

In [5]:
nba = pd.read_csv(nba_path)
# insert() modifies `nba` in place: the new column is placed at position 3
# and every row receives the same constant string.
nba.insert(loc=3, column="new_col", value="New column at position 3")
nba.head(3)

Unnamed: 0,Name,Team,Number,new_col,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,New column at position 3,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,New column at position 3,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,New column at position 3,SG,27.0,6-5,205.0,Boston University,


### Apply aggregation functions across the different axes

In [14]:
revenue = pd.read_csv(revenue_path, index_col=["Date"])

# These two calls are equivalent (axis=0 / "index" is the default): they add
# up the values ACROSS rows, giving one total per column
revenue.sum()
revenue.sum(axis="index")
# axis=1 / "columns" adds up the values ACROSS columns, giving one total per date
revenue.sum(axis="columns").head(3)

Date
1/1/16    1606
1/2/16    2060
1/3/16     967
dtype: int64

Note that `df[]` may return a copy, while `df.loc[]` operates directly on the existing DataFrame. <br>
`df.loc[]` must therefore be used when changing the values in a DataFrame

### Use the `nsmallest` / `nlargest` methods

In [19]:
bond = pd.read_csv(bond_path, index_col="Film")
# nlargest/nsmallest return the top/bottom `n` rows by the given column and
# are faster than sort_values(...).head(n).
# Only the last expression of the cell (the nsmallest result) is displayed.
bond.nlargest(n=3, columns="Box Office")
bond.nsmallest(n=2, columns="Box Office")

Unnamed: 0_level_0,Year,Actor,Director,Box Office,Budget,Bond Actor Salary
Film,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Licence to Kill,1989,Timothy Dalton,John Glen,250.9,56.7,7.9
A View to a Kill,1985,Roger Moore,John Glen,275.2,54.5,9.1


### The `get()` Method 

In [28]:
chicago = pd.read_csv(chicago_path)
# Split each "LAST, FIRST" name on the comma once and reuse the pieces.
name_parts = chicago["Name"].str.split(",")
# str.get(i) picks the i-th piece of every split; strip() removes the blank
# that follows the comma.
chicago["Last Name"] = name_parts.str.get(0).str.strip()
chicago["First Name"] = name_parts.str.get(1).str.strip()
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,Last Name,First Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA


Another way to achieve a similar result is to use `.str.split()` with `expand=True`

In [31]:
chicago = pd.read_csv(chicago_path)
# expand=True spreads the split pieces over separate columns labelled 0, 1, ...
name_parts = chicago["Name"].str.split(",", expand=True)
# Strip the blank left after each comma and use the same column labels
# ("Last Name" / "First Name") as the `str.get()` version of this example.
name_parts.apply(lambda part: part.str.strip()).rename(
    columns={0: "Last Name", 1: "First Name"}
).head(3)

Unnamed: 0,last Name,First Name
0,AARON,ELVIA J
1,AARON,JEFFERY M
2,AARON,KARINA


### The `pivot()` method

Docstring:
Return reshaped DataFrame organized by given index / column values.

Reshape data (produce a "pivot" table) based on column values. Uses
unique values from specified `index` / `columns` to form axes of the
resulting DataFrame. This function does not support data
aggregation, multiple values will result in a MultiIndex in the
columns. See the :ref:`User Guide <reshaping>` for more on reshaping.

In [50]:
# Parse "Date" as datetimes and store "Salesman" as a categorical column.
sales = pd.read_csv(salesmen_path, parse_dates=["Date"]).astype(
    {"Salesman": "category"}
)
display(sales.head(3))
# Long -> wide: one row per date, one column per salesman, cells hold the revenue.
sales = sales.pivot(columns="Salesman", index="Date", values="Revenue")
sales.head(3)

Unnamed: 0,Date,Salesman,Revenue
0,2016-01-01,Bob,7172
1,2016-01-02,Bob,6362
2,2016-01-03,Bob,5982


Salesman,Bob,Dave,Jeb,Oscar,Ronald
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-01,7172,1864,4430,5250,2639
2016-01-02,6362,8278,8026,8661,4951
2016-01-03,5982,4226,5188,7075,2703


### The `pivot_table()` Method

The `pivot_table()` method works like a pivot table in Excel

In [47]:
foods = pd.read_csv(foods_path)
display(foods.head())
# Arguments shared by both pivot tables: the mean of "Spend" per "Gender".
mean_spend_by_gender = {"values": "Spend", "index": "Gender", "aggfunc": "mean"}
foods.pivot_table(**mean_spend_by_gender)
# Same aggregation, additionally broken down by "City" across the columns
# (only this last table is displayed by the cell).
foods.pivot_table(columns="City", **mean_spend_by_gender)

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14
3,Anna,Female,Philadelphia,Once,Ice Cream,11.01
4,Deborah,Female,Philadelphia,Daily,Chalupa,23.49


City,New York,Philadelphia,Stamford
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,48.666194,52.63765,50.502184
Male,52.318418,46.60142,49.596623
