# Introduction to Pandas

In [1]:
!pip3 install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m508.0/508.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.2 pytz-2024.2 tzdata-2024.1

In [2]:
import numpy as np
import pandas as pd

### The Pandas Objects

1. **Series**: 1D array of indexed data
2. **DataFrame**: 2D array of indexed data, which behaves much like a spreadsheet table.
3. **Index**: The label object attached to every Pandas Series and DataFrame; it behaves like an immutable, _ordered_ set.

## The Pandas Series

In [3]:
# A Series built from a plain list gets a default integer index (0, 1, 2, ...).
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
# The index object of the series "data" is exposed through the .index attribute
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# .values returns all values of the series as a NumPy ndarray
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [6]:
# items() yields (index, value) pairs; list() collects them so they are displayed together
list(data.items())

[(0, 0.25), (1, 0.5), (2, 0.75), (3, 1.0)]

### Indexing, Slicing

In [7]:
# A Series can be indexed and sliced much like a 1D ndarray:
# data[1] picks a single element, data[1:3] returns a sub-series.
print("data[1]:", data[1])
print("data[1:3]:\n", data[1:3])

data[1]: 0.5
data[1:3]:
 1    0.50
2    0.75
dtype: float64


In [8]:
# The `in` operator on a Series checks whether a label exists in its index
print("Does index == 4 exists?", 4 in data)
print("Does index == 3 exists?", 3 in data)
# To check for a value instead, test membership against the .values array
print("Does 1.25 exists in values?", 1.25 in data.values)
print("Does 0.25 exists in values?", 0.25 in data.values)

Does index == 4 exists? False
Does index == 3 exists? True
Does 1.25 exists in values? False
Does 0.25 exists in values? True


### Customizing the Index Object of Series

In [9]:
# The index of a Series does not need to be numeric; here the labels are strings.
inning_data = pd.Series(
    data=[0, 0, 3, 4],
    index=["Inning 1", "Inning 2", "Inning 3", "Inning 4"],
)
inning_data

Inning 1    0
Inning 2    0
Inning 3    3
Inning 4    4
dtype: int64

In [10]:
# The custom string labels are stored in the index of the series
inning_data.index

Index(['Inning 1', 'Inning 2', 'Inning 3', 'Inning 4'], dtype='object')

### Creating Series from Dictionary

In [11]:
# When a Series is built from a dictionary, the keys become the index
# and the values become the data.
fruits = {0: "Apple", 1: "Banana", 2: "Kiwi", 3: "Watermelon"}
fruits_sr = pd.Series(data=fruits)
fruits_sr

0         Apple
1        Banana
2          Kiwi
3    Watermelon
dtype: object

In [12]:
groceries = {
    "f1": "Apple",
    "f2": "Banana",
    "f3": "Kiwi",
    "f4": "Watermelon",
    "v1": "Cabbage",
    "v2": "Carrot",
    "v3": "Mushroom",
}

# Keep only the keys that start with "f" (the fruit entries).
groceries_keys = list(groceries)
fruit_keys = [key for key in groceries_keys if key.startswith("f")]

# Passing index= restricts the Series to the listed dictionary keys.
groceries_sr_fruit = pd.Series(groceries, index=fruit_keys)

In [13]:
# Display the series that holds only the fruit entries
groceries_sr_fruit

f1         Apple
f2        Banana
f3          Kiwi
f4    Watermelon
dtype: object

In [14]:
# Keep only the keys that start with "v" (the vegetable entries) and build a Series from them.
veg_keys = [key for key in groceries_keys if key.startswith("v")]
groceries_sr_veg = pd.Series(data=groceries, index=veg_keys)

In [15]:
# Display the series that holds only the vegetable entries
groceries_sr_veg

v1     Cabbage
v2      Carrot
v3    Mushroom
dtype: object

## The Pandas DataFrame Object

In [16]:
# Two dictionaries keyed by student name: one holds the score, the other the class.
student_score_dict = dict(John=90, Sam=82, Alice=95, Bob=73, Cathy=59)
student_class_dict = dict(John="A", Sam="A", Alice="B", Bob="C", Cathy="C")

In [17]:
# Scores as a Series indexed by student name
student_score_sr = pd.Series(data=student_score_dict)
student_score_sr

John     90
Sam      82
Alice    95
Bob      73
Cathy    59
dtype: int64

In [18]:
# Classes as a Series indexed by student name
student_class_sr = pd.Series(data=student_class_dict)
student_class_sr

John     A
Sam      A
Alice    B
Bob      C
Cathy    C
dtype: object

### Creating DataFrame from Multiple Series

In [19]:
# Each Series becomes one column of the DataFrame (the dictionary key is the column name);
# the student names of the Series form the row index.
students_df = pd.DataFrame(
    data={"class": student_class_sr, "score": student_score_sr},
)
students_df

Unnamed: 0,class,score
John,A,90
Sam,A,82
Alice,B,95
Bob,C,73
Cathy,C,59


### Indexing

In [20]:
# Indexing by column name returns that column as a Series
class_data = students_df["class"]
class_data

John     A
Sam      A
Alice    B
Bob      C
Cathy    C
Name: class, dtype: object

In [21]:
# Indexing by index label using .loc (label-based location); the labels here are the student names
john_data = students_df.loc["John"]
john_data

class     A
score    90
Name: John, dtype: object

In [22]:
# The selected row is a Series indexed by column name, so single fields are read by column name
print("John's Class:", john_data["class"])
print("John's Score:", john_data["score"])

John's Class: A
John's Score: 90


In [23]:
# Indexing by integer position using .iloc (integer location): position 1 is the 2nd student
second_data = students_df.iloc[1]
second_data

class     A
score    82
Name: Sam, dtype: object

### Practice: Analyzing Data with Pandas out of .csv File

`.csv`?
- `C`omma `S`eparated `V`alues

In [25]:
# Load the state/area table from the CSV file into a DataFrame
state_area_df = pd.read_csv("../data/state-areas.csv")
# index: default integer index, 0~51
# columns: ["state", "area (sq. mi)"]
state_area_df

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707
5,Colorado,104100
6,Connecticut,5544
7,Delaware,1954
8,Florida,65758
9,Georgia,59441


### How to find the Area of Wisconsin?

`Method 1`: Stupid Way!

In [33]:
# Convert the "state" column into a plain Python list of state names
state_list = state_area_df["state"].tolist()
state_list

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming',
 'District of Columbia',
 'Puerto Rico']

In [45]:
# get index of Wisconsin: list.index() returns the position of the first matching element
wisconsin_index = state_list.index("Wisconsin")
print("The index of Wisconsin in the DataFrame is:", wisconsin_index)

The index of Wisconsin in the DataFrame is: 48


In [46]:
# finally, get the area of Wisconsin with a single label-based lookup:
# .loc[row_label, column_label] returns the scalar directly instead of first
# building the whole row as an intermediate Series and indexing into it again
print("The area of Wisconsin is:", state_area_df.loc[wisconsin_index, "area (sq. mi)"])

The area of Wisconsin is: 65503


`Method 2`: Using Series

In [58]:
# Reload the table from the CSV file so this method starts from the raw data
state_area_df = pd.read_csv("../data/state-areas.csv")

In [61]:
# get states and area as separate arrays (.values returns the column data as an ndarray)
states_arr = state_area_df["state"].values
area_arr = state_area_df["area (sq. mi)"].values

In [62]:
# Build a Series whose values are the areas and whose index labels are the state names
state_area_sr = pd.Series(data=area_arr, index=states_arr)
state_area_sr

Alabama                  52423
Alaska                  656425
Arizona                 114006
Arkansas                 53182
California              163707
Colorado                104100
Connecticut               5544
Delaware                  1954
Florida                  65758
Georgia                  59441
Hawaii                   10932
Idaho                    83574
Illinois                 57918
Indiana                  36420
Iowa                     56276
Kansas                   82282
Kentucky                 40411
Louisiana                51843
Maine                    35387
Maryland                 12407
Massachusetts            10555
Michigan                 96810
Minnesota                86943
Mississippi              48434
Missouri                 69709
Montana                 147046
Nebraska                 77358
Nevada                  110567
New Hampshire             9351
New Jersey                8722
New Mexico              121593
New York                 54475
North Ca

In [64]:
# With the state names as index labels, the area is looked up directly by state name
print("The area of Wisconsin is:", state_area_sr["Wisconsin"])

The area of Wisconsin is: 65503


`Method 3`: Using DataFrame

In [90]:
# Reload the table from the CSV file so this method starts from the raw data
state_area_df = pd.read_csv("../data/state-areas.csv")
state_area_df

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707
5,Colorado,104100
6,Connecticut,5544
7,Delaware,1954
8,Florida,65758
9,Georgia,59441


In [91]:
# Build a separate DataFrame (state_area_df_2) whose index is the state name and whose
# only column is the area (sq. mi); state_area_df itself is not modified.
state_area_df_2 = pd.DataFrame(
    data={"area (sq. mi)": state_area_df["area (sq. mi)"].values},
    index=state_area_df["state"],
)
state_area_df_2

Unnamed: 0_level_0,area (sq. mi)
state,Unnamed: 1_level_1
Alabama,52423
Alaska,656425
Arizona,114006
Arkansas,53182
California,163707
Colorado,104100
Connecticut,5544
Delaware,1954
Florida,65758
Georgia,59441


In [92]:
# Single label-based lookup: .loc[row_label, column_label] returns the scalar directly
# instead of first building the whole row as an intermediate Series
print("The area of Wisconsin is:", state_area_df_2.loc["Wisconsin", "area (sq. mi)"])

The area of Wisconsin is: 65503


`Method 4`: Taking full advantage of Pandas by using the `set_index` method of the DataFrame object

In [67]:
# Load the table and make the "state" column the row index in one chained expression
state_area_df = pd.read_csv("../data/state-areas.csv").set_index("state")
# index: the state names (index name: "state")
# columns: ["area (sq. mi)"]
state_area_df

Unnamed: 0_level_0,area (sq. mi)
state,Unnamed: 1_level_1
Alabama,52423
Alaska,656425
Arizona,114006
Arkansas,53182
California,163707
Colorado,104100
Connecticut,5544
Delaware,1954
Florida,65758
Georgia,59441


In [68]:
# Single label-based lookup: .loc[row_label, column_label] returns the scalar directly
# instead of first building the whole row as an intermediate Series
print("The area of Wisconsin is:", state_area_df.loc["Wisconsin", "area (sq. mi)"])

The area of Wisconsin is: 65503
