# Introduction to Pandas

In [39]:
!pip3 install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [40]:
import numpy as np
import pandas as pd

## The Pandas Objects

1. **Series**: 1D array of indexed data
2. **DataFrame**: 2D array of indexed data, which behaves much like a spreadsheet table.
3. **Index**: The immutable, _ordered_ set of labels attached to every Pandas Series and DataFrame object.

## The Pandas Series

In [41]:
# Build a Series from a plain Python list; no index is given,
# so pandas supplies the default integer index.
sample_values = [0.25, 0.5, 0.75, 1.0]
data = pd.Series(sample_values)
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [44]:
# The index object of the Series "data" (the default index here is a RangeIndex)
data.index

RangeIndex(start=0, stop=4, step=1)

In [45]:
# Get all values stored in the Series as a NumPy ndarray
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [47]:
# Each item is an (index label, value) pair
list(data.items())

[(0, 0.25), (1, 0.5), (2, 0.75), (3, 1.0)]

### Indexing, Slicing

In [6]:
# With its default integer index, a Series supports element access and
# slicing much like a 1D ndarray.
second_value = data[1]
middle_slice = data[1:3]
print("data[1]:", second_value)
print("data[1:3]:\n", middle_slice)

data[1]: 0.5
data[1:3]:
 1    0.50
2    0.75
dtype: float64


In [53]:
# `in` applied to a Series tests its index labels; test against `.values`
# to look for a stored value instead.
membership_checks = [
    ("Does index == 4 exists?", 4 in data),
    ("Does index == 3 exists?", 3 in data),
    ("Does 1.25 exists in values?", 1.25 in data.values),
    ("Does 0.25 exists in values?", 0.25 in data.values),
]
for question, answer in membership_checks:
    print(question, answer)

Does index == 4 exists? False
Does index == 3 exists? True
Does 1.25 exists in values? False
Does 0.25 exists in values? True


### Customizing the Index Object of Series

In [7]:
# The index of a Series does not have to be numeric: here each value is
# labelled with a string.
inning_labels = ['Inning 1', 'Inning 2', 'Inning 3', 'Inning 4']
inning_values = [0, 0, 3, 4]
inning_data = pd.Series(inning_values, index=inning_labels)
inning_data

Inning 1    0
Inning 2    0
Inning 3    3
Inning 4    4
dtype: int64

In [8]:
inning_data.index

Index(['Inning 1', 'Inning 2', 'Inning 3', 'Inning 4'], dtype='object')

### Creating Series from Dictionary

In [10]:
# A dictionary keyed 0..3; when it is passed to pd.Series the keys become
# the index labels and the dictionary values become the Series values.
fruits = dict(enumerate(["Apple", "Banana", "Kiwi", "Watermelon"]))
fruits_sr = pd.Series(fruits)
fruits_sr

0         Apple
1        Banana
2          Kiwi
3    Watermelon
dtype: object

In [18]:
# Grocery items keyed by a short code: keys starting with "f" hold fruits,
# keys starting with "v" hold vegetables.
groceries = {
  "f1": "Apple",
  "f2": "Banana",
  "f3": "Kiwi",
  "f4": "Watermelon",
  "v1": "Cabbage",
  "v2": "Carrot",
  "v3": "Mushroom"
}

# Kept as a list because a later cell reuses it to pick out the "v" keys.
groceries_keys = list(groceries)
fruit_keys = [k for k in groceries_keys if k.startswith("f")]

# Passing index= together with a dict keeps only the listed keys, in that order.
groceries_sr_fruit = pd.Series(groceries, index=fruit_keys)

In [19]:
# Display the Series restricted to the keys that start with "f"
groceries_sr_fruit

f1         Apple
f2        Banana
f3          Kiwi
f4    Watermelon
dtype: object

In [20]:
# Same selection as for the "f" keys, this time keeping the keys that start with "v".
veg_keys = [key for key in groceries_keys if key.startswith("v")]
groceries_sr_veg = pd.Series(data=groceries, index=veg_keys)

In [21]:
# Display the Series restricted to the keys that start with "v"
groceries_sr_veg

v1     Cabbage
v2      Carrot
v3    Mushroom
dtype: object

## The Pandas DataFrame Object

In [26]:
# Two dictionaries keyed by the same student names: one holds each
# student's score, the other the class they belong to.
student_score_dict = dict(John=90, Sam=82, Alice=95, Bob=73, Cathy=59)
student_class_dict = dict(John="A", Sam="A", Alice="B", Bob="C", Cathy="C")

In [27]:
# Turn the score dictionary into a Series indexed by student name.
student_score_sr = pd.Series(data=student_score_dict)
student_score_sr

John     90
Sam      82
Alice    95
Bob      73
Cathy    59
dtype: int64

In [28]:
# Turn the class dictionary into a Series indexed by student name.
student_class_sr = pd.Series(data=student_class_dict)
student_class_sr

John     A
Sam      A
Alice    B
Bob      C
Cathy    C
dtype: object

### Creating DataFrame from Multiple Series

In [29]:
# Each dictionary key becomes a column name and each Series becomes that
# column; the student names serve as the shared row index.
students_df = pd.DataFrame(
    data={
        "class": student_class_sr,
        "score": student_score_sr,
    }
)
students_df

Unnamed: 0,class,score
John,A,90
Sam,A,82
Alice,B,95
Bob,C,73
Cathy,C,59


### Indexing

In [35]:
# Indexing by column name: selecting a single column returns it as a
# Series named after that column
class_data = students_df["class"]
class_data

John     A
Sam      A
Alice    B
Bob      C
Cathy    C
Name: class, dtype: object

In [36]:
# Indexing by index label using loc (location), i.e. the student name in
# this example; the selected row comes back as a Series keyed by column name
john_data = students_df.loc["John"]
john_data

class     A
score    90
Name: John, dtype: object

In [38]:
# A row selected with .loc is a Series, so its fields are read by column name.
john_class, john_score = john_data["class"], john_data["score"]
print("John's Class:", john_class)
print("John's Score:", john_score)

John's Class: A
John's Score: 90


In [55]:
# Indexing by position using iloc (integer location); positions are
# 0-based, so 1 selects the second student
second_data = students_df.iloc[1]
second_data

class     A
score    82
Name: Sam, dtype: object