# Pandas


Pandas is a data manipulation library in python, used for data analysis and cleaning.

It provides two primary structures:

- **Series**: one-dimension array like object
- **DataFrames**: two-dimensional, size-mutable tabular data structure


## Install packages


In [64]:
!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.0


## Import packages


In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
from IPython.display import display

## Usage


### Series


#### Create series from List


In [None]:
data1 = [1, 2, 3, 4, 5]
series1 = pd.Series(data1)
print(f"Series:\n{series1}")
print(type(series1))

Series:
0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


#### Create Series from Dict


In [None]:
data2 = {"a": 1, "b": 2}
series2 = pd.Series(data2)
print(f"Series:\n{series2}")

Series:
a    1
b    2
dtype: int64


#### Working with Series indexes


In [None]:
data3 = [10, 20, 30]
index3 = ["a", "b", "c"]
series3 = pd.Series(data3, index=index3)
print(f"Series:\n{series3}")

Series:
a    10
b    20
c    30
dtype: int64


### Data frames


#### Loading Data


##### Data frame from a dictionary of lists


In [None]:
data4 = {
    "Name": ["Pedro", "James", "John"],
    "Age": [33, 27, 52],
}

dataframe4 = pd.DataFrame(data4)
dataframe4

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


##### Data frame from a list of dicts


In [None]:
data5 = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe5 = pd.DataFrame(data5)
dataframe5

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


#### Dataframe attributes


In [None]:
dataframe5.dtypes

Name    object
Age      int64
dtype: object

#### Dataframe methods


In [None]:
data6 = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe6 = pd.DataFrame(data6)
dataframe6.head(2)

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27


In [None]:
dataframe6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes


In [None]:
dataframe6.describe()

Unnamed: 0,Age
count,3.0
mean,37.333333
std,13.051181
min,27.0
25%,30.0
50%,33.0
75%,42.5
max,52.0


In [None]:
dataframe6.tail(2)

Unnamed: 0,Name,Age
1,James,27
2,John,52


##### Renaming columns


In [None]:
dataframe6.rename({"Name": "First Name"})

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


##### Accessing a Series


In [None]:
print(dataframe6["Name"])
print(type(dataframe6["Name"]))

0    Pedro
1    James
2     John
Name: Name, dtype: object
<class 'pandas.core.series.Series'>


##### Accessing an row index


In [None]:
dataframe6.loc[0]

Name    Pedro
Age        33
Name: 0, dtype: object

##### Accessing a particular value


In [None]:
dataframe6.at[1, "Age"]

np.int64(27)

In [None]:
dataframe6.iat[2, 1]

np.int64(52)

#### Dataframe manipulation


In [None]:
data7 = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe7 = pd.DataFrame(data7)
dataframe7.head(2)

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27


##### Adding a new column


In [None]:
dataframe7["City"] = ["New York", "Florida", "Los Angeles"]
dataframe7

Unnamed: 0,Name,Age,City
0,Pedro,33,New York
1,James,27,Florida
2,John,52,Los Angeles


##### Removing a column


In [None]:
dataframe7.drop("City", axis=1)  # axis=0 rows, axis=1 columns

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


##### Removing a row


In [None]:
dataframe7.drop(1, axis=0)

Unnamed: 0,Name,Age,City
0,Pedro,33,New York
2,John,52,Los Angeles


##### Modify column value


In [None]:
dataframe7["Age"] = dataframe7["Age"] + 1
dataframe7

Unnamed: 0,Name,Age,City
0,Pedro,34,New York
1,James,28,Florida
2,John,53,Los Angeles


##### Handling missing values


In [None]:
data8 = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": None},
    {"Name": "John", "Age": 52},
]

dataframe8 = pd.DataFrame(data8)

dataframe8.isnull().any()

Name    False
Age      True
dtype: bool

In [None]:
dataframe8.isnull().sum()

Name    0
Age     1
dtype: int64

In [None]:
dataframe8[dataframe8.isnull().any(axis=1)]

Unnamed: 0,Name,Age
1,James,


In [None]:
dataframe8.fillna(0)

Unnamed: 0,Name,Age
0,Pedro,33.0
1,James,0.0
2,John,52.0


In [None]:
dataframe8["Age_fill_NA"] = dataframe8["Age"].fillna(dataframe8["Age"].mean())
dataframe8

Unnamed: 0,Name,Age,Age_fill_NA
0,Pedro,33.0,33.0
1,James,,42.5
2,John,52.0,52.0


#### Data manipulation


##### Casting


In [None]:
data9 = [
    {"Name": "Pedro", "Age": 33.0},
    {"Name": "James", "Age": 27.0},
    {"Name": "John", "Age": 52.0},
]

dataframe9 = pd.DataFrame(data9)

dataframe9.head()

Unnamed: 0,Name,Age
0,Pedro,33.0
1,James,27.0
2,John,52.0


Casting a DataFrame Series to integer


In [None]:
dataframe9["Age"] = dataframe9["Age"].astype(int)
dataframe9

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


##### Applying function on column


Applying a lambda function to a DataFrame Series


In [None]:
current_year = datetime.now().year

dataframe9["BirthYear"] = dataframe9["Age"].apply(lambda x: current_year - x)

dataframe9.head()

Unnamed: 0,Name,Age,BirthYear
0,Pedro,33,1992
1,James,27,1998
2,John,52,1973


##### Aggregation


Make a mock dataset


In [None]:
size = 10

start_date = pd.to_datetime("2023-01-01")

data10 = {
    "order_id": np.arange(1, size + 1),
    "product": np.random.choice(
        ["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
    ),
    "region": np.random.choice(["North", "South", "East", "West"], size=size),
    "price": np.random.randint(100, 2000, size=size),
    "quantity": np.random.randint(1, 10, size=size),
    "order_date": start_date
    + pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}

dataframe10 = pd.DataFrame(data10)

dataframe10.head(10)

Unnamed: 0,order_id,product,region,price,quantity,order_date
0,1,Phone,North,1740,2,2023-11-16
1,2,Keyboard,East,1340,6,2023-12-03
2,3,Phone,North,1684,8,2023-08-07
3,4,Phone,East,537,2,2023-05-18
4,5,Phone,North,1930,3,2023-01-25
5,6,Phone,West,1491,8,2023-03-02
6,7,Keyboard,East,1706,4,2023-01-07
7,8,Mouse,South,480,4,2023-01-08
8,9,Mouse,South,1193,2,2023-11-01
9,10,Mouse,East,316,5,2023-08-14


Group by a column and sum


In [None]:
dataframe10.groupby("product").quantity.sum().reset_index()

Unnamed: 0,product,quantity
0,Keyboard,10
1,Mouse,11
2,Phone,23


Group by multiple columns and sum


In [None]:
dataframe10.groupby(["region", "product"]).quantity.sum().reset_index()

Unnamed: 0,region,product,quantity
0,East,Keyboard,10
1,East,Mouse,5
2,East,Phone,2
3,North,Phone,13
4,South,Mouse,6
5,West,Phone,8


Aggregate multiple functions


In [None]:
dataframe10.groupby("product").quantity.agg(
    ["mean", "sum", "count"]
).reset_index()

Unnamed: 0,product,mean,sum,count
0,Keyboard,5.0,10,2
1,Mouse,3.666667,11,3
2,Phone,4.6,23,5


##### Merging and Joining DataFrames


In [None]:
dataframe11 = pd.DataFrame({"Key": ["A", "B", "C"], "Value": [1, 2, 3]})
dataframe12 = pd.DataFrame({"Key": ["A", "B", "D"], "Value": [4, 5, 6]})

Merging inner

$A \cap B$


In [None]:
pd.merge(dataframe11, dataframe12, on="Key", how="inner")

Unnamed: 0,Key,Value_x,Value_y
0,A,1,4
1,B,2,5


Merging outer

$A \cup B$


In [None]:
pd.merge(dataframe11, dataframe12, on="Key", how="outer")

Unnamed: 0,Key,Value_x,Value_y
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


Merging left


In [107]:
pd.merge(dataframe11, dataframe12, on="Key", how="left")

Unnamed: 0,Key,Value_x,Value_y
0,A,1,4.0
1,B,2,5.0
2,C,3,


Merging right


In [None]:
pd.merge(dataframe11, dataframe12, on="Key", how="right")

Unnamed: 0,Key,Value_x,Value_y
0,A,1.0,4
1,B,2.0,5
2,D,,6
