# Pandas


Pandas is a data manipulation library in python, used for data analysis and cleaning.

It provides two primary structures:

- **Series**: one-dimension array like object.
- **DataFrames**: two-dimensional, size-mutable tabular data structure.


Install packages


In [102]:
!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.0


Import packages


In [None]:
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd

## Series


Create series from List


In [None]:
data_from_list = [1, 2, 3, 4, 5]

series_from_list = pd.Series(data_from_list)

print(f"Series:\n{series_from_list}")
print(type(series_from_list))

Series:
0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


Create Series from Dict


In [None]:
data_from_dict = {"a": 1, "b": 2}

series_from_dict = pd.Series(data_from_dict)

print(f"Series:\n{series_from_dict}")

Series:
a    1
b    2
dtype: int64


Working with Series indexes


In [None]:
data_from_series_with_index = [10, 20, 30]
series_index = ["a", "b", "c"]

series_from_data_with_index = pd.Series(
    data_from_series_with_index, index=series_index
)

print(f"Series:\n{series_from_data_with_index}")

Series:
a    10
b    20
c    30
dtype: int64


## Data frames


### Creating DataFrames


DataFrame from a list of lists


In [None]:
data_from_list_of_lists = [
    [1, "Alice", 25],
    [2, "Bob", 30],
    [3, "Charlie", 22],
]

columns_for_list_of_lists = ["ID", "Name", "Age"]

df_from_list_of_lists = pd.DataFrame(
    data_from_list_of_lists,
    columns=columns_for_list_of_lists,
)

df_from_list_of_lists

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30
2,3,Charlie,22


DataFrame from a dictionary of lists


In [None]:
data_from_dict_of_lists = {
    "Name": ["Pedro", "James", "John"],
    "Age": [33, 27, 52],
}

dataframe_dict_of_lists = pd.DataFrame(data_from_dict_of_lists)
dataframe_dict_of_lists

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


DataFrame from a list of dicts


In [None]:
data_from_list_of_dicts = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_from_list_of_dicts = pd.DataFrame(data_from_list_of_dicts)
dataframe_from_list_of_dicts

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


DataFrame from json string


In [None]:
data_from_json_string = '[{"Name": "Pedro", "Age": 33}]'

dataframe_from_json_string = pd.read_json(StringIO(data_from_json_string))
dataframe_from_json_string

Unnamed: 0,Name,Age
0,Pedro,33


Convert DataFrame back to json


In [None]:
dataframe_from_json_string.to_json()

'{"Name":{"0":"Pedro"},"Age":{"0":33}}'

Change DataFrame orientation to index


In [None]:
dataframe_from_json_string.to_json(orient="index")

'{"0":{"Name":"Pedro","Age":33}}'

Change DataFrame orientation to records


In [None]:
dataframe_from_json_string.to_json(orient="records")

'[{"Name":"Pedro","Age":33}]'

Dataframe data types


In [None]:
dataframe_from_json_string.dtypes

Name    object
Age      int64
dtype: object

First rows of a DataFrame


In [None]:
data_for_methods_examples = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_methods_examples = pd.DataFrame(data_for_methods_examples)

dataframe_for_methods_examples.head(n=2)

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27


Information about a DataFrame


In [None]:
dataframe_for_methods_examples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes


Describing a DataFrame


In [None]:
dataframe_for_methods_examples.describe()

Unnamed: 0,Age
count,3.0
mean,37.333333
std,13.051181
min,27.0
25%,30.0
50%,33.0
75%,42.5
max,52.0


Last rows of a DataFrame


In [None]:
dataframe_for_methods_examples.tail(2)

Unnamed: 0,Name,Age
1,James,27
2,John,52


Renaming columns in a DataFrame


In [None]:
dataframe_for_methods_examples.rename({"Name": "First Name"})

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


### Accessing DataFrame elements


Accessing a DataFrame Serie


In [None]:
print(dataframe_for_methods_examples["Name"])
print(type(dataframe_for_methods_examples["Name"]))

0    Pedro
1    James
2     John
Name: Name, dtype: object
<class 'pandas.core.series.Series'>


### DataFrame Index


In [None]:
data_for_index_usage = [
    [1, "Alice", 25],
    [2, "Bob", 30],
    [3, "Charlie", 22],
]

columns_for_index_usage = ["ID", "Name", "Age"]

dataframe_for_index_usage = pd.DataFrame(
    data_for_index_usage, columns=columns_for_index_usage
)

dataframe_for_index_usage.head(n=2)

Unnamed: 0,ID,Name,Age
0,1,Alice,25
1,2,Bob,30


In [None]:
dataframe_for_index_usage.index

RangeIndex(start=0, stop=3, step=1)

Replacing index


In [None]:
dataframe_for_index_usage.index = ["a", "b", "c"]

dataframe_for_index_usage.head(n=2)

Unnamed: 0,ID,Name,Age
a,1,Alice,25
b,2,Bob,30


Accessing a DataFrame row index


In [None]:
dataframe_for_index_usage.loc["a"]

ID          1
Name    Alice
Age        25
Name: a, dtype: object

Accessing a DataFrame by position with iloc


In [None]:
dataframe_for_index_usage.iloc[1]

ID        2
Name    Bob
Age      30
Name: b, dtype: object

Accessing a value from a specific row/column at index with at


In [None]:
dataframe_for_index_usage.at["a", "Age"]

np.int64(25)

Accessing a value from a specific row/column at position with iat


In [None]:
dataframe_for_index_usage.iat[2, 1]

'Charlie'

Resetting index


In [None]:
dataframe_for_index_usage.reset_index()

Unnamed: 0,index,ID,Name,Age
0,a,1,Alice,25
1,b,2,Bob,30
2,c,3,Charlie,22


### Modifying DataFrames


In [None]:
data_for_manipulation = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_manipulation = pd.DataFrame(data_for_manipulation)
dataframe_for_manipulation.head(2)

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27


Adding a new column to a DataFrame


In [None]:
dataframe_for_manipulation["City"] = ["New York", "Florida", "Los Angeles"]
dataframe_for_manipulation

Unnamed: 0,Name,Age,City
0,Pedro,33,New York
1,James,27,Florida
2,John,52,Los Angeles


Dropping a column, returns a new DataFrame unless inplace=True


In [None]:
dataframe_for_manipulation.drop("City", axis=1)  # axis=0 rows, axis=1 columns

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


Deleting a column, modifies the original DataFrame


In [None]:
del dataframe_for_manipulation["City"]

Removing a row


In [None]:
dataframe_for_manipulation.drop(1, axis=0)

Unnamed: 0,Name,Age
0,Pedro,33
2,John,52


Element-wise operations on DataFrames


In [None]:
dataframe_for_manipulation["Age"] = dataframe_for_manipulation["Age"] + 1
dataframe_for_manipulation

Unnamed: 0,Name,Age
0,Pedro,34
1,James,28
2,John,53


### Filtering DataFrames


In [None]:
dataframe_for_filtering = pd.DataFrame(
    [
        {"Name": "Pedro", "Age": 33},
        {"Name": "James", "Age": 27},
        {"Name": "John", "Age": 52},
        {"Name": "Alice", "Age": 30},
        {"Name": "Bob", "Age": 22},
    ]
)

Filter DataFrame based on a condition


In [None]:
dataframe_for_filtering[
    (dataframe_for_filtering["Age"] > 30)
    & (dataframe_for_filtering["Name"].str.startswith("J"))
]

Unnamed: 0,Name,Age
2,John,52


### String operations on DataFrames


In [None]:
dataframe_for_string_operations = pd.DataFrame(
    {
        "Name": ["Pedro", "James", "John"],
        "City": ["New York", "Florida", "Los Angeles"],
    }
)

dataframe_for_string_operations["Name"].str.lower()

0    pedro
1    james
2     john
Name: Name, dtype: object

In [160]:
dataframe_for_string_operations["City"].str.replace(" ", "_")

0       New_York
1        Florida
2    Los_Angeles
Name: City, dtype: object

### Handling missing data


Handling missing values


In [None]:
data_with_missing = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": None},
    {"Name": "John", "Age": 52},
]

dataframe_with_missing = pd.DataFrame(data_with_missing)

dataframe_with_missing.isnull().any()

Name    False
Age      True
dtype: bool

In [None]:
dataframe_with_missing.isnull().sum()

Name    0
Age     1
dtype: int64

In [None]:
dataframe_with_missing[dataframe_with_missing.isnull().any(axis=1)]

Unnamed: 0,Name,Age
1,James,


In [None]:
dataframe_with_missing.fillna(0)

Unnamed: 0,Name,Age
0,Pedro,33.0
1,James,0.0
2,John,52.0


In [None]:
dataframe_with_missing["Age_fill_NA"] = dataframe_with_missing["Age"].fillna(
    dataframe_with_missing["Age"].mean()
)
dataframe_with_missing

Unnamed: 0,Name,Age,Age_fill_NA
0,Pedro,33.0,33.0
1,James,,42.5
2,John,52.0,52.0


### Casting Data


In [None]:
data_for_casting = [
    {"Name": "Pedro", "Age": 33.0},
    {"Name": "James", "Age": 27.0},
    {"Name": "John", "Age": 52.0},
]

dataframe_for_casting = pd.DataFrame(data_for_casting)

dataframe_for_casting.head()

Unnamed: 0,Name,Age
0,Pedro,33.0
1,James,27.0
2,John,52.0


Casting a DataFrame Series to integer


In [None]:
dataframe_for_casting["Age"] = dataframe_for_casting["Age"].astype(int)
dataframe_for_casting

Unnamed: 0,Name,Age
0,Pedro,33
1,James,27
2,John,52


### Applying function on column


Applying a lambda function to a DataFrame Series


In [None]:
current_year = datetime.now().year

dataframe_for_casting["BirthYear"] = dataframe_for_casting["Age"].apply(
    lambda x: current_year - x
)

dataframe_for_casting.head()

Unnamed: 0,Name,Age,BirthYear
0,Pedro,33,1992
1,James,27,1998
2,John,52,1973


### Aggregation


Make a mock dataset


In [None]:
size = 10

start_date = pd.to_datetime("2023-01-01")

data_for_aggregation = {
    "order_id": np.arange(1, size + 1),
    "product": np.random.choice(
        ["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
    ),
    "region": np.random.choice(["North", "South", "East", "West"], size=size),
    "price": np.random.randint(100, 2000, size=size),
    "quantity": np.random.randint(1, 10, size=size),
    "order_date": start_date
    + pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}

dataframe_for_aggregation = pd.DataFrame(data_for_aggregation)

dataframe_for_aggregation.head(10)

Unnamed: 0,order_id,product,region,price,quantity,order_date
0,1,Phone,North,243,3,2023-06-30
1,2,Phone,South,1769,8,2023-10-01
2,3,Mouse,South,1587,4,2023-07-15
3,4,Phone,West,984,1,2023-01-25
4,5,Phone,West,1200,7,2023-07-29
5,6,Laptop,East,611,4,2023-11-22
6,7,Laptop,West,300,8,2023-11-19
7,8,Keyboard,North,1386,3,2023-12-10
8,9,Laptop,South,1123,3,2023-10-08
9,10,Mouse,West,1997,6,2023-10-23


Group by a column and sum


In [None]:
dataframe_for_aggregation.groupby("product").quantity.sum().reset_index()

Unnamed: 0,product,quantity
0,Keyboard,3
1,Laptop,15
2,Mouse,10
3,Phone,19


Group by multiple columns and sum


In [None]:
dataframe_for_aggregation.groupby(
    ["region", "product"]
).quantity.sum().reset_index()

Unnamed: 0,region,product,quantity
0,East,Laptop,4
1,North,Keyboard,3
2,North,Phone,3
3,South,Laptop,3
4,South,Mouse,4
5,South,Phone,8
6,West,Laptop,8
7,West,Mouse,6
8,West,Phone,8


Aggregate multiple functions


In [None]:
dataframe_for_aggregation.groupby("product").quantity.agg(
    ["mean", "sum", "count"]
).reset_index()

Unnamed: 0,product,mean,sum,count
0,Keyboard,3.0,3,1
1,Laptop,5.0,15,3
2,Mouse,5.0,10,2
3,Phone,4.75,19,4


### Merging and Joining DataFrames


In [None]:
dataframe_for_join_a = pd.DataFrame(
    {"Key": ["A", "B", "C"], "Value": [1, 2, 3]}
)
dataframe_for_join_b = pd.DataFrame(
    {"Key": ["A", "B", "D"], "Value": [4, 5, 6]}
)

Merging inner

$A \cap B$


In [None]:
pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="inner")

Unnamed: 0,Key,Value_x,Value_y
0,A,1,4
1,B,2,5


Merging outer

$A \cup B$


In [None]:
pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="outer")

Unnamed: 0,Key,Value_x,Value_y
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


Merging left


In [None]:
pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="left")

Unnamed: 0,Key,Value_x,Value_y
0,A,1,4.0
1,B,2,5.0
2,C,3,


Merging right


In [None]:
pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="right")

Unnamed: 0,Key,Value_x,Value_y
0,A,1.0,4
1,B,2.0,5
2,D,,6
