# DataFrame
`DataFrame` là tập các `Series` có cùng `index`, đặt cạnh nhau tạo thành `table`. Mỗi `column` tương ứng với một `Series`.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create a DataFrame

# From a dict -> pd.DataFrame(dict, index=[0, 1, 2, ...])
# Every key becomes a column label and every list supplies that column's values.
data = {
    'Name': ['user1', 'user2', 'user3'],
    'Age': [20, 19, 24],
    'Score': [3.67, 4.0, 3.32],
}
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,Name,Age,Score
0,user1,20,3.67
1,user2,19,4.0
2,user3,24,3.32


In [3]:
# From a list of rows -> pd.DataFrame(list, columns, index=[0, 1, 2, ...])
# Each inner list is one row; column and row labels are passed explicitly.
data = [['PTIT', 28], ['UET', 28.18], ['HUST', 29.18]]
pd.DataFrame(
    data=data,
    index=['A', 'B', 'C'],
    columns=['University', 'Benchmark'],
).head()

Unnamed: 0,University,Benchmark
A,PTIT,28.0
B,UET,28.18
C,HUST,29.18


In [4]:
# From a NumPy array
# dtype=object keeps each element's own Python type. Without it NumPy coerces the
# mixed int/str rows to one string dtype, so 'Question' would hold '1', '2', '3'
# (strings) instead of the integers 1, 2, 3.
data = np.array([[1, 'a'], [2, 'a'], [3, 'c']], dtype=object)
pd.DataFrame(data, columns=['Question', 'Answer'])

Unnamed: 0,Question,Answer
0,1,a
1,2,a
2,3,c


In [5]:
# Read from .csv, excel, ...

# csv
df = pd.read_csv('data.csv')
# Replace the default index with random labels drawn from 0..99 with replacement
# (so duplicate labels are possible). size=len(df) keeps the new index the same
# length as the frame whatever the file contains, and the seeded generator makes
# the labels reproducible across runs.
df.index = np.random.default_rng(42).choice(100, size=len(df), replace=True)
df.head()

# excel
# df = pd.read_excel("data.xlsx", sheet_name="Sheet1")

# json
# df = pd.read_json("data.json")

Unnamed: 0,A,B,C,D
28,0.187497,1.12215,-0.988277,-1.985934
46,0.360803,-0.562243,-0.340693,-0.986988
31,-0.040627,0.067333,-0.452978,0.686223
34,-0.279572,-0.702492,0.252265,0.958977
81,0.537438,-1.737568,0.714727,-0.939288


# Specify the index of DataFrame

In [6]:
# Use the `index` parameter when constructing the DataFrame
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6]],
    index=['a', 'b'],
    columns=["Col-1", "Col-2", "Col-3"],
)
df.head()

Unnamed: 0,Col-1,Col-2,Col-3
a,1,2,3
b,4,5,6


In [7]:
# Assign a new index directly on the existing DataFrame.
# freq='MS' means month start: the start date 2025-08-30 is not itself a month
# start, so the two generated labels are 2025-09-01 and 2025-10-01.
df.index = pd.date_range(start='20250830', periods=2, freq='MS')
df.head()

Unnamed: 0,Col-1,Col-2,Col-3
2025-09-01,1,2,3
2025-10-01,4,5,6


In [8]:
# Replace the date index with plain integer labels 1 and 2
df.index = pd.Index([1, 2])
df.head()

Unnamed: 0,Col-1,Col-2,Col-3
1,1,2,3
2,4,5,6


# get index, values, columns

In [9]:
# Show the row labels, the column labels and the underlying values, one per line
print(
    df.index,
    df.columns,
    df.values,
    sep='\n',
)

Index([1, 2], dtype='int64')
Index(['Col-1', 'Col-2', 'Col-3'], dtype='object')
[[1 2 3]
 [4 5 6]]


# Basic functionality

In [10]:
# .info(): prints the index range, each column's non-null count and dtype,
# and the frame's memory usage
df.info()

# .describe(): per-column summary statistics
# (count, mean, std, min, 25%, 50%, 75%, max)
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 1 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Col-1   2 non-null      int64
 1   Col-2   2 non-null      int64
 2   Col-3   2 non-null      int64
dtypes: int64(3)
memory usage: 64.0 bytes


Unnamed: 0,Col-1,Col-2,Col-3
count,2.0,2.0,2.0
mean,2.5,3.5,4.5
std,2.12132,2.12132,2.12132
min,1.0,2.0,3.0
25%,1.75,2.75,3.75
50%,2.5,3.5,4.5
75%,3.25,4.25,5.25
max,4.0,5.0,6.0


In [11]:
# Aggregations: mean(), count(), std(), min(), max(), ...
# axis=0 -> one result per column (aggregates down the rows)
# axis=1 -> one result per row (aggregates across the columns)
df.mean(axis=0)

Col-1    2.5
Col-2    3.5
Col-3    4.5
dtype: float64

In [12]:
# Row-wise mean: one value per index label
df.mean(axis=1)

1    2.0
2    5.0
dtype: float64

In [13]:
# Column-wise maximum restricted to a subset of columns
df.loc[:, ["Col-1", "Col-3"]].max()

Col-1    4
Col-3    6
dtype: int64

In [14]:
# .nunique() -> number of distinct values in each column
# A cell only displays its last expression, so the whole-frame result is printed
# explicitly; otherwise it would be computed and silently discarded.
print(df.nunique())
df[['Col-1', 'Col-2']].nunique()

Col-1    2
Col-2    2
dtype: int64