# 数据处理


## math 数学工具

$\text{\color{red}只适用于单个数值计算}$


### 特殊数值


In [27]:
import math

# Named constants exposed by the math module, keyed by their attribute name.
vals = dict(
    pi=math.pi,  # circle constant pi, 3.141592653589793
    e=math.e,  # Euler's number e, 2.718281828459045
    tau=math.tau,  # twice pi (2π)
    inf=math.inf,  # floating-point infinity
    nan=math.nan,  # "not a number"
)
# Print one "name: value" line per constant, in insertion order.
for key in vals:
    value = vals[key]
    print(f"{key}: {value}")

pi: 3.141592653589793
e: 2.718281828459045
tau: 6.283185307179586
inf: inf
nan: nan


In [None]:
from math import (
    gcd,
    lcm,
    factorial,
    sqrt,
    isqrt,
    comb,
    ceil,
    floor,
    trunc,
    fabs,
    exp,
    log,
    log2,
    log10,
    pow,
)

### 数值操作


In [29]:
# Sample inputs: a non-integer float for the rounding helpers and an int for fabs.
x, y = 12.23, 702

# Build the label -> result table one entry at a time.
operations = {}
operations["ceil(x)"] = ceil(x)  # round up to the next integer
operations["floor(x)"] = floor(x)  # round down to the previous integer
operations["trunc(x)"] = trunc(x)  # drop the fractional part (rounds toward 0)
operations["fabs(y)"] = fabs(y)  # absolute value, returned as a float

# Print one "label: result" line per entry.
for key in operations:
    value = operations[key]
    print(f"{key}: {value}")

ceil(x): 13
floor(x): 12
trunc(x): 12
fabs(y): 702.0


### math.gcd() 最大公因数 math.lcm() 最小公倍数


In [30]:
# Greatest common divisor and least common multiple of 24 and 18.
operations = {}
operations["24, 18最大公约数"] = gcd(24, 18)  # greatest common divisor (integers only)
operations["24, 18最小公倍数"] = lcm(24, 18)  # least common multiple

# Print one "label: result" line per entry.
for key in operations:
    value = operations[key]
    print(f"{key}: {value}")

24, 18最大公约数: 6
24, 18最小公倍数: 72


### 数值计算


In [32]:
# Sample inputs: x feeds every single-argument function, y is the exponent for pow.
x = 10
y = 2
# Each key appears exactly once: a repeated key in a dict literal is silently
# overwritten by the later entry, so the former second "exp(x)" entry was dead code.
operations = {
    # NOTE: this key ends with a full-width colon, so the printed line reads
    # "factorial(x)：: ...". It is kept as-is to match the recorded output.
    "factorial(x)：": factorial(x),  # x! = 3628800 for x = 10
    "exp(x)": exp(x),  # e raised to the power x
    "sqrt(x)": sqrt(x),  # square root (float)
    "isqrt(x)": isqrt(x),  # integer square root (returns an int)
    "log(x)": log(x),  # natural logarithm ln(x)
    "log10(x)": log10(x),  # base-10 logarithm
    "log2(x)": log2(x),  # base-2 logarithm
    # `pow` is math.pow here (see the `from math import ...` cell earlier in this
    # file), so the result is a float; the built-in pow would return the int 100.
    "pow(x, y)": pow(x, y),  # x raised to the power y (returns a float)
}
# Print one "label: result" line per entry.
for key, value in operations.items():
    print(f"{key}: {value}")

factorial(x)：: 3628800
exp(x): 22026.465794806718
sqrt(x): 3.1622776601683795
isqrt(x): 3
log(x): 2.302585092994046
log10(x): 1.0
log2(x): 3.321928094887362
pow(x, y): 100.0


### math.comb(m,n) 组合

计算组合数 $C^{n}_{m}$（从 m 个元素中取 n 个）。功能类似于 itertools.combinations()，只不过它只返回组合的数量，而不生成具体的组合


In [None]:
# Count the ways to choose 2 items out of 5 when order does not matter.
print(comb(5, 2))  # 10 (5 choose 2)

10


## statistics 统计工具

$\text{\color{red}适用于list和tuple小规模计算}$


In [None]:
import statistics as stats  # 统计模块，这种方式导入模块可以给它起个别名

### 中心趋势


In [39]:
# Small sample used by the central-tendency examples.
data = [1, 3, 3, 6, 7, 8, 9, 10]

# Pair each printed label with the statistics function that produces it, then
# apply every function to `data` in the listed order.
results = {
    label: func(data)
    for label, func in (
        ("mean(data)", stats.mean),  # arithmetic mean
        ("median(data)", stats.median),  # median
        ("mode(data)", stats.mode),  # mode (the most common value)
        ("multimode(data)", stats.multimode),  # list of all modes (Python 3.8+)
        ("median_low(data)", stats.median_low),  # the lower of the two middle values
        ("median_high(data)", stats.median_high),  # the higher of the two middle values
    )
}
# Print one "label: result" line per entry.
for key in results:
    value = results[key]
    print(f"{key}: {value}")

mean(data): 5.875
median(data): 6.5
mode(data): 3
multimode(data): [3]
median_low(data): 6
median_high(data): 7


### 离散程度


In [41]:
# Spread of `data` (assigned earlier in this file): sample vs. population measures.
results = {}
# Sample variance (divides by n-1).
results["样本方差 variance"] = stats.variance(data)
# Population variance (divides by n).
results["总体方差 pvariance"] = stats.pvariance(data)
# Sample standard deviation = sqrt(variance).
results["样本标准差 standard deviation"] = stats.stdev(data)
# Population standard deviation = sqrt(pvariance).
results["总体标准差 population standard deviation"] = stats.pstdev(data)

# Print one "label: result" line per entry.
for key in results:
    value = results[key]
    print(f"{key}: {value}")

样本方差 variance: 10.410714285714286
总体方差 pvariance: 9.109375
样本标准差 standard deviation: 3.2265638511757806
总体标准差 population standard deviation: 3.018174116912409


## numpy 大规模计算工具

$\text{\color{red}支持大规模数据，数组和矩阵的计算}$


In [47]:
import numpy as np

# Small integer sample; every NumPy call below receives this plain Python list.
data = [1, 2, 3, 4]

# Pair each printed label with the callable that produces it, then apply every
# callable to `data` in the listed order. Reductions yield a single number;
# element-wise functions yield an array with one result per input element.
results = {
    label: func(data)
    for label, func in (
        ("numpy.mean(data)", np.mean),  # arithmetic mean
        ("numpy.median(data)", np.median),  # median
        ("numpy.std(data)", np.std),  # standard deviation (divide-by-n form by default)
        ("numpy.var(data)", np.var),  # variance (divide-by-n form by default)
        ("numpy.sqrt(data)", np.sqrt),  # element-wise square root
        ("numpy.log(data)", np.log),  # element-wise natural logarithm
        ("numpy.log10(data)", np.log10),  # element-wise base-10 logarithm
        ("numpy.log2(data)", np.log2),  # element-wise base-2 logarithm
        ("numpy.exp(data)", np.exp),  # element-wise e**x
        ("numpy.power(data, 2)", lambda values: np.power(values, 2)),  # element-wise square
        ("numpy.ceil(data)", np.ceil),  # element-wise round up
        ("numpy.floor(data)", np.floor),  # element-wise round down
        ("numpy.trunc(data)", np.trunc),  # element-wise truncation (rounds toward 0)
        ("numpy.abs(data)", np.abs),  # element-wise absolute value
        ("numpy.sum(data)", np.sum),  # sum of all elements
        ("numpy.prod(data)", np.prod),  # product of all elements
        ("numpy.min(data)", np.min),  # smallest element
        ("numpy.max(data)", np.max),  # largest element
    )
}
# Print one "label: result" line per entry.
for key in results:
    value = results[key]
    print(f"{key}: {value}")
import pandas as pd

numpy.mean(data): 2.5
numpy.median(data): 2.5
numpy.std(data): 1.118033988749895
numpy.var(data): 1.25
numpy.sqrt(data): [1.         1.41421356 1.73205081 2.        ]
numpy.log(data): [0.         0.69314718 1.09861229 1.38629436]
numpy.log10(data): [0.         0.30103    0.47712125 0.60205999]
numpy.log2(data): [0.        1.        1.5849625 2.       ]
numpy.exp(data): [ 2.71828183  7.3890561  20.08553692 54.59815003]
numpy.power(data, 2): [ 1  4  9 16]
numpy.ceil(data): [1 2 3 4]
numpy.floor(data): [1 2 3 4]
numpy.trunc(data): [1 2 3 4]
numpy.abs(data): [1 2 3 4]
numpy.sum(data): 10
numpy.prod(data): 24
numpy.min(data): 1
numpy.max(data): 4


## pandas 数据表格


## re, textwrap 字符串处理


In [None]:
import re
import textwrap

# Sample string mixing words and runs of digits.
text = "hello 123 world 456"

# findall collects every run of digits as a list of strings.
digit_runs = re.findall(r"\d+", text)
print(digit_runs)  # ['123', '456']

# sub replaces every run of digits with "X".
masked = re.sub(r"\d+", "X", text)
print(masked)  # hello X world X

# Split the sentence into lines of at most 10 characters each.
wrapped_text = textwrap.wrap(
    "This is a very long sentence that needs wrapping.",
    width=10,
)
# Emit the wrapped lines one per output line.
print(*wrapped_text, sep="\n")