# Pandas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Series

In [None]:
pd.Series([1.2, 2.2, 3.6, 5.9], index=[1, 2, 3, 4])

In [None]:
# 字典型
color_count = pd.Series({"red": 10, "Blue": 20, "Black": 30})
color_count

## Series 属性    

In [None]:
# 获取index
color_count.index

In [None]:
# 获取值
color_count.values

In [None]:
# 获取index对应的值
color_count[0]

## DataFrame
可以给数据加上横纵标签

In [None]:
score = np.random.randint(40, 100, (10, 5))

In [None]:
print(score)

In [None]:
subjects = ["语文", "数学", "英语", "政治", "体育"]
stu = ["Student" + str(i) for i in range(score.shape[0])]
data = pd.DataFrame(score, columns=subjects, index=stu)

## DataFrame属性

In [None]:
data.shape

In [None]:
data.index

In [None]:
data.columns

In [None]:
data.T

In [None]:
data.head(3)

## MultiIndex

In [None]:
arrays = [[1, 1, 2, 2], ["r", "b", "r", "b"]]
pd.MultiIndex.from_arrays(arrays, names=("num", "col"))

# 基本数据操作
## 1. 取值

In [None]:
data = pd.read_csv("cars.csv")
data.head()

In [None]:
# 取数据-先列后行
data["mpg"][1]

In [None]:
# 区间取数据
data.loc[1:6, "mpg":"weight"]

In [None]:
# 下标索引
data.iloc[:5, :3]  # 前五行，前三列

## 2. 赋值

In [None]:
data["cylinders"] = 1  # 全部变为1

In [None]:
data

## 3. 排序

In [None]:
data

In [None]:
data.sort_values(by="mpg", ascending=False)  # 降序
data.sort_values(by="mpg", ascending=True)  # 升序

In [None]:
data.sort_values(by=["mpg", "horsepower"], ascending=True)  # 按照两个值排序，第一个一样就排第二个

In [None]:
data.sort_index()  # 按照索引排序

In [None]:
data["horsepower"].sort_values()  # 按照值排序

# DataFrame 运算
## 1. 算数运算

In [None]:
data

In [None]:
data["mpg"].add(10)

## 2. 逻辑运算

In [None]:
data[data["mpg"] > 25]

In [None]:
data[(data["mpg"] > 23) & (data["mpg"] > 24)]

In [None]:
data.query("mpg < 24 & displacement > 400")

## 3. 统计运算

In [None]:
data.describe()  # 综合统计

In [None]:
data.idxmin

In [None]:
data

In [None]:
# 累计统计函数 & 绘制
data = data.sort_index()
data.head()

In [None]:
smg_rise = data["mpg"]
smg_rise.cumsum().plot()
plt.show()

# 数据处理
## 1.判断缺失值与处理

In [None]:
import pandas as pd
import numpy as np

# Attribute names
attribute_names = [
    "erythema", "scaling", "definite borders", "itching", "koebner phenomenon",
    "polygonal papules", "follicular papules", "oral mucosal involvement",
    "knee and elbow involvement", "scalp involvement", "family history",
    "melanin incontinence", "eosinophils in the infiltrate", "PNL infiltrate",
    "fibrosis of the papillary dermis", "exocytosis", "acanthosis", "hyperkeratosis",
    "parakeratosis", "clubbing of the rete ridges", "elongation of the rete ridges",
    "thinning of the suprapapillary epidermis", "spongiform pustule", "munro microabcess",
    "focal hypergranulosis", "disappearance of the granular layer",
    "vacuolisation and damage of basal layer", "spongiosis", "saw-tooth appearance of retes",
    "follicular horn plug", "perifollicular parakeratosis", "inflammatory monoluclear inflitrate",
    "band-like infiltrate", "Age", "Class"
]

# Load the dermatology.data file from the specified path into a pandas DataFrame
file_path = "cw1/dermatology/dermatology.data"
dermatology_df = pd.read_csv(file_path, header=None, names=attribute_names, sep=",")

# Replace '?' with NaN
dermatology_df.replace("?", np.nan, inplace=True)

In [None]:
np.all(pd.notnull(dermatology_df))  # 证明有缺失值

In [None]:
np.any(pd.isnull(dermatology_df))  # 证明有缺失值

In [None]:
dermatology_df = dermatology_df.replace(to_replace="?", value=np.nan) # 处理"?"数据
dermatology_df

In [None]:
# 判断那一列有缺失值并替换
for i in dermatology_df.columns:
    if np.any(pd.isnull(dermatology_df[i])):
        print(i)
        dermatology_df[i].fillna(dermatology_df[i].mean(), inplace=True)

## 2.数据离散化

In [None]:
Ages = dermatology_df["Age"]
Ages = Ages.replace(to_replace="?", value=np.nan)
Ages = Ages.dropna()  # 删除含有 NaN 的行
Ages = Ages.astype(int)  # 将Series转化为整数类型

In [122]:
# 自动分类
Ages = pd.qcut(Ages, 10)
Ages.value_counts()

Age
(45.9, 51.0]      42
(17.0, 22.0]      41
(35.0, 40.0]      40
(-0.001, 17.0]    38
(22.0, 27.0]      38
(32.0, 35.0]      34
(56.0, 75.0]      34
(51.0, 56.0]      32
(40.0, 45.9]      30
(27.0, 32.0]      29
Name: count, dtype: int64

In [124]:
# 自定义分组
bins = [20, 30, 40, 50, 60, 70]
Ages = pd.cut(Ages, bins)
Ages.value_counts()

Age
(30, 40]    82
(20, 30]    77
(40, 50]    65
(50, 60]    53
(60, 70]    19
Name: count, dtype: int64