In [None]:
import pandas as pd
import numpy as np

# matpltolib, seaborn
# matpltolib是比较底层
# seaborn是基于matpltolib

# https://matplotlib.org/stable/tutorials/index.html
# https://seaborn.pydata.org/examples/index.html
import seaborn as sns

%pylab inline

# 这两行代码解决 plt 中文显示的问题
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg') # 展示的图片格式

from sklearn.model_selection import KFold

In [None]:
# Load the competition training and test sets; both CSV files are decoded as GBK.
train_df = pd.read_csv('./糖尿病遗传风险预测挑战赛公开数据/比赛训练集.csv', encoding='gbk')
test_df = pd.read_csv('./糖尿病遗传风险预测挑战赛公开数据/比赛测试集.csv', encoding='gbk')

In [None]:
# Preview the first rows of the training set.
train_df.head()

# 病人的信息

# 相同性别下 体重指数的均值~

# 特征要与原始样本个数相同！

In [None]:
# Mean BMI ('体重指数') within each sex ('性别') group: one value per group.
train_df.groupby('性别')['体重指数'].mean()

In [None]:
# Difference between each row's BMI and the mean BMI of that row's sex group.
# Mapping the per-group means back through the '性别' column yields one value
# per original row, so the subtraction is row-aligned.
group_mean_bmi = train_df.groupby('性别')['体重指数'].mean()
train_df['体重指数'] - train_df['性别'].map(group_mean_bmi)

In [None]:
# Per-row group mean: look up each row's sex in the Series of per-sex mean BMI.
mean_bmi_by_sex = train_df.groupby('性别')['体重指数'].mean()
train_df['性别'].map(mean_bmi_by_sex)

# Pandas数据分组和聚合

- 行索引、列索引、条件索引
- 分组聚合

## groupby

In [None]:
import pandas as pd

# Read the Pokemon dataset from the remote CSV and preview its first five rows.
pokemon_csv_url = "https://cdn.coggle.club/Pokemon.csv"
df = pd.read_csv(pokemon_csv_url)
df.head()

In [None]:
# Iterating a groupby yields (key, sub-frame) pairs; print each 'Type 1' key
# together with the shape of its group.
for type_name, group_df in df.groupby('Type 1'):
    print(type_name, group_df.shape)

In [None]:
# Maximum HP within each 'Type 1' group.
df.groupby("Type 1")["HP"].max()

In [None]:
# Number of occurrences of each 'Legendary' value within each 'Type 1' group.
df.groupby("Type 1")["Legendary"].value_counts()

## apply

In [None]:
# Maximum HP within each 'Type 1' group, computed by applying np.max to each group.
df.groupby("Type 1")["HP"].apply(np.max)

# Same statistic via the built-in groupby reduction: df.groupby("Type 1")["HP"].max()

In [None]:
# Maximum HP within each 'Type 1' group, this time through a lambda that
# receives each group's HP values as `x`.
df.groupby("Type 1")["HP"].apply(lambda x: np.max(x))

In [None]:
# Median HP within each 'Type 1' group.
# np.percentile takes q on a 0-100 scale, so the median is q=50;
# q=0.5 would return the 0.5th percentile (close to the minimum) instead.
df.groupby("Type 1")["HP"].apply(lambda x: np.percentile(x, 50))

## agg

In [None]:
# Several aggregations at once: mean, min and max of HP within each 'Type 1' group.
df.groupby("Type 1")["HP"].agg(['mean', 'min', 'max']) 

# Single-statistic form for comparison: df.groupby("Type 1")["HP"].max()

In [None]:
# Per-column aggregation: passing a dict to agg lets each column carry its own
# list of statistics. Here HP and Attack both get mean, max and min.
hp_stats = ['mean', 'max', 'min']
attack_stats = ['mean', 'max', 'min']
df.groupby("Type 1").agg({'HP': hp_stats, 'Attack': attack_stats})

## transform

In [None]:
df.head()

# Each row of the raw data is one Pokemon.
# Goal: the maximum HP within the 'Type 1' group.

In [None]:
# Series.map with a dict: each 'Type 1' value is replaced by its dict entry.
type_codes = {'Grass': 1, 'Fire': 2}
df["Type 1"].map(type_codes)

In [None]:
# Per-group maximum HP: a Series indexed by the 'Type 1' values.
df.groupby("Type 1")["HP"].max()

In [None]:
# Series.map also accepts a Series: each 'Type 1' value is looked up in the
# index of the per-group maximum-HP Series.
max_hp_by_type = df.groupby("Type 1")["HP"].max()
df["Type 1"].map(max_hp_by_type)

In [None]:
# transform returns a feature with exactly as many entries as there are samples.
df.groupby("Type 1")["HP"].transform('max')

In [None]:
# Aggregating form of the same statistic, shown for comparison with transform('max').
df.groupby("Type 1")["HP"].max()

# 可视化

In [None]:
# Number of samples for each value of sex ('性别').
train_df['性别'].value_counts()

In [None]:
train_df['性别'].value_counts().plot(kind='bar')
# plot is a pandas method that visualizes a Series.
# kind selects the chart type.
# plot is implemented on top of matplotlib.

In [None]:
# Same counts drawn as a horizontal bar chart (kind='barh').
train_df['性别'].value_counts().plot(kind='barh')

In [None]:
# Count of each sex value within each diabetes-label ('患有糖尿病标识') group.
train_df.groupby(['患有糖尿病标识'])['性别'].value_counts()

In [None]:
# Count plot of the diabetes label, with bars split by sex.
sns.countplot(x='患有糖尿病标识', hue='性别', data=train_df)

In [None]:
# Box plot of birth year ('出生年份') for each diabetes label.
sns.boxplot(y='出生年份', x='患有糖尿病标识', data=train_df)

In [None]:
# Box plot of birth year for each diabetes label, split by sex via hue.
sns.boxplot(y='出生年份', x='患有糖尿病标识', hue='性别', data=train_df)

In [None]:
# Box plot of BMI ('体重指数') for each diabetes label, split by sex via hue.
sns.boxplot(y='体重指数', x='患有糖尿病标识', hue='性别', data=train_df)

In [None]:
# Violin plot of BMI for each diabetes label, split by sex via hue.
sns.violinplot(y='体重指数', x='患有糖尿病标识', hue='性别', data=train_df)

In [None]:
# Kernel density estimate of BMI over the training set.
sns.kdeplot(x='体重指数', data=train_df)

In [None]:
# Violin plot of diastolic blood pressure ('舒张压') for each diabetes label, split by sex.
sns.violinplot(y='舒张压', x='患有糖尿病标识', hue='性别', data=train_df)

In [None]:
# Diastolic blood pressure ('舒张压') against BMI ('体重指数'), colored by the
# diabetes label.
# Different plotting functions expect different kinds of X / Y data, so pick
# the chart type according to the data types being plotted.
sns.scatterplot(data=train_df, x='体重指数', y='舒张压', hue='患有糖尿病标识')

In [None]:
# Two-dimensional KDE of diastolic blood pressure against BMI, one filled,
# half-transparent density per diabetes label.
sns.kdeplot(
    data=train_df,
    x='体重指数',
    y='舒张压',
    hue='患有糖尿病标识',
    fill=True,
    alpha=0.5,
)


In [None]:
# Violin plot of the oral glucose tolerance test ('口服耐糖量测试') for each diabetes label, split by sex.
sns.violinplot(y='口服耐糖量测试', x='患有糖尿病标识', hue='性别', data=train_df)

In [None]:
# Oral glucose tolerance test ('口服耐糖量测试') against BMI, colored by the diabetes label.
sns.scatterplot(data=train_df, x='体重指数', y='口服耐糖量测试', hue='患有糖尿病标识')

In [None]:
# Oral glucose tolerance test against the insulin release test ('胰岛素释放实验'),
# colored by the diabetes label.
sns.scatterplot(data=train_df, x='胰岛素释放实验', y='口服耐糖量测试', hue='患有糖尿病标识')

In [None]:
# Triceps skinfold thickness ('肱三头肌皮褶厚度') against the insulin release test,
# colored by the diabetes label.
sns.scatterplot(data=train_df, x='胰岛素释放实验', y='肱三头肌皮褶厚度', hue='患有糖尿病标识')

In [None]:
# Violin plot of triceps skinfold thickness for each diabetes label, split by sex.
sns.violinplot(y='肱三头肌皮褶厚度', x='患有糖尿病标识', hue='性别', data=train_df)

# 特征工程

In [None]:
import pandas as pd
df = pd.DataFrame({
    'student_id': [1,2,3,4,5,6,7],
    'country': ['China', 'USA', 'UK', 'Japan', 'Korea', 'China', 'USA'],
    'education': ['Master', 'Bachelor', 'Bachelor', 'Master', 'PHD', 'PHD', 'Bachelor'],
    'target': [1, 0, 1, 0, 1, 0, 1]
})
df.head(10)
df.style.set_table_attributes('style="font-size: 17px"')

### One Hot Encoding

In [None]:
# One-hot encode the 'education' column; the other columns are passed through.
pd.get_dummies(df, columns=['education'])

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode 'country' with scikit-learn. The double brackets pass a
# one-column DataFrame, and toarray() converts the encoder output to a dense array.
ohe = OneHotEncoder()
country_onehot = ohe.fit_transform(df[['country']])
country_onehot.toarray()

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each distinct country string with an integer code and store the
# codes in a new 'country_LabelEncoder' column.
le = LabelEncoder()
df['country_LabelEncoder'] = le.fit_transform(df['country'])
df.head(10)

In [None]:
# pandas alternative: factorize returns a (codes, uniques) pair and [0] keeps
# the integer codes. This overwrites the 'country_LabelEncoder' column.
df['country_LabelEncoder'] = pd.factorize(df['country'])[0]
df.head(10)

### Ordinal Encoding

In [None]:
# Ordinal-encode 'education' with an explicit rank: Bachelor < Master < PHD.
education_rank = {'Bachelor': 1, 'Master': 2, 'PHD': 3}

# Map only while the column still holds raw labels. The cell overwrites its own
# input, so re-running it on already-encoded values would look up 1/2/3 in the
# dict, find nothing, and silently turn the whole column into NaN.
if df['education'].isin(list(education_rank)).any():
    df['education'] = df['education'].map(education_rank)
df.head(10)

In [None]:
# Same mapping applied to a Series that also contains 'Postdoctoral', a label
# with no entry in the dict.
pd.Series(['Bachelor','Master','PHD', 'Postdoctoral']).map(
                    {'Bachelor': 1, 
                    'Master': 2, 
                    'PHD': 3})

In [None]:
import category_encoders as ce

# Sum (deviation) encoding of 'country' with category_encoders.
encoder = ce.SumEncoder(cols=['country'])
sum_encoded = encoder.fit_transform(df['country'], df['target'])

# Skip the encoder's first output column (presumably the intercept; TODO
# confirm) and append the remaining columns to the original frame.
pd.concat([df, sum_encoded.iloc[:, 1:]], axis=1)