### 导入相关的包和加载示例数据

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
# seaborn bundles sample datasets that can be loaded directly by name
titanic_df = sns.load_dataset("titanic")
# Preview the first 5 rows
titanic_df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### 查看数据量和概览

In [8]:
# Print the (rows, columns) shape; the recorded output is (891, 15)
print(titanic_df.shape)
# Show the DataFrame summary: index, column names, non-null counts and dtypes
titanic_df.info()

(891, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


### 查看数值型变量的描述统计信息，数量、均值、标准差、最大最小值、分位数

In [9]:
# Descriptive statistics (count, mean, std, min/max, quartiles) of the numeric columns
titanic_df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


#### 条形图

In [23]:
# Bar chart of the number of passengers per embarkation port.
# The column is passed by keyword so the call does not rely on seaborn's
# positional-argument order, which is not stable across versions.
sns.countplot(x="embarked", data=titanic_df)

![image.png](./imgs/5.jpg)

#### 饼图

In [None]:
# Aggregate the categorical variable 'embarked' into per-category frequencies
embarked_cnt = titanic_df.groupby('embarked')['embarked'].count()
# Draw the pie chart through the pandas plotting interface
embarked_cnt.plot(kind='pie', autopct='%1.2f%%', figsize=(6, 6))

![image.png](./imgs/6.jpg)

#### 直方图

In [None]:
# Missing values make the histogram call fail, so drop them first.
# Series.dropna() replaces the hand-built `isnull().values == False` mask.
sns.distplot(titanic_df['age'].dropna(), kde=False)

![image.png](./imgs/7.jpg)

#### 箱型图

In [None]:
# Box plot of passenger age. The Series is bound to `x` explicitly so the
# orientation does not depend on how a given seaborn version interprets the
# first positional argument.
sns.boxplot(x=titanic_df['age'])

![image.png](./imgs/8.jpg)

#### 小提琴图

In [None]:
# Violin plot of passenger age, with age on the y axis
sns.violinplot(y=titanic_df['age'])

![image.png](./imgs/9.jpg)

##### 散点图

In [None]:
# Scatter (swarm) plot of age against ticket class
sns.swarmplot(y='age', x='pclass', data=titanic_df)

![image.png](./imgs/10.jpg)
通过散点图可以直观地发现，买得起头等舱(pclass=1)的乘客年龄相对大一些，普通舱(pclass=2)的乘客则以青年为主，符合我们的认知。

##### 相关性图

In [None]:
from scipy.stats import pearsonr

# Correlation between ticket class and age.
# jointplot no longer accepts `stat_func`, so the Pearson coefficient is
# computed explicitly and written onto the joint axes instead.
# pearsonr cannot handle NaN, hence the rows with a missing age are dropped.
pclass_age = titanic_df[['pclass', 'age']].dropna()
g = sns.jointplot(x="pclass", y="age", data=pclass_age, kind="reg")
r_value, p_value = pearsonr(pclass_age['pclass'], pclass_age['age'])
g.ax_joint.annotate('pearsonr = {:.2f}; p = {:.2g}'.format(r_value, p_value),
                    xy=(0.05, 0.95), xycoords='axes fraction')

##### 堆积柱形图

In [14]:
# Number of passengers for every (embarked, who) combination
var = titanic_df.groupby(['embarked', 'who'])['who'].count()
# Pivot 'who' into columns so each embarkation port becomes one stacked bar
var.unstack().plot.bar(stacked=True, color=['red', 'blue', 'green'], figsize=(8, 6))
plt.show()

![image.png](./imgs/12.jpg)

##### 卡方检验

In [2]:
# Chi-square test of independence from scipy
from scipy.stats import chi2_contingency

# Contingency table of embarkation port versus survival
embarked_alive = pd.crosstab(index=titanic_df['embarked'], columns=titanic_df['alive'])
# Returns the statistic, the p-value, the degrees of freedom and the expected frequencies
chi2, p, dof, ex = chi2_contingency(embarked_alive)
print('卡方值：{}'.format(chi2))
print('p_value：{}'.format(p))
print('自由度：{}'.format(dof))
print('期望频率：{}'.format(ex))

卡方值：26.48914983923762
p_value：1.769922284120912e-06
自由度：2
期望频率：[[103.7480315  64.2519685]
 [ 47.5511811  29.4488189]
 [397.7007874 246.2992126]]


##### 带误差条的线图

In [None]:
# Mean age per ticket class with the standard deviation as the error bar.
# The grouping and the mean are computed once and reused instead of
# rebuilding the same groupby for every argument.
age_by_class = titanic_df.groupby(['class']).age
age_mean = age_by_class.mean()
plt.errorbar(x=age_mean.index,
             y=age_mean,
             yerr=age_by_class.std(),
             fmt="o")

plt.xlabel('class')
plt.ylabel('age')
plt.show()

![image.png](./imgs/13.jpg)

##### 组合图

In [None]:
# Combined chart: box plot of age per 'alive' value, with a swarm plot of the
# same data drawn on top in light grey
ax = sns.boxplot(x="alive", y="age", data=titanic_df)
ax = sns.swarmplot(x="alive", y="age", data=titanic_df, color=".8")

![image.png](./imgs/14.jpg)

#### Z检验和t检验

In [3]:
# Two-sample t-test from scipy
from scipy.stats import ttest_ind

# Keep only the rows where age is present
titanic_age = titanic_df[titanic_df['age'].notnull()]

# Compare the ages of female passengers against those of male passengers
t_statistics, p = ttest_ind(titanic_age.loc[titanic_age['sex'] == 'female', 'age'],
                            titanic_age.loc[titanic_age['sex'] == 'male', 'age'])

print('t值：{}'.format(t_statistics))
print('p_value：{}'.format(p))

t值：-2.499206354920835
p_value：0.012671296797013709


#### 多变量可视化分析

In [None]:
# Load a fresh copy of the dataset
titanic = sns.load_dataset('titanic')
# Simple imputation: fill missing ages with the median.
# The result is assigned back to the column; calling fillna(inplace=True) on
# a selected column is chained assignment and may leave the frame unchanged
# under pandas copy-on-write.
titanic['age'] = titanic['age'].fillna(titanic['age'].median())

##### 多图网格

In [None]:
# Scatter of age against fare, coloured by survival.
# `height` is the current name of the facet-size parameter (formerly `size`).
sns.FacetGrid(titanic, hue="alive", height=6).map(plt.scatter, "age", "fare").add_legend()

![image.png](./imgs/15.jpg)

###### PairGrid

In [None]:
# Only two numeric fields are used to keep the figure readable; more can be
# selected in real use.
# `height` is the current name of the facet-size parameter (formerly `size`).
g = sns.PairGrid(titanic[['age', 'fare', 'who']], hue="who", height=5)
# Histograms on the diagonal, scatter plots elsewhere
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter)
g.add_legend()

![image.png](./imgs/16.jpg)

##### Heatmap热力图

In [None]:
# Heatmap of the pairwise correlation coefficients between the variables;
# annot=True writes each coefficient into its cell
f = titanic[['age', 'fare', 'sibsp']].corr()
sns.heatmap(f, annot=True)

![image.png](./imgs/17.jpg)
图中的数值是皮尔森相关系数，浅颜色表示相关性高。

##    查看缺失情况

### Missingno缺失值可视化

In [None]:
import missingno as msno
# For display purposes only a subset of the columns is used, and the figure
# and font sizes are adjusted
msno.matrix(
    df=titanic_df[['sex', 'age', 'fare', 'embarked', 'deck', 'embark_town']],
    figsize=(8, 4),
    fontsize=16)

![image.png](./imgs/18.jpg)

In [None]:
# missingno heatmap over all columns of titanic_df
msno.heatmap(df=titanic_df, figsize=(8, 4), fontsize=18)

![image.png](./imgs/19.jpg)

In [None]:
# missingno bar chart for the same subset of columns used in the matrix plot
msno.bar(df=titanic_df[['sex', 'age', 'fare', 'embarked',
                        'deck', 'embark_town']], figsize=(8, 6), fontsize=18)

![image.png](./imgs/20.jpg)

## 缺失值处理方式

### 删除

In [None]:
# Shape before dropping anything; expected output: (891, 15)
print(titanic_df.shape)

# Drop every row that contains a missing value (use axis=1 to drop columns instead)
titanic_df_row = titanic_df.dropna(axis=0)
# Shape after the row-wise drop; expected output: (182, 15)
print(titanic_df_row.shape)

In [None]:
def drop_nan_stat(df, copy=False, axis=0, nan_threshold=0.9):
    '''Drop the columns or rows whose share of missing values exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified: the result is always a new object
        produced by ``DataFrame.drop``.
    copy : bool, default False
        If True, the statistics are computed on a copy of ``df``.
    axis : int, default 0
        0 -- compute the missing ratio of every column (missing count divided
        by the number of rows) and drop the columns above the threshold.
        1 -- compute the missing ratio of every row (missing count divided by
        the number of columns) and drop the rows above the threshold.
        Any other value returns the data unchanged.
    nan_threshold : float, default 0.9
        Columns/rows whose missing ratio is strictly greater than this value
        are dropped.

    Returns
    -------
    pandas.DataFrame
        The data without the columns/rows that exceeded the threshold.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    '''
    assert isinstance(df, pd.DataFrame)
    return_df = df.copy() if copy else df
    n_rows, n_cols = return_df.shape

    if axis == 0:
        # Missing ratio per column; columns above the threshold are removed
        nan_ratio = return_df.isnull().sum(axis=0) / n_rows
        return_df = return_df.drop(
            labels=nan_ratio[nan_ratio > nan_threshold].index.tolist(), axis=1)
    elif axis == 1:
        # Missing ratio per row; rows above the threshold are removed
        nan_ratio = return_df.isnull().sum(axis=1) / n_cols
        return_df = return_df.drop(
            labels=nan_ratio[nan_ratio > nan_threshold].index.tolist(), axis=0)

    return return_df

# Drop the columns whose missing-value ratio is greater than 0.5
titanic_df_col = drop_nan_stat(df=titanic_df,
                               copy=True,
                               axis=0,
                               nan_threshold=0.5)
msno.bar(df=titanic_df_col, figsize=(8, 4), fontsize=18)

![image.png](./imgs/21.jpg)
可以发现缺失值比例大于0.5的列“deck”已经被删除。

### 填充

#### 替换缺失值

In [None]:
# Later examples keep using titanic_df, so leave it untouched and fill a copy instead
titanic_df_fill = titanic_df.copy()
titanic_df_fill.info()

# Fill missing ages with the median. The result is assigned back to the
# column; fillna(inplace=True) on a selected column is chained assignment and
# may leave the frame unchanged under pandas copy-on-write.
titanic_df_fill['age'] = titanic_df_fill['age'].fillna(titanic_df_fill['age'].median())
# Check whether any missing age remains; expected output: False
print(titanic_df_fill['age'].isnull().any())

# Likewise, fill the categorical variable 'embarked' with its mode
titanic_df_fill['embarked'] = titanic_df_fill['embarked'].fillna(
    titanic_df_fill['embarked'].mode()[0])

#### 拟合缺失值

In [None]:
# Random forest regressor used to predict the missing ages
from sklearn.ensemble import RandomForestRegressor

# The regressor only handles numeric data, so take age plus the numeric features
age_df = titanic_df[['age', 'fare', 'parch', 'sibsp', 'pclass']].copy()
# Expected output: True (age still has missing values)
print(age_df['age'].isnull().any())

# Split by whether age is present: rows with age train the model, rows
# without age are the ones to predict.
# `.values` replaces DataFrame.as_matrix(), which was removed from pandas.
train_df = age_df[age_df.age.notnull()].values
predict_df = age_df[age_df.age.isnull()].values
# y is the target (age, first column)
y = train_df[:, 0]
# X holds the feature columns
X = train_df[:, 1:]

# Fit the model on the rows where age is known
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X, y)
# Predict the age for the rows where it is missing
predict_ages = rf_model.predict(predict_df[:, 1:])

# Write the predictions into the originally missing positions
age_df.loc[(age_df.age.isnull()), 'age'] = predict_ages
# Expected output: False (no missing age remains)
print(age_df['age'].isnull().any())

#### 虚拟变量

In [None]:
# Restore the original age column, including its missing values
age_df['age'] = titanic_df['age'].copy()

# Derive a new variable age_nan that flags whether age is missing
age_df.loc[(age_df.age.notnull()), 'age_nan'] = "False"
age_df.loc[(age_df.age.isnull()), 'age_nan'] = "True"

# Count missing versus present ages; the numbers can be cross-checked against
# the earlier missing-value plots.
# Expected output:
#   False    714
#   True     177
age_df['age_nan'].value_counts()

### 异常值检测

In [None]:
# Only three fields of the Titanic dataset are used for this demonstration
columns = ['pclass', 'age', 'fare']
for var in columns:
    # Standardize the column and store the z-score next to it
    titanic_df[var + '_zscore'] = (titanic_df[var] - titanic_df[var].mean()) / titanic_df[var].std()
    # A value counts as an outlier when its absolute z-score exceeds 3
    z_normal = abs(titanic_df[var + '_zscore']) > 3
    print(var + '中有' + str(z_normal.sum()) + '个异常值')
# Expected output:
#   pclass中有0个异常值
#   age中有2个异常值
#   fare中有20个异常值

In [None]:
for var in columns:
    # Quartiles are computed once per column and reused for both fences
    q1 = titanic_df[var].quantile(0.25)
    q3 = titanic_df[var].quantile(0.75)
    iqr = q3 - q1
    # Values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are counted as outliers
    q_abnormal_L = titanic_df[var] < q1 - 1.5 * iqr
    q_abnormal_U = titanic_df[var] > q3 + 1.5 * iqr
    print(var + '中有' + str(q_abnormal_L.sum() + q_abnormal_U.sum()) + '个异常值')
# Expected output:
#   pclass中有0个异常值
#   age中有11个异常值
#   fare中有116个异常值

In [None]:
# K-means clustering from scikit-learn
from sklearn.cluster import KMeans

# Configure matplotlib with a font that can render the Chinese axis labels
plt.rcParams['font.family'] = 'SimHei'

# Number of clusters
k = 3
# Relative-distance threshold above which a point is treated as an outlier
threshold = 3
# Use the data whose missing ages were already filled
data = titanic[['pclass', 'age', 'fare']].copy()
# Standardize every column
data_zs = 1.0 * (data - data.mean()) / data.std()

# Cluster the standardized data. n_clusters now follows `k` (it was
# hard-coded to 3) and random_state is fixed so a re-run gives the same result.
model = KMeans(n_clusters=k, max_iter=500, random_state=42)
model.fit(data_zs)

# Standardized data together with the assigned cluster label
r = pd.concat([data_zs, pd.Series(model.labels_, index=data.index)], axis=1)
r.columns = list(data.columns) + ['聚类类别']

# Relative distance: distance of each point to its cluster centre, divided by
# the median distance within that cluster
norm = []
for i in range(k):
    norm_tmp = r[['pclass', 'age', 'fare']][r['聚类类别'] == i] - model.cluster_centers_[i]
    norm_tmp = norm_tmp.apply(np.linalg.norm, axis=1)
    norm.append(norm_tmp / norm_tmp.median())
norm = pd.concat(norm)

# Normal points: relative distance not above the threshold
norm[norm <= threshold].plot(style='go')
# Outliers: relative distance above the threshold
discrete_points = norm[norm > threshold]
discrete_points.plot(style='ro')

plt.xlabel('编号')
plt.ylabel('相对距离')
plt.show()

![image.png](./imgs/22.jpg)
获取大于异常点阈值的索引，方便后续处理.

discrete_points = norm[norm>threshold]

discrete_points.index