## 数据分析基本指标
- 均值、中位数、众数、分位数
- 方差、标准差
- 偏态系数
- 峰度系数

## 生成正态分布及相关调用
## 抽样

In [1]:
import numpy as np
import pandas as pd

# Load the HR dataset into a DataFrame and preview its first ten rows.
hr_csv_path = '~/mycode/python/data_analyze/data/HR.csv'
df = pd.read_csv(hr_csv_path)
df.head(n=10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
5,0.41,0.5,2,153,3,0,1,0,sales,low
6,0.1,0.77,6,247,4,0,1,0,sales,low
7,0.92,0.85,5,259,5,0,1,0,sales,low
8,0.89,1.0,5,224,5,0,1,0,sales,low
9,0.42,0.53,2,142,3,0,1,0,sales,low


In [3]:
# Mean of every numeric column.
# numeric_only=True skips the string columns (department, salary); since
# pandas 2.0 they are no longer dropped silently and would raise TypeError.
df.mean(numeric_only=True)

satisfaction_level         0.612850
last_evaluation            0.716102
number_project             3.803054
average_montly_hours     201.050337
time_spend_company         3.498233
Work_accident              0.144610
left                       0.238083
promotion_last_5years      0.021268
dtype: float64

In [4]:
# Median of every numeric column.
# numeric_only=True skips the string columns (department, salary); since
# pandas 2.0 they are no longer dropped silently and would raise TypeError.
df.median(numeric_only=True)

satisfaction_level         0.64
last_evaluation            0.72
number_project             4.00
average_montly_hours     200.00
time_spend_company         3.00
Work_accident              0.00
left                       0.00
promotion_last_5years      0.00
dtype: float64

In [5]:
# Mode: the most frequent value(s) of each column, string columns included.
# A column with several tied modes produces extra rows; the other columns are
# padded with NaN in those rows.
df.mode()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.1,0.55,4.0,135,3.0,0.0,0.0,0.0,sales,low
1,,,,156,,,,,,


In [6]:
# Quantile: q=0.25 gives the first quartile of every numeric column.
# numeric_only=True skips the string columns (department, salary); since
# pandas 2.0 they are no longer dropped silently and would raise TypeError.
df.quantile(q=0.25, numeric_only=True)

satisfaction_level         0.44
last_evaluation            0.56
number_project             3.00
average_montly_hours     156.00
time_spend_company         3.00
Work_accident              0.00
left                       0.00
promotion_last_5years      0.00
Name: 0.25, dtype: float64

In [7]:
# Sample standard deviation (pandas default ddof=1) of every numeric column.
# numeric_only=True skips the string columns (department, salary); since
# pandas 2.0 they are no longer dropped silently and would raise TypeError.
df.std(numeric_only=True)

satisfaction_level        0.248630
last_evaluation           0.171169
number_project            1.232592
average_montly_hours     49.943099
time_spend_company        1.460136
Work_accident             0.351719
left                      0.425924
promotion_last_5years     0.144281
dtype: float64

In [8]:
# Sample variance (pandas default ddof=1) of every numeric column.
# numeric_only=True skips the string columns (department, salary); since
# pandas 2.0 they are no longer dropped silently and would raise TypeError.
df.var(numeric_only=True)

satisfaction_level          0.061817
last_evaluation             0.029299
number_project              1.519284
average_montly_hours     2494.313175
time_spend_company          2.131998
Work_accident               0.123706
left                        0.181411
promotion_last_5years       0.020817
dtype: float64

In [9]:
# Column-wise sum. Numeric columns are added up; the string columns
# (department, salary) are concatenated into one long string, which is why
# the result has dtype object.
df.sum()

satisfaction_level                                                 9191.53
last_evaluation                                                    10740.8
number_project                                                       57042
average_montly_hours                                               3015554
time_spend_company                                                   52470
Work_accident                                                         2169
left                                                                  3571
promotion_last_5years                                                  319
department               salessalessalessalessalessalessalessalessaless...
salary                   lowmediummediumlowlowlowlowlowlowlowlowlowlowl...
dtype: object

In [10]:
# Skewness of every numeric column (positive values indicate a longer right tail).
# numeric_only=True skips the string columns (department, salary); since
# pandas 2.0 they are no longer dropped silently and would raise TypeError.
df.skew(numeric_only=True)

satisfaction_level      -0.476527
last_evaluation         -0.026622
number_project           0.337706
average_montly_hours     0.052842
time_spend_company       1.853319
Work_accident            2.021149
left                     1.230043
promotion_last_5years    6.636968
dtype: float64

In [11]:
# Kurtosis of every numeric column. pandas uses Fisher's definition, so a
# normal distribution scores 0.
# numeric_only=True skips the string columns (department, salary); since
# pandas 2.0 they are no longer dropped silently and would raise TypeError.
df.kurt(numeric_only=True)

satisfaction_level       -0.670635
last_evaluation          -1.239040
number_project           -0.495478
average_montly_hours     -1.134982
time_spend_company        4.773211
Work_accident             2.085320
left                     -0.487060
promotion_last_5years    42.054957
dtype: float64

In [12]:
import scipy.stats as ss

In [13]:
# Build a frozen standard normal distribution (mean 0, standard deviation 1).
data = ss.norm(loc=0, scale=1)
# Query its four moments: mean, variance, skewness and (Fisher) kurtosis.
data.stats(moments='mvsk')

(array(0.), array(1.), array(0.), array(0.))

In [14]:
# pdf: given an x value, return the probability density (the height of the
# curve) at that point; for the standard normal at x = 0 this is about 0.399.
ss.norm.pdf(0.0)

0.3989422804014327

In [15]:
# ppf (percent point function, the inverse of cdf): return the x for which
# the density integrated from -inf to x equals the given probability.
# For a probability of 0.9 that x is about 1.28.
ss.norm.ppf(0.9)

1.2815515655446004

In [16]:
# cdf: integrate the density from -inf up to the given value, i.e. the
# probability of drawing a value less than or equal to it.
ss.norm.cdf(2)

0.9772498680518208

In [17]:
# Probability that a standard normal value falls between -2 and 2, obtained
# as the difference of the two cdf values.
ss.norm.cdf(2)-ss.norm.cdf(-2)

0.9544997361036416

In [18]:
# rvs: draw random values from the standard normal distribution (100 here).
# No seed is set, so the values differ on every run.
ss.norm.rvs(size=100)

array([-1.30342735e+00, -1.48402883e-01, -2.96761207e-02, -1.13830417e+00,
       -6.92102443e-01, -8.04472680e-01, -1.21054439e+00, -1.09872645e-01,
       -1.03607589e+00, -2.53805105e-01, -4.25949857e-01, -1.89337613e-01,
       -1.39783055e+00,  5.74593376e-01, -8.91001140e-01,  1.95677728e+00,
       -1.56518416e-01,  5.32116743e-01,  3.12342802e-01,  2.67789764e+00,
       -2.06395156e-01,  2.19744956e-01,  2.98540444e-01,  1.47245806e+00,
       -8.41060842e-02,  1.37088932e+00,  8.23839298e-01, -1.43983407e+00,
       -4.87517938e-01, -5.38710452e-01, -5.45697741e-02, -3.73661891e-01,
        2.14525971e+00, -7.14336511e-01, -4.95644976e-01, -1.58749321e+00,
        7.51082068e-01, -7.13406417e-01,  1.32350696e-01, -4.27594247e-01,
       -1.62437072e+00,  8.41785844e-01,  2.41644502e-01,  8.41964036e-02,
        1.00385706e+00,  9.55466252e-01,  2.70956307e+00, -3.20840329e-01,
        4.80946161e-02,  1.59672958e+00, -2.78573296e-01,  2.60933003e-01,
       -2.92909768e+00,  

In [19]:
# Sampling: pick 10 random rows from the DataFrame.
df.sample(n=10)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
4391,0.72,0.94,4,235,3,0,0,0,technical,medium
13196,0.59,0.65,5,265,3,0,0,0,IT,medium
5715,0.77,0.87,5,257,2,0,0,0,product_mng,medium
11490,0.64,0.87,4,157,7,0,0,0,marketing,low
13923,0.75,0.66,5,234,2,0,0,0,management,high
12549,0.41,0.47,2,138,3,0,1,0,sales,low
1168,0.79,0.84,5,245,5,0,1,0,sales,medium
5525,0.24,0.5,4,232,3,0,0,1,accounting,high
14147,0.53,0.8,2,225,7,1,0,1,management,high
10346,0.93,0.61,4,205,3,0,0,0,technical,low


In [20]:
# Proportional sampling: pick a random 0.1% of the rows.
df.sample(frac=0.001)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
10313,0.53,0.85,5,268,3,1,0,0,sales,medium
10750,0.41,0.63,2,145,2,0,0,0,product_mng,low
5411,0.2,0.51,2,163,2,0,0,0,product_mng,low
518,0.44,0.54,2,151,3,0,1,0,management,low
7419,0.77,0.57,4,238,3,0,0,0,sales,high
11475,0.67,0.65,3,265,3,0,0,0,product_mng,medium
14282,0.09,0.77,5,275,4,0,1,0,product_mng,medium
5585,0.86,0.77,5,230,2,0,0,0,sales,medium
3086,0.57,0.56,4,113,3,0,0,0,hr,low
6402,0.8,0.85,4,139,2,0,0,0,product_mng,medium
