## 单属性分析
- 异常值分析
   - 连续异常值 k*(q3-q1) k取值1.5~3，如果能够覆盖上下界，则没有离群值，否则有离群值需要处理
   - 离散异常值
   - 知识异常值
- 对比分析
   - 比什么
      - 绝对数比较
      - 相对数比较
   - 怎么比
- 结构分析
   - 静态分析 直接分析总体的组成
   - 动态分析 以时间为轴，分析结构的变化趋势
- 分布分析
   - 直接获得概率分布
   - 是不是正态分布
   - 极大似然 相似程度的衡量，最大的极大似然决定分布

In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('~/mycode/python/data_analyze/data/HR.csv')
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


## 分析satisfaction_level

In [4]:
#异常值分析
sl = df['satisfaction_level']
df[df['satisfaction_level'].isnull()]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
14,,0.56,2,137,3,0,1,0,sales,low


In [5]:
#均值
sl.mean()

0.6128503800506743

In [6]:
sl.std()

0.24863036787331577

In [7]:
sl.max()

1.0

In [8]:
sl.min()

0.09

In [9]:
sl.median()

0.64

In [10]:
sl.mode()

0    0.1
dtype: float64

In [11]:
sl.quantile(q=0.25)

0.44

In [12]:
sl.quantile(q=0.75)

0.82

In [13]:
rs = (0.82-0.44)*1.5
0.44-rs

-0.12999999999999995

In [14]:
0.82+rs

1.39

In [15]:
sl.skew() #均值偏小，大部分都会在均值之上，说明员工满意度还是可以的

-0.4765270437780743

In [16]:
sl.kurt() #峰度系数还行，比较平缓

-0.6706347247382953

In [17]:
np.histogram(sl.values,bins=np.arange(0.0,1.1,0.1))

(array([ 195, 1214,  532,  973, 1668, 2146, 1972, 2074, 2220, 2004]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]))

## 分析last_evaluation

In [18]:
le = df['last_evaluation']
le[le.isnull()]

Series([], Name: last_evaluation, dtype: float64)

In [19]:
le.describe()

count    14999.000000
mean         0.716102
std          0.171169
min          0.360000
25%          0.560000
50%          0.720000
75%          0.870000
max          1.000000
Name: last_evaluation, dtype: float64

In [20]:
le.skew()

-0.02662174986376086

In [21]:
le.kurt()

-1.2390402819304127

In [22]:
q_low = le.quantile(q=0.25)
q_high = le.quantile(q=0.75)
q_interval = q_high - q_low
k = 1.5
le = le[le<q_high+k*q_interval][le>q_low-k*q_interval]
le.count()

14999

In [23]:
np.histogram(le.values,bins=np.arange(0.0,1.1,0.1))

(array([   0,    0,    0,  179, 1389, 3395, 2234, 2062, 2752, 2988]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]))

## number_project

In [24]:
numProj = df['number_project']
numProj[numProj.isnull()]

Series([], Name: number_project, dtype: int64)

In [25]:
numProj.mean()

3.80305353690246

In [26]:
numProj.median()

4.0

In [27]:
numProj.mode()

0    4
dtype: int64

In [28]:
numProj.min()

2

In [29]:
numProj.max()

7

In [30]:
numProj.std()

1.2325923553183857

In [31]:
numProj.skew()

0.3377056123598222

In [32]:
numProj.kurt()

-0.4954779519008947

In [33]:
numProj.value_counts()

4    4365
3    4055
5    2761
2    2388
6    1174
7     256
Name: number_project, dtype: int64

In [34]:
numProj.value_counts(normalize=True)

4    0.291019
3    0.270351
5    0.184079
2    0.159211
6    0.078272
7    0.017068
Name: number_project, dtype: float64

In [35]:
numProj.value_counts(normalize=True).sort_index()

2    0.159211
3    0.270351
4    0.291019
5    0.184079
6    0.078272
7    0.017068
Name: number_project, dtype: float64

## average_montly_hours

In [36]:
amh = df['average_montly_hours']
amh.describe()

count    14999.000000
mean       201.050337
std         49.943099
min         96.000000
25%        156.000000
50%        200.000000
75%        245.000000
max        310.000000
Name: average_montly_hours, dtype: float64

In [37]:
amh.skew()

0.0528419894163242

In [38]:
amh.kurt()

-1.1349815681924558

In [39]:
#分布
np.histogram(amh.values,bins=10)

(array([ 367, 1240, 2733, 1722, 1628, 1712, 1906, 2240, 1127,  324]),
 array([ 96. , 117.4, 138.8, 160.2, 181.6, 203. , 224.4, 245.8, 267.2,
        288.6, 310. ]))

In [40]:
np.histogram(amh.values,bins=np.arange(amh.min(),amh.max()+10,10))

(array([ 168,  171,  147,  807, 1153, 1234, 1072,  824,  818,  758,  751,
         738,  856,  824,  987, 1002, 1045,  935,  299,  193,  131,   86]),
 array([ 96, 106, 116, 126, 136, 146, 156, 166, 176, 186, 196, 206, 216,
        226, 236, 246, 256, 266, 276, 286, 296, 306, 316]))

In [41]:
amh.value_counts(bins=np.arange(amh.min(),amh.max()+10,10)).sort_index()

(95.999, 106.0]     187
(106.0, 116.0]      162
(116.0, 126.0]      162
(126.0, 136.0]      886
(136.0, 146.0]     1159
(146.0, 156.0]     1277
(156.0, 166.0]      992
(166.0, 176.0]      832
(176.0, 186.0]      813
(186.0, 196.0]      761
(196.0, 206.0]      755
(206.0, 216.0]      731
(216.0, 226.0]      873
(226.0, 236.0]      814
(236.0, 246.0]     1006
(246.0, 256.0]      987
(256.0, 266.0]     1063
(266.0, 276.0]      860
(276.0, 286.0]      319
(286.0, 296.0]      164
(296.0, 306.0]      128
(306.0, 316.0]       68
Name: average_montly_hours, dtype: int64

## time_spend_company

In [42]:
tsc = df['time_spend_company']
tsc.describe()

count    14999.000000
mean         3.498233
std          1.460136
min          2.000000
25%          3.000000
50%          3.000000
75%          4.000000
max         10.000000
Name: time_spend_company, dtype: float64

In [43]:
tsc.skew()

1.8533190474339154

In [44]:
tsc.kurt()

4.773210725010069

In [45]:
tsc.value_counts().sort_index()

2     3244
3     6443
4     2557
5     1473
6      718
7      188
8      162
10     214
Name: time_spend_company, dtype: int64

## Work_accident

In [46]:
wa = df['Work_accident']
wa.describe()

count    14999.000000
mean         0.144610
std          0.351719
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Work_accident, dtype: float64

In [47]:
wa.value_counts()

0    12830
1     2169
Name: Work_accident, dtype: int64

## left

In [48]:
left = df['left']
left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

## promotion_last_5years

In [49]:
pl = df['promotion_last_5years']
pl.value_counts()

0    14680
1      319
Name: promotion_last_5years, dtype: int64

## salary

In [50]:
salary = df['salary']
salary.value_counts()

#演示在数组中只保留high的行
# salary.where(salary=='high').dropna()

low       7316
medium    6446
high      1237
Name: salary, dtype: int64

## department

In [51]:
dpt = df['department']
dpt.describe()

count     14999
unique       10
top       sales
freq       4140
Name: department, dtype: object

In [52]:
dpt.value_counts(normalize=True)

sales          0.276018
technical      0.181345
support        0.148610
IT             0.081805
product_mng    0.060137
marketing      0.057204
RandD          0.052470
accounting     0.051137
hr             0.049270
management     0.042003
Name: department, dtype: float64