## 数据处理示实例


In [102]:
import pandas as pd
import numpy as np
tsdata = pd.read_csv("titanic_train.csv")  # 载入数据 泰坦尼克船员获获救预测数据集
tsdata.head()
# 数据即中的内容主要包括
#passengerid:乘客id survived:是否幸存 pclass:舱位级别 name:名字 
#sex:性别 age:年龄 sibsp:同亲数量 parch:父母数量 ticket:船票编码 
#fare:船票价格 cabin:船舱位区 embarked:登船地点


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 数据查看

In [47]:
age = tsdata["Age"]  # 前10记录中的年龄的大小 发现存在缺失值
print(age.loc[0:10])


0     22.0
1     38.0
2     26.0
3     35.0
4     35.0
5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64


### 缺失值处理

In [86]:
age_is_null = pd.isnull(age)  # 布尔判断 缺失值标记为true
# age_is_rec = pd.istrue(age) # 没有istrue 判断
print(age_is_null)
age_null_true_data = age[age_is_null]  # 使用返回的布尔判断作为索引,索引出缺失值的记录
print(age_null_true_data)
age_null_true_data_count = len(age_null_true_data)
print(age_null_true_data_count)  # 统计空记录值的记录条数


0      False
1      False
2      False
3      False
4      False
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17      True
18     False
19      True
20     False
21     False
22     False
23     False
24     False
25     False
26      True
27     False
28      True
29      True
       ...  
861    False
862    False
863     True
864    False
865    False
866    False
867    False
868     True
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878     True
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool
5     NaN
17    NaN
19    NaN
26    NaN
28    NaN
29    NaN
31    NaN
32    NaN
36    NaN
42    NaN
45    NaN
46    NaN
47    NaN
48    NaN
55    NaN
64    NaN
65    NaN
7

### 数据统计 (不处理缺失值的情况下)

In [64]:
# 统计平均年龄
mean_age = sum(tsdata["Age"]) / len(tsdata["Age"]) # 不处理缺失值统计均年龄
print(mean_age)  # nan 无


nan


### 数据统计 处理缺失值
#### 可以通过 众数, 均值, 中位数 等统计数值对缺失值进行填充

In [None]:
# 处理缺失值
#print(age_is_null)
good_ages = tsdata["Age"][age_is_null == False]  # 匹配出无缺失值的年龄
#print(good_ages)
correct_mean_age = sum(good_ages) / len(good_ages)  # 无缺失值的均年龄统计
print("使用公式计算均值",correct_mean_age)

correct_mean_age_2 = tsdata["Age"].mean()
print("方法2函数直接计算均值",correct_mean_age_2)


使用 ,dropna .fillna 剔除 填充null值

In [120]:
print("原数据",tsdata.head())
drop_na_columns = tsdata.dropna(axis=1) # 参数 axis:1 行记录的null剔除
print("剔除null了", drop_na_columns)

drop_na_row = tsdata.dropna(axis=0, subset=["Age", "Sex"])  # axis:0 指定属性列
print(drop_na_row)

# 使用fillna 填充null
# fill_na_cols = tsdata.fillna()

原数据    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S 

### 手动计算每个等级的仓位的均价

In [99]:
# 每个等级的仓位的均价
passenger_class = [1,2,3]  # 构建等级索引列表,用于不同等级的记录的匹配
fares_by_class = {}  # 存储结果的字典
print(type(fares_by_class))
for this_class in passenger_class: # 对每个等级的记录进行遍历求均价
    pclass_rows = tsdata[tsdata["Pclass"] == this_class]  # 筛选出该等级的记录
    pclass_fares = pclass_rows["Fare"] #提取该等级的仓位价格
    fare_for_class = pclass_fares.mean()  # 计算该等级的仓位价格均值
    fares_by_class[this_class] = fare_for_class # 计算的均值存放在结果的列表中
    print(fare_for_class)
print(fares_by_class)


<class 'dict'>
84.15468749999992
20.66218315217391
13.675550101832997
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}


.pivot_table 计算数据透视表
.pivot(max, 统计变量, 统计方式)
.pivot(index=索引属性, 统计变量, 统计方式)

In [108]:
# 使用 v.pivot_table 统计不同等级的仓位的幸存的概率
passenger_survival_proba = tsdata.pivot_table(index="Pclass", values= "Survived", aggfunc=np.mean)
# 参数: 
print(passenger_survival_proba)


        Survived
Pclass          
1       0.629630
2       0.472826
3       0.242363


In [109]:
# 使用 v.pivot_table 统计不同等级的仓位平均年龄 默认的统计aggfunc 是求均值
passenger_class_age_mean = tsdata.pivot_table(index="Pclass", values="Age")
print(passenger_class_age_mean) 


              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620


### 使用pivot_table 进行多个变量属性的数据统计

In [110]:
# 统计登船港口的状态
port_stats = tsdata.pivot_table(index="Pclass", values=["Fare", "Survived"], aggfunc=np.sum)
print("港口状态", port_stats)


港口状态               Fare  Survived
Pclass                      
1       18177.4125       136
2        3801.8417        87
3        6714.6951       119


### 对矩阵中的数据进行排序

In [127]:
new_ts_survival = tsdata.sort_values("Age", ascending=False) # age降序排序
print(new_ts_survival[0:10])  
new_ts_index =new_ts_survival.reset_index(drop=True)  # 对age排序后的矩阵,再次进行索引排序, drop:True 重新排序
print(new_ts_index[0:10])


     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0     

### 自定义函数进行数据整合统计 将不同的统计函数进行组合
### 执行自定义的函数使用 .apply(自定义函数名) 

In [128]:
def hundreth_row(column):  # 第一百行数据
    hundreth_item = column.loc[99] # 取99行数据
    return hundreth_item

hundreth_row = tsdata.apply(hundreth_row)  # 更新获取的行数据
print(hundreth_row) 


PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object


### 统计属性null行数

In [132]:
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)
column_null_count = tsdata.apply(not_null_count)
print(column_null_count)



PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [136]:
null = pd.isnull(tsdata["Age"])
print(null)

0      False
1      False
2      False
3      False
4      False
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17      True
18     False
19      True
20     False
21     False
22     False
23     False
24     False
25     False
26      True
27     False
28      True
29      True
       ...  
861    False
862    False
863     True
864    False
865    False
866    False
867    False
868     True
869    False
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878     True
879    False
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool


TypeError: ("'Series' object is not callable", 'occurred at index PassengerId')

对数据处理结果进行矩阵内的属性的显示

In [141]:
# 对年龄进行判断 划归区间
def generate_age_lable(row):  # row 实际上代表矩阵
    age = row["Age"]
    if pd.isnull(age):
        return "unknow"
    elif age < 18:
        return "minor"
    else:return "adult"
age_lables = tsdata.apply(generate_age_lable, axis=1)
print(age_lables)


0       adult
1       adult
2       adult
3       adult
4       adult
5      unknow
6       adult
7       minor
8       adult
9       minor
10      minor
11      adult
12      adult
13      adult
14      minor
15      adult
16      minor
17     unknow
18      adult
19     unknow
20      adult
21      adult
22      minor
23      adult
24      minor
25      adult
26     unknow
27      adult
28     unknow
29     unknow
        ...  
861     adult
862     adult
863    unknow
864     adult
865     adult
866     adult
867     adult
868    unknow
869     minor
870     adult
871     adult
872     adult
873     adult
874     adult
875     minor
876     adult
877     adult
878    unknow
879     adult
880     adult
881     adult
882     adult
883     adult
884     adult
885     adult
886     adult
887     adult
888    unknow
889     adult
890     adult
Length: 891, dtype: object


### 对年龄的标签分类进行幸存率的统计

In [142]:
tsdata["age_lables"] = age_lables # 将新分类的年龄的标签加入到矩阵的记录属性中
age_group_survived = tsdata.pivot_table(index="age_lables", values="Survived", aggfunc=np.mean)
print(age_group_survived)

            Survived
age_lables          
adult       0.381032
minor       0.539823
unknow      0.293785
