In [3]:
# recorded by hao (2018/03/09)
# rock3.hao@gmail.com
# qinlab.BNU

# training of pandas 03

In [4]:
import pandas as pd
import numpy as np
# Load the passenger survival training data; the path is relative to the
# directory the notebook is run from.
ttnc_svivl = pd.read_csv(filepath_or_buffer="./data/data_ttnk_train.csv")
# Preview the first five rows (five is also head()'s default).
ttnc_svivl.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# pandas marks a missing value with NaN ("not a number").
# Series.isnull() (the method form of pandas.isnull(series)) returns a boolean
# Series that is True wherever the value is missing.
age = ttnc_svivl["Age"]
print(age.loc[5:10])

age_is_null = age.isnull()
print(age_is_null.loc[5:10])

# Indexing with the boolean mask keeps only the entries whose age is missing.
age_null_true = age.loc[age_is_null]

# The length of that filtered Series is the number of missing ages.
age_null_count = len(age_null_true)
print(age_null_count)

5      NaN
6     54.0
7      2.0
8     27.0
9     14.0
10     4.0
Name: Age, dtype: float64
5      True
6     False
7     False
8     False
9     False
10    False
Name: Age, dtype: bool
177


In [6]:
# Deliberate demonstration: Python's built-in sum() does not skip NaN, and any
# arithmetic that involves NaN yields NaN, so this naive mean prints nan.
mean_age = sum(ttnc_svivl["Age"]) / ttnc_svivl["Age"].size
print(mean_age)

nan


In [7]:
# Filter out the missing values before computing the mean by hand.
# The "not missing" mask is built right here with notnull() rather than by
# comparing an earlier cell's mask with `== False`, so this cell only needs
# ttnc_svivl to be loaded.
good_ages = ttnc_svivl.loc[ttnc_svivl["Age"].notnull(), "Age"]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

29.69911764705882


In [8]:
# Missing data is so common that many pandas methods skip it automatically:
# Series.mean() ignores NaN by default (skipna=True), so no manual filter is needed.
correct_mean_age = ttnc_svivl["Age"].mean(skipna=True)
print(correct_mean_age)

29.69911764705882


In [9]:
# Mean fare for each passenger class, computed by filtering the frame once per
# class and averaging that class's "Fare" values.
passenger_classes = [1, 2, 3]
fares_by_class = {
    pclass: ttnc_svivl.loc[ttnc_svivl["Pclass"] == pclass, "Fare"].mean()
    for pclass in passenger_classes
}
print(fares_by_class)

{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}


In [10]:
# index   -> the column whose distinct values become the rows (the group key)
# values  -> the column the calculation is applied to
# aggfunc -> the calculation itself; the string name "mean" is used instead of
#            np.mean because recent pandas versions warn when NumPy callables
#            are passed here and recommend the string alias.
# Note: despite its name, this table holds the mean age per passenger class.
passenger_survival = ttnc_svivl.pivot_table(index="Pclass", values="Age", aggfunc="mean")
print(passenger_survival)

              Age
Pclass           
1       38.233441
2       29.877630
3       25.140620


In [11]:
# With no aggfunc given, pivot_table aggregates with the mean. Listing several
# columns in `values` produces one aggregated column per entry.
passenger_survival = ttnc_svivl.pivot_table(
    values=["Fare", "Survived", "Age"],
    index="Pclass",
)
print(passenger_survival)

              Age       Fare  Survived
Pclass                                
1       38.233441  84.154687  0.629630
2       29.877630  20.662183  0.472826
3       25.140620  13.675550  0.242363


In [12]:
# Total fare and total "Survived" per port of embarkation.
# The string alias "sum" replaces np.sum: recent pandas versions warn when
# NumPy callables are passed as aggfunc and recommend the string name.
port_stats = ttnc_svivl.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)

                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217


In [13]:
# axis=1 (same as axis='columns') drops every column that contains a null value.
drop_na_columns = ttnc_svivl.dropna(axis=1)
# axis='index' (same as axis=0) drops rows instead; `subset` restricts the null
# check to the listed columns.
new_ttnc_svivl = ttnc_svivl.dropna(axis="index", subset=["Age", "Sex"])

In [14]:
# .loc[row_label, column_name] fetches a single cell by index label and column.
row_index_83_age = ttnc_svivl.loc[83, "Age"]
row_index_766_pclass = ttnc_svivl.loc[766, "Pclass"]
print(row_index_83_age, row_index_766_pclass, sep="\n")

28.0
1


In [15]:
# Sort by age in descending order; each row keeps its original index label.
new_ttnc_svivl = ttnc_svivl.sort_values(by="Age", ascending=False)

# Regenerate the leftmost index so it starts again from 0 in the sorted order.
# drop=True discards the old labels instead of keeping them as a column.
ttnc_reindexed = new_ttnc_svivl.reset_index(drop=True)

In [16]:
def hundredth_row(hao1ei):
    """Return the hundredth item of a Series, selected by position.

    Parameters
    ----------
    hao1ei : pandas.Series
        The Series to read from (for example one column handed over by
        ``DataFrame.apply``).

    Returns
    -------
    The value at zero-based position 99, regardless of the index labels.

    Raises
    ------
    IndexError
        If the Series holds fewer than 100 items.
    """
    return hao1ei.iloc[99]

# Apply the custom function to the frame: DataFrame.apply calls it once per
# column, so the result holds the hundredth item of every column.
# The result gets its own name so that the hundredth_row function is not
# overwritten by a Series and stays callable afterwards.
hundredth_items = ttnc_svivl.apply(hundredth_row)
print(hundredth_items)

PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object


In [17]:
def not_null_count(column):
    """Return how many values in ``column`` are missing (null / NaN).

    Note: despite its name, this function counts the *null* entries, not the
    non-null ones. The name is kept so existing calls keep working.

    Parameters
    ----------
    column : pandas.Series
        The values to inspect (for example one column handed over by
        ``DataFrame.apply``).

    Returns
    -------
    int
        Number of entries for which ``pandas.isnull`` is True.
    """
    # Summing the boolean mask counts the True entries directly, without
    # building a filtered copy of the column first.
    return int(pd.isnull(column).sum())

# Run the counter once per column (axis=0 is apply's default: each column is
# passed to the function in turn).
column_null_count = ttnc_svivl.apply(not_null_count, axis=0)
print(column_null_count)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [18]:
# By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(haolei):
    """Translate a row's ``Pclass`` value into a readable class name.

    Parameters
    ----------
    haolei : mapping-like (e.g. a row Series from ``DataFrame.apply(axis=1)``)
        Must provide a ``'Pclass'`` entry.

    Returns
    -------
    str
        ``"First Class"``, ``"Second Class"`` or ``"Third Class"`` for the
        values 1, 2 and 3; ``"Unknown"`` when the value is missing or is any
        other value.
    """
    pclass = haolei['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    class_names = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    # Fall back explicitly instead of silently returning None for a value
    # outside 1-3.
    return class_names.get(pclass, "Unknown")

# axis="columns" (same as axis=1) hands each row to the function in turn.
classes = ttnc_svivl.apply(which_class, axis="columns")
#print(classes)

In [19]:
def is_minor(row):
    """Tell whether the row's ``"Age"`` is below 18.

    Parameters
    ----------
    row : mapping-like (e.g. a row Series from ``DataFrame.apply(axis=1)``)
        Must provide an ``"Age"`` entry.

    Returns
    -------
    bool
        True when the age is less than 18, otherwise False. A NaN age compares
        as not-less-than, so it yields False.
    """
    return bool(row["Age"] < 18)

# Evaluate every row (axis="columns" is the same as axis=1).
minors = ttnc_svivl.apply(is_minor, axis="columns")
#print(minors)

def generate_age_label(row):
    """Classify the row's ``"Age"`` as ``"unknown"``, ``"minor"`` or ``"adult"``.

    Parameters
    ----------
    row : mapping-like (e.g. a row Series from ``DataFrame.apply(axis=1)``)
        Must provide an ``"Age"`` entry.

    Returns
    -------
    str
        ``"unknown"`` when the age is missing, ``"minor"`` when it is below
        18, and ``"adult"`` otherwise.
    """
    age = row["Age"]
    # Handle the missing case first so the comparison below never sees NaN.
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"

# Label every row (axis="columns" is the same as axis=1).
age_labels = ttnc_svivl.apply(generate_age_label, axis="columns")
# print(age_labels)

In [20]:
# Attach the labels as a new column (this modifies ttnc_svivl itself), then
# average "Survived" within each label; the mean is pivot_table's default.
ttnc_svivl["age_labels"] = age_labels
age_group_survival = ttnc_svivl.pivot_table(
    values="Survived",
    index="age_labels",
)
print(age_group_survival)

            Survived
age_labels          
adult       0.381032
minor       0.539823
unknown     0.293785
