# Advanced `pandas`

In [None]:
%pylab inline
plt.style.use("bmh")

In [80]:
import numpy as np
import pandas as pd
import re

## Split-apply-combine

In [3]:
titanic_train = pd.read_csv("train.csv", index_col="PassengerId")
titanic_test = pd.read_csv("test.csv", index_col="PassengerId")
titanic = pd.concat([titanic_train, titanic_test], sort=False)

In [4]:
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Scalar output

In [5]:
class_groups = titanic.groupby("Pclass")

In [6]:
class_groups

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000240F45B1EB8>

In [7]:
class_groups.mean()

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.62963,39.15993,0.436533,0.365325,87.508992
2,0.472826,29.506705,0.393502,0.368231,21.179196
3,0.242363,24.816367,0.568406,0.400564,13.302889


In [8]:
age_groups = titanic.Parch.groupby((5 + 10*(titanic.Age//10)))

In [9]:
age_groups

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000240F45FB128>

In [10]:
age_groups.mean() # Note index name

Age
5.0     1.390244
15.0    0.482517
25.0    0.223837
35.0    0.314655
45.0    0.503704
55.0    0.300000
65.0    0.531250
75.0    0.142857
85.0    0.000000
Name: Parch, dtype: float64

In [11]:
age_groups_npy = titanic.Parch.groupby((5 + 10*(titanic.Age//10)).values)

In [12]:
age_groups_npy.mean()

5.0     1.390244
15.0    0.482517
25.0    0.223837
35.0    0.314655
45.0    0.503704
55.0    0.300000
65.0    0.531250
75.0    0.142857
85.0    0.000000
Name: Parch, dtype: float64

In [13]:
age_groups_multi = titanic.Parch.groupby([(5 + 10*(titanic.Age//10)), titanic.Pclass])

In [14]:
age_groups_multi = titanic.Parch.groupby([titanic.Pclass, (5 + 10*(titanic.Age//10))])

In [15]:
age_groups_multi

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000240F463EF60>

In [16]:
age_groups_multi.mean()

Pclass  Age 
1       5.0     2.000000
        15.0    0.863636
        25.0    0.480769
        35.0    0.250000
        45.0    0.209677
        55.0    0.369565
        65.0    0.761905
        75.0    0.250000
        85.0    0.000000
2       5.0     1.409091
        15.0    0.310345
        25.0    0.288889
        35.0    0.265625
        45.0    0.451613
        55.0    0.235294
        65.0    0.142857
        75.0    0.000000
3       5.0     1.339286
        15.0    0.445652
        25.0    0.128713
        35.0    0.395833
        45.0    0.976190
        55.0    0.000000
        65.0    0.000000
        75.0    0.000000
Name: Parch, dtype: float64

In [17]:
age_groups_multi.mean().unstack()

Age,5.0,15.0,25.0,35.0,45.0,55.0,65.0,75.0,85.0
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2.0,0.863636,0.480769,0.25,0.209677,0.369565,0.761905,0.25,0.0
2,1.409091,0.310345,0.288889,0.265625,0.451613,0.235294,0.142857,0.0,
3,1.339286,0.445652,0.128713,0.395833,0.97619,0.0,0.0,0.0,


In [None]:
age_groups_mixed = titanic.groupby([(5 + 10*(titanic.Age//10)), "Pclass"])

In [None]:
titanic

In [None]:
5 + 10*(titanic.Age//10)

In [None]:
age_groups_mixed.Parch.mean()#.unstack()

In [None]:
print(age_groups_mixed.Parch.mean())

# Series output

In [42]:
class_groups = titanic.groupby("Pclass") # Nothing is calculated yet

In [53]:
n = titanic.groupby(["Pclass", "Sex"]).Age.mean()

In [56]:
dict(n)

{(1, 'female'): 37.037593984962406,
 (1, 'male'): 41.02927152317881,
 (2, 'female'): 27.499223300970876,
 (2, 'male'): 30.815379746835443,
 (3, 'female'): 22.185328947368422,
 (3, 'male'): 25.962263610315187}

In [77]:
a=titanic[(titanic.Sex == 'male') & (titanic.Pclass == 3)].fillna(0)


In [78]:
a

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,0,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,0,S
6,0.0,3,"Moran, Mr. James",male,0.0,0,0,330877,8.4583,0,Q
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,0,S
13,0.0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.0500,0,S
14,0.0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,0,S
17,0.0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.1250,0,Q
27,0.0,3,"Emir, Mr. Farred Chehab",male,0.0,0,0,2631,7.2250,0,C
30,0.0,3,"Todoroff, Mr. Lalio",male,0.0,0,0,349216,7.8958,0,S
37,1.0,3,"Mamee, Mr. Hanna",male,0.0,0,0,2677,7.2292,0,C


In [72]:
titanic[(titanic.Sex == 'male') & (titanic.Pclass == 3)]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
13,0.0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.0500,,S
14,0.0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,,S
17,0.0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.1250,,Q
27,0.0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
30,0.0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S
37,1.0,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C


In [43]:
titanic.Fare.describe()

count    1308.000000
mean       33.295479
std        51.758668
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: Fare, dtype: float64

In [44]:
class_groups.Fare.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,323.0,87.508992,80.447178,0.0,30.6958,60.0,107.6625,512.3292
2,277.0,21.179196,13.607122,0.0,13.0,15.0458,26.0,73.5
3,708.0,13.302889,11.494358,0.0,7.75,8.05,15.2458,69.55


In [45]:
class_groups.apply(lambda x: x.Fare.describe()) # Note column index name

Fare,count,mean,std,min,25%,50%,75%,max
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,323.0,87.508992,80.447178,0.0,30.6958,60.0,107.6625,512.3292
2,277.0,21.179196,13.607122,0.0,13.0,15.0458,26.0,73.5
3,708.0,13.302889,11.494358,0.0,7.75,8.05,15.2458,69.55


In [22]:
titanic.groupby("Sex").Parch.mean()

Sex
female    0.633047
male      0.247924
Name: Parch, dtype: float64

In [23]:
class_groups.apply(lambda x: x.groupby("Sex").Parch.mean()) # Note column index name

Sex,female,male
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.472222,0.27933
2,0.650943,0.192982
3,0.731481,0.255578


In [24]:
class_groups.apply(lambda x: x[x.Parch==0].groupby("Sex").size())

Sex,female,male
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,97,145
2,61,145
3,135,419


In [25]:
titanic[titanic.Parch!=0].groupby(["Sex", "Embarked"]).size()

Sex     Embarked
female  C            42
        Q             3
        S           128
male    C            32
        Q             6
        S            96
dtype: int64

In [26]:
class_groups.apply(lambda x: x[x.Parch!=0].groupby(["Sex", "Embarked"]).size())

Pclass  Sex     Embarked
1       female  C           21
                S           26
        male    C           18
                S           16
2       female  C            4
                S           41
        male    C            3
                S           23
3       female  C           17
                Q            3
                S           61
        male    C           11
                Q            6
                S           57
dtype: int64

In [27]:
result = (class_groups
          .apply(lambda x: x[x.Parch!=0].groupby(["Sex", "Embarked"]).size())
          .unstack([1,2]))

In [28]:
result

Sex,female,female,male,male,female,male
Embarked,C,S,C,S,Q,Q
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,21.0,26.0,18.0,16.0,,
2,4.0,41.0,3.0,23.0,,
3,17.0,61.0,11.0,57.0,3.0,6.0


In [29]:
result.loc[:, ("female", "C")]

Pclass
1    21.0
2     4.0
3    17.0
Name: (female, C), dtype: float64

In [30]:
for gi, group in class_groups:
    print(group)

             Survived  Pclass  \
PassengerId                     
2                 1.0       1   
4                 1.0       1   
7                 0.0       1   
12                1.0       1   
24                1.0       1   
28                0.0       1   
31                0.0       1   
32                1.0       1   
35                0.0       1   
36                0.0       1   
53                1.0       1   
55                0.0       1   
56                1.0       1   
62                1.0       1   
63                0.0       1   
65                0.0       1   
84                0.0       1   
89                1.0       1   
93                0.0       1   
97                0.0       1   
98                1.0       1   
103               0.0       1   
111               0.0       1   
119               0.0       1   
125               0.0       1   
137               1.0       1   
138               0.0       1   
140               0.0       1   
152       

In [31]:
result.loc[:, [("female", "C"), ("female", "S")]]

Sex,female,female
Embarked,C,S
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,21.0,26.0
2,4.0,41.0
3,17.0,61.0


In [32]:
result.iloc[:, [0, 1]]

Sex,female,female
Embarked,C,S
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,21.0,26.0
2,4.0,41.0
3,17.0,61.0


# DataFrame output

In [33]:
titanic[["SibSp", "Parch"]].head()

Unnamed: 0_level_0,SibSp,Parch
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,1,0
3,0,0
4,1,0
5,0,0


In [34]:
(titanic[titanic.Parch!=0].groupby(["Sex", "Embarked"])[["SibSp", "Parch"]]
        .mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,SibSp,Parch
Sex,Embarked,Unnamed: 2_level_1,Unnamed: 3_level_1
female,C,0.619048,1.380952
female,Q,0.333333,2.666667
female,S,1.304688,1.789062
male,C,0.5625,1.3125
male,Q,3.5,1.0
male,S,1.833333,1.677083


In [35]:
(class_groups
 .apply(lambda x: x[x.Parch!=0].groupby(["Sex", "Embarked"])[["SibSp", "Parch"]]
        .mean()))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SibSp,Parch
Pclass,Sex,Embarked,Unnamed: 3_level_1,Unnamed: 4_level_1
1,female,C,0.52381,1.285714
1,female,S,0.807692,1.576923
1,male,C,0.444444,1.388889
1,male,S,0.875,1.5625
2,female,C,1.0,1.75
2,female,S,0.682927,1.512195
2,male,C,0.666667,1.666667
2,male,S,0.869565,1.217391
3,female,C,0.647059,1.411765
3,female,Q,0.333333,2.666667


## Mixing group keys

In [36]:
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [40]:
?titanic.set_index

In [41]:
titanic_idx = titanic.reset_index().set_index((5 + 10*(titanic.Age//10)))

In [39]:
titanic_idx.head()

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
25.0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
35.0,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
25.0,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
35.0,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
35.0,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
titanic_idx.groupby([pd.Grouper(level=0), "Pclass"]).Parch.mean()

In [None]:
titanic_idx.groupby([titanic_idx.index, "Pclass"]).Parch.mean()

# pd.Grouper

In [None]:
class_emb_groups = titanic.groupby([pd.Grouper("Embarked"), "Pclass"])

In [None]:
class_emb_groups.size()

# Join operations

In [None]:
a = pd.DataFrame(np.arange(8).reshape((4,2)),
                 columns=["a", "b"],
                 index=["a", "b", "a", "b"])
b = pd.DataFrame(10 + np.arange(4).reshape((4,-1)),
                 columns=["d"],
                 index=["d", "b", "c", "b"])

In [None]:
print(a)
print(b)

In [None]:
a.join(b) # default is left join

In [None]:
a.join(b, how="inner")

In [None]:
print(a)
print(b)

In [None]:
a.join(b, how="right")

In [None]:
a.join(b, how="outer", on="")

In [None]:
c = pd.DataFrame(np.arange(8).reshape((4,2)),
                 columns=["a", "b"],
                 index=pd.MultiIndex.from_tuples([("a", "A"), ("b", "E"), ("a", "Y"), ("b", "R")], names=("lower", "upper")))

In [None]:
c

In [None]:
a.join?

In [None]:
c.join(a[a.a<10], on="lower", rsuffix="_right")

In [82]:
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [83]:
titanic.replace(re.compile(r'\s+\(.*\)'), '')

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1.0,1,"Cumings, Mrs. John Bradley",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath",female,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
9,1.0,3,"Johnson, Mrs. Oscar W",female,27.0,0,2,347742,11.1333,,S
10,1.0,2,"Nasser, Mrs. Nicholas",female,14.0,1,0,237736,30.0708,,C


In [89]:
family_names = (titanic
                .replace(re.compile(r'\s+\(.*\)'), '')
                .replace(re.compile("Mrs."), "Mr."))[["Name", "Sex"]]

In [90]:
titanic.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [91]:
family_names.head()

Unnamed: 0_level_0,Name,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"Braund, Mr. Owen Harris",male
2,"Cumings, Mr. John Bradley",female
3,"Heikkinen, Miss. Laina",female
4,"Futrelle, Mr. Jacques Heath",female
5,"Allen, Mr. William Henry",male


In [92]:
family_names = family_names[(family_names.Sex=="female") & family_names.Name.str.contains("Mr.")]

In [93]:
family_names.head()

Unnamed: 0_level_0,Name,Sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
2,"Cumings, Mr. John Bradley",female
4,"Futrelle, Mr. Jacques Heath",female
9,"Johnson, Mr. Oscar W",female
10,"Nasser, Mr. Nicholas",female
16,"Hewlett, Mr.",female


In [94]:
family_names.reset_index().set_index("Name")#["PassengerId"]

Unnamed: 0_level_0,PassengerId,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Cumings, Mr. John Bradley",2,female
"Futrelle, Mr. Jacques Heath",4,female
"Johnson, Mr. Oscar W",9,female
"Nasser, Mr. Nicholas",10,female
"Hewlett, Mr.",16,female
"Vander Planke, Mr. Julius",19,female
"Masselmani, Mr. Fatima",20,female
"Asplund, Mr. Carl Oscar",26,female
"Spencer, Mr. William Augustus",32,female
"Ahlin, Mr. Johan",41,female


In [95]:
titanic = (titanic.join(family_names
                        .reset_index()
                        .set_index("Name")["PassengerId"],
                        on="Name", rsuffix="_Spouse"))

In [96]:
titanic.head(25)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassengerId
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,
6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,
7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,
9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,
10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,


In [97]:
titanic.rename({"PassengerId":"PassengerId_Spouse"}, axis=1, inplace=True)

In [98]:
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassengerId_Spouse
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [None]:
?titanic.join

In [99]:
titanic = titanic.join(titanic[["Name", "Age"]],
                       on="PassengerId_Spouse", rsuffix="_Spouse")

In [100]:
titanic

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassengerId_Spouse,Name_Spouse,Age_Spouse
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,,,
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,,,
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,,,
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,,,
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,,,
6,0.0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,,,
7,0.0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,,,
8,0.0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S,,,
9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,,,
10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,,,


In [101]:
(titanic[titanic.PassengerId_Spouse.notnull()]
 [["Pclass", "Name", "Age", "Name_Spouse", "Age_Spouse"]])

Unnamed: 0_level_0,Pclass,Name,Age,Name_Spouse,Age_Spouse
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14,3,"Andersson, Mr. Anders Johan",39.0,"Andersson, Mrs. Anders Johan (Alfrida Konstant...",39.0
35,1,"Meyer, Mr. Edgar Joseph",28.0,"Meyer, Mrs. Edgar Joseph (Leila Saks)",
36,1,"Holverson, Mr. Alexander Oskar",42.0,"Holverson, Mrs. Alexander Oskar (Mary Aline To...",35.0
63,1,"Harris, Mr. Henry Birkhardt",45.0,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",35.0
93,1,"Chaffee, Mr. Herbert Fuller",46.0,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",47.0
100,2,"Kantor, Mr. Sinai",34.0,"Kantor, Mrs. Sinai (Miriam Sternin)",24.0
118,2,"Turpin, Mr. William John Robert",29.0,"Turpin, Mrs. William John Robert (Dorothy Ann ...",27.0
123,2,"Nasser, Mr. Nicholas",32.5,"Nasser, Mrs. Nicholas (Adele Achem)",14.0
138,1,"Futrelle, Mr. Jacques Heath",37.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
189,3,"Bourke, Mr. John",40.0,"Bourke, Mrs. John (Catherine)",32.0


In [102]:
titanic.Pclass.value_counts()

3    709
1    323
2    277
Name: Pclass, dtype: int64

In [103]:
titanic[titanic.PassengerId_Spouse.notnull()].groupby("Pclass").size()

Pclass
1    41
2    23
3    22
dtype: int64

In [104]:
titanic.Age - titanic.Age_Spouse

PassengerId
1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
8       NaN
9       NaN
10      NaN
11      NaN
12      NaN
13      NaN
14      0.0
15      NaN
16      NaN
17      NaN
18      NaN
19      NaN
20      NaN
21      NaN
22      NaN
23      NaN
24      NaN
25      NaN
26      NaN
27      NaN
28      NaN
29      NaN
30      NaN
       ... 
1280    NaN
1281    NaN
1282    NaN
1283    NaN
1284    NaN
1285    NaN
1286    3.0
1287    NaN
1288    NaN
1289    NaN
1290    NaN
1291    NaN
1292    NaN
1293    NaN
1294    NaN
1295    NaN
1296    NaN
1297    NaN
1298    NaN
1299    0.0
1300    NaN
1301    NaN
1302    NaN
1303    NaN
1304    NaN
1305    NaN
1306    NaN
1307    NaN
1308    NaN
1309    NaN
Length: 1309, dtype: float64

In [105]:
(titanic.Age - titanic.Age_Spouse).groupby(titanic.Pclass).agg(["min", "max", "mean", "std", "count"])

Unnamed: 0_level_0,min,max,mean,std,count
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,-40.0,14.0,2.805556,9.211234,36
2,-2.0,20.0,6.195652,6.793612,23
3,-4.0,12.0,3.472222,3.798112,18


In [106]:
titanic.loc[742]

Survived                                                              0
Pclass                                                                1
Name                                      Cavendish, Mr. Tyrell William
Sex                                                                male
Age                                                                  36
SibSp                                                                 1
Parch                                                                 0
Ticket                                                            19877
Fare                                                              78.85
Cabin                                                               C46
Embarked                                                              S
PassengerId_Spouse                                                  988
Name_Spouse           Cavendish, Mrs. Tyrell William (Julia Florence...
Age_Spouse                                                      

In [107]:
(titanic.Age - titanic.Age_Spouse).sort_values()

PassengerId
742    -40.0
249    -10.0
725     -6.0
861     -4.0
622     -3.0
1082    -2.0
993     -2.0
315     -2.0
1170    -1.0
93      -1.0
207     -1.0
646     -1.0
1037     0.0
638      0.0
391      0.0
371      0.0
14       0.0
1299     0.0
1152     0.5
1245     1.0
1179     1.0
1144     1.0
1126     1.0
1064     1.0
912      1.0
749      1.0
506      1.0
118      2.0
309      2.0
549      2.0
        ... 
1278     NaN
1279     NaN
1280     NaN
1281     NaN
1282     NaN
1283     NaN
1284     NaN
1285     NaN
1287     NaN
1288     NaN
1289     NaN
1290     NaN
1291     NaN
1292     NaN
1293     NaN
1294     NaN
1295     NaN
1296     NaN
1297     NaN
1298     NaN
1300     NaN
1301     NaN
1302     NaN
1303     NaN
1304     NaN
1305     NaN
1306     NaN
1307     NaN
1308     NaN
1309     NaN
Length: 1309, dtype: float64

In [108]:
FTS_COLS = titanic.columns[1:-3].tolist()
FTS_COLS.remove("Cabin")

In [109]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 14 columns):
Survived              891 non-null float64
Pclass                1309 non-null int64
Name                  1309 non-null object
Sex                   1309 non-null object
Age                   1046 non-null float64
SibSp                 1309 non-null int64
Parch                 1309 non-null int64
Ticket                1309 non-null object
Fare                  1308 non-null float64
Cabin                 295 non-null object
Embarked              1307 non-null object
PassengerId_Spouse    86 non-null float64
Name_Spouse           86 non-null object
Age_Spouse            77 non-null float64
dtypes: float64(5), int64(3), object(6)
memory usage: 193.4+ KB


In [110]:
titanic[FTS_COLS].isnull().any(axis=1).sum()

266

In [111]:
titanic[(titanic.Age - titanic.Age_Spouse)<0][["Pclass", "Name", "Age", "Name_Spouse", "Age_Spouse"]]

Unnamed: 0_level_0,Pclass,Name,Age,Name_Spouse,Age_Spouse
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
93,1,"Chaffee, Mr. Herbert Fuller",46.0,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",47.0
207,3,"Backstrom, Mr. Karl Alfred",32.0,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu...",33.0
249,1,"Beckwith, Mr. Richard Leonard",37.0,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",47.0
315,2,"Hart, Mr. Benjamin",43.0,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",45.0
622,1,"Kimball, Mr. Edwin Nelson Jr",42.0,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",45.0
646,1,"Harper, Mr. Henry Sleeper",48.0,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",49.0
725,1,"Chambers, Mr. Norman Campbell",27.0,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",33.0
742,1,"Cavendish, Mr. Tyrell William",36.0,"Cavendish, Mrs. Tyrell William (Julia Florence...",76.0
861,3,"Hansen, Mr. Claus Peter",41.0,"Hansen, Mrs. Claus Peter (Jennie L Howard)",45.0
993,2,"Weisz, Mr. Leopold",27.0,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",29.0
