In [1]:
import pandas as pd

# Load the real Titanic dataset from a public URL
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Let's see what we are dealing with
print("Dataset Shape:", df.shape)
print("\nMissing Values:")
print(df.isnull().sum())
print("\nFirst 5 rows:")
print(df.head())

Dataset Shape: (891, 12)

Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ti

In [2]:
# We group by 'Pclass' and look at the 'Survived' column
# We take the .mean() because the mean of 0s and 1s is the % of survival
pattern = df.groupby('Pclass')['Survived'].mean()
print(pattern)

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64


In [3]:
# Finding the pattern for Sex and Class together
pivot_pattern = df.groupby(['Sex', 'Pclass'])['Survived'].mean()
print(pivot_pattern)

Sex     Pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: Survived, dtype: float64


In [None]:
Your Task:

Use .groupby() on the SibSp column.

Calculate the .mean() of Survived.

Look at the result: Do people with 0 siblings survive more or less than people with 1 or 2 siblings?

In [4]:
pattern = df.groupby(['SibSp'])['Survived'].mean()
print(pattern)

SibSp
0    0.345395
1    0.535885
2    0.464286
3    0.250000
4    0.166667
5    0.000000
8    0.000000
Name: Survived, dtype: float64


In [5]:
# We only correlate numbers. 
# 'Survived' is our target.
correlations = df.corr(numeric_only=True)['Survived'].sort_values(ascending=False)
print(correlations)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64


In [None]:
Your Task:
Calculate $Q1$ and $Q3$:Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
Calculate the IQR = Q3 - Q1.
Calculate the upper_bound = Q3 + (1.5 * IQR).
Count how many people paid more than the upper bound: len(df[df['Fare'] > upper_bound])

In [8]:
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + (1.5 * IQR)
rich = len(df[df['Fare'] > upper_bound])
print(rich)

116


In [9]:
# Create a new column 'FamilySize'
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1 # +1 for themselves

# Create a pattern check
print(df.groupby('FamilySize')['Survived'].mean())

FamilySize
1     0.303538
2     0.552795
3     0.578431
4     0.724138
5     0.200000
6     0.136364
7     0.333333
8     0.000000
11    0.000000
Name: Survived, dtype: float64


In [11]:
# If family size is 1, they are alone (1) else (0)
df['IsAlone'] = 0
alone = df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1
print(alone)

1


In [12]:
# 1. Define the 'bins' (the cut-off points)
# 0-12: Child, 12-18: Teen, 18-60: Adult, 60-80: Senior
bins = [0, 12, 18, 60, 80]
labels = ['Child', 'Teen', 'Adult', 'Senior']

# 2. Create the new column
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

# 3. Check the pattern!
print(df.groupby('AgeGroup')['Survived'].mean())

AgeGroup
Child     0.579710
Teen      0.428571
Adult     0.388788
Senior    0.227273
Name: Survived, dtype: float64


  print(df.groupby('AgeGroup')['Survived'].mean())


In [13]:
df['Title'] = df['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0].str.strip()

In [14]:
new_title = df['Title'].value_counts()

In [16]:
meena = df.groupby('Title')['Survived'].mean()
print(meena)

Title
Capt            0.000000
Col             0.500000
Don             0.000000
Dr              0.428571
Jonkheer        0.000000
Lady            1.000000
Major           0.500000
Master          0.575000
Miss            0.697802
Mlle            1.000000
Mme             1.000000
Mr              0.156673
Mrs             0.792000
Ms              1.000000
Rev             0.000000
Sir             1.000000
the Countess    1.000000
Name: Survived, dtype: float64


In [17]:
# See how Title and Sex overlap
print(pd.crosstab(df['Title'], df['Sex']))

Sex           female  male
Title                     
Capt               0     1
Col                0     2
Don                0     1
Dr                 1     6
Jonkheer           0     1
Lady               1     0
Major              0     2
Master             0    40
Miss             182     0
Mlle               2     0
Mme                1     0
Mr                 0   517
Mrs              125     0
Ms                 1     0
Rev                0     6
Sir                0     1
the Countess       1     0


In [18]:
# Replace rare titles with 'Rare'
df['Title'] = df['Title'].replace(['Lady', 'the Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

# Fix synonyms
df['Title'] = df['Title'].replace('Mlle', 'Miss')
df['Title'] = df['Title'].replace('Ms', 'Miss')
df['Title'] = df['Title'].replace('Mme', 'Mrs')

# Now look at the clean pattern
print(df.groupby('Title')['Survived'].mean())

Title
Master    0.575000
Miss      0.702703
Mr        0.156673
Mrs       0.793651
Rare      0.347826
Name: Survived, dtype: float64


In [None]:
Your Task:

Find out how many people paid a fare of 0: len(df[df['Fare'] == 0]).

Look at their names or ages: df[df['Fare'] == 0].

The "Why" Question: Who are these people? Are they crew members, or did they get free tickets? And most importantly, did they survive?

In [30]:
free = len(df[df['Fare'] == 0])
# Returns a DataFrame with Name and Age for everyone with Fare == 0
free_loaders_info = df.loc[df['Fare'] == 0, ['Name', 'Age']]
print(free_loaders_info)

                                 Name   Age
179               Leonard, Mr. Lionel  36.0
263             Harrison, Mr. William  40.0
271      Tornquist, Mr. William Henry  25.0
277       Parkes, Mr. Francis "Frank"   NaN
302   Johnson, Mr. William Cahoone Jr  19.0
413    Cunningham, Mr. Alfred Fleming   NaN
466             Campbell, Mr. William   NaN
481  Frost, Mr. Anthony Wood "Archie"   NaN
597               Johnson, Mr. Alfred  49.0
633     Parr, Mr. William Henry Marsh   NaN
674        Watson, Mr. Ennis Hastings   NaN
732              Knight, Mr. Robert J   NaN
806            Andrews, Mr. Thomas Jr  39.0
815                  Fry, Mr. Richard   NaN
822   Reuchlin, Jonkheer. John George  38.0


In [31]:
# Create a column that is True if Age is missing
df['Age_is_Missing'] = df['Age'].isnull()

# See the percentage of missing ages per Class
print(df.groupby('Pclass')['Age_is_Missing'].mean())

Pclass
1    0.138889
2    0.059783
3    0.276986
Name: Age_is_Missing, dtype: float64


In [32]:
# Create a pivot table to see survival by AgeGroup and Pclass
survival_matrix = df.pivot_table(values='Survived', index='AgeGroup', columns='Pclass')
print(survival_matrix)

Pclass           1         2         3
AgeGroup                              
Child     0.750000  1.000000  0.416667
Teen      0.916667  0.500000  0.282609
Adult     0.673077  0.418440  0.199219
Senior    0.214286  0.333333  0.200000


  survival_matrix = df.pivot_table(values='Survived', index='AgeGroup', columns='Pclass')


In [33]:
# 'Embarked' is the port: C = Cherbourg, Q = Queenstown, S = Southampton
print(df.groupby('Embarked')['Survived'].mean())

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


In [34]:
print(df.groupby('Embarked')['Fare'].mean())

Embarked
C    59.954144
Q    13.276030
S    27.079812
Name: Fare, dtype: float64


In [35]:
# Create a feature: 1 if they had a Cabin recorded, 0 if it was NaN (missing)
df['HasCabin'] = df['Cabin'].apply(lambda x: 0 if pd.isna(x) else 1)

# Check the survival pattern
print(df.groupby('HasCabin')['Survived'].mean())


HasCabin
0    0.299854
1    0.666667
Name: Survived, dtype: float64
