In [1]:
import pandas as pd

titanic = pd.read_csv('Titanic Dataset.csv')

# Quick check
print(titanic.shape)    # (rows, columns)
print(titanic.head())   # First few rows

(1309, 14)
   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

     age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.00      0      0   24160  211.3375       B5        S    2    NaN   
1   0.92      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.00      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.00      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St L

In [3]:
# Get one column (returns a Series - like a single column)
names = titanic['name']
print(names.head())

# Also works with dot notation (if no spaces in column name)
ages = titanic.age
print(ages.head())


0                      Allen, Miss. Elisabeth Walton
1                     Allison, Master. Hudson Trevor
2                       Allison, Miss. Helen Loraine
3               Allison, Mr. Hudson Joshua Creighton
4    Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
Name: name, dtype: object
0    29.00
1     0.92
2     2.00
3    30.00
4    25.00
Name: age, dtype: float64


In [5]:
# Select multiple columns (returns a DataFrame)
subset = titanic[['name', 'age', 'sex', 'survived']]
print(subset.head())

# Note the double brackets: [[ ]]
# Outer brackets = "select from titanic"
# Inner brackets = list of columns

                                              name    age     sex  survived
0                    Allen, Miss. Elisabeth Walton  29.00  female         1
1                   Allison, Master. Hudson Trevor   0.92    male         1
2                     Allison, Miss. Helen Loraine   2.00  female         0
3             Allison, Mr. Hudson Joshua Creighton  30.00    male         0
4  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  25.00  female         0


In [7]:
# Just passenger info
passenger_info = titanic[['name', 'sex', 'age']]

# Just survival data
survival_data = titanic[['name', 'survived', 'pclass']]

# You can then work with these smaller DataFrames
print(passenger_info.info())
print(survival_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    1309 non-null   object 
 1   sex     1309 non-null   object 
 2   age     1046 non-null   float64
dtypes: float64(1), object(2)
memory usage: 30.8+ KB
None
          survived       pclass
count  1309.000000  1309.000000
mean      0.381971     2.294882
std       0.486055     0.837836
min       0.000000     1.000000
25%       0.000000     2.000000
50%       0.000000     3.000000
75%       1.000000     3.000000
max       1.000000     3.000000


In [9]:
# Create a condition (True/False for each row)
survived_condition = titanic['survived'] == 1

# Use condition to filter
survivors = titanic[survived_condition]

# Or combine in one line
survivors = titanic[titanic['survived'] == 1]

print(f"Total passengers: {len(titanic)}")
print(f"Survivors: {len(survivors)}")

Total passengers: 1309
Survivors: 500


In [11]:
# Step 1: Create a True/False mask
mask = titanic['age'] > 30
print(mask.head())  # Shows True, False, True, False, etc.

# Step 2: Use mask to filter
older_passengers = titanic[mask]

# The DataFrame keeps only rows where mask is True
print(older_passengers[['name', 'age']].head())

0    False
1    False
2    False
3    False
4    False
Name: age, dtype: bool
                                            name   age
5                            Anderson, Mr. Harry  48.0
6              Andrews, Miss. Kornelia Theodosia  63.0
7                         Andrews, Mr. Thomas Jr  39.0
8  Appleton, Mrs. Edward Dale (Charlotte Lamson)  53.0
9                        Artagaveytia, Mr. Ramon  71.0


In [17]:
# Sort by age (youngest first)
sorted_by_age = titanic.sort_values('age')
print(sorted_by_age[['name', 'age']].head())

# Sort by age (oldest first)
sorted_by_age_desc = titanic.sort_values('age', ascending=False)
print(sorted_by_age_desc[['name', 'age']].head())

# Sort by multiple columns
sorted_multiple = titanic.sort_values(['pclass', 'fare'], 
                                       ascending=[True, False])
# First by class (low to high), then by fare (high to low)
print(sorted_multiple[['pclass', 'fare', 'name', 'survived']].head())

                                         name   age
763   Dean, Miss. Elizabeth Gladys "Millvina"  0.17
747   Danbom, Master. Gilbert Sigvard Emanuel  0.33
1240          Thomas, Master. Assad Alexander  0.42
427                 Hamalainen, Master. Viljo  0.67
1111           Peacock, Master. Alfred Edward  0.75
                                                   name   age
14                 Barkworth, Mr. Algernon Henry Wilson  80.0
61    Cavendish, Mrs. Tyrell William (Julia Florence...  76.0
1235                                Svensson, Mr. Johan  74.0
135                           Goldschmidt, Mr. George B  71.0
9                               Artagaveytia, Mr. Ramon  71.0
     pclass      fare                                               name  \
49        1  512.3292                 Cardeza, Mr. Thomas Drake Martinez   
50        1  512.3292  Cardeza, Mrs. James Warburton Martinez (Charlo...   
183       1  512.3292                             Lesurer, Mr. Gustave J   
302       1 

In [19]:
# Who were the youngest passengers?
youngest = titanic.sort_values('age')[['name', 'age']].head(10)

# Who paid the highest fares?
highest_fares = titanic.sort_values('fare', ascending=False)[['name', 'fare']].head(10)

# Organize by class, then by fare within each class
organized = titanic.sort_values(['pclass', 'fare'], 
                                 ascending=[True, False])
print(organized[['name', 'pclass', 'fare']].head(20))

                                                  name  pclass      fare
49                  Cardeza, Mr. Thomas Drake Martinez       1  512.3292
50   Cardeza, Mrs. James Warburton Martinez (Charlo...       1  512.3292
183                             Lesurer, Mr. Gustave J       1  512.3292
302                                   Ward, Miss. Anna       1  512.3292
111                     Fortune, Miss. Alice Elizabeth       1  263.0000
112                         Fortune, Miss. Ethel Flora       1  263.0000
113                         Fortune, Miss. Mabel Helen       1  263.0000
114                     Fortune, Mr. Charles Alexander       1  263.0000
115                                  Fortune, Mr. Mark       1  263.0000
116                Fortune, Mrs. Mark (Mary McDougald)       1  263.0000
35                            Bowen, Miss. Grace Scott       1  262.3750
66                         Chaudanson, Miss. Victorine       1  262.3750
249                        Ryerson, Master. John Bo

In [23]:
# Basic statistics on a column
print("Average age:", titanic['age'].mean())
print("Minimum fare:", titanic['fare'].min())
print("Maximum fare:", titanic['fare'].max())
print("Total passengers:", len(titanic))

# Count non-missing values
print("Passengers with known age:", titanic['age'].count())

# Standard deviation
print("Age std deviation:", titanic['age'].std())

Average age: 29.881137667304014
Minimum fare: 0.0
Maximum fare: 512.3292
Total passengers: 1309
Passengers with known age: 1046
Age std deviation: 14.413493211271337


In [25]:
# Average age of survivors
survivors = titanic[titanic['survived'] == 1]
print("Average survivor age:", survivors['age'].mean())

# Average fare by class
first_class = titanic[titanic['pclass'] == 1]
print("Average first class fare:", first_class['fare'].mean())

third_class = titanic[titanic['pclass'] == 3]
print("Average third class fare:", third_class['fare'].mean())

# Survival rate (percentage)
survival_rate = titanic['survived'].mean() * 100
print(f"Survival rate: {survival_rate:.1f}%")


Average survivor age: 28.91824355971897
Average first class fare: 87.50899164086688
Average third class fare: 13.302888700564973
Survival rate: 38.2%


In [27]:
# How many passengers in each class?
print(titanic['pclass'].value_counts())
# Output:
# 3    709
# 1    323
# 2    277

# Gender distribution
print(titanic['sex'].value_counts())
# Survival counts
print(titanic['survived'].value_counts())
# 0 = did not survive
# 1 = survived

pclass
3    709
1    323
2    277
Name: count, dtype: int64
sex
male      843
female    466
Name: count, dtype: int64
survived
0    809
1    500
Name: count, dtype: int64


In [29]:
# Pattern 1: Filter → Count
num_first_class = len(titanic[titanic['pclass'] == 1])

# Pattern 2: Filter → Calculate statistic
avg_survivor_age = titanic[titanic['survived'] == 1]['age'].mean()

# Pattern 3: Filter → Select → Sort → Display
titanic[titanic['survived'] == 1][['name', 'age']].sort_values('age').head()

# Pattern 4: Value distribution
titanic['pclass'].value_counts()

# Pattern 5: Overall statistics
titanic['fare'].describe()

count    1308.000000
mean       33.295479
std        51.758668
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: fare, dtype: float64

In [31]:
# Get female survivors, show name/age/class, sort by age
result = titanic[
    (titanic['sex'] == 'female') & 
    (titanic['survived'] == 1)
][['name', 'age', 'pclass']].sort_values('age')

print(result.head(10))

# Breaking it down:
# 1. Filter: female survivors
# 2. Select: name, age, pclass columns
# 3. Sort: by age
# 4. Display: first 10 rows


                                         name   age  pclass
763   Dean, Miss. Elizabeth Gladys "Millvina"  0.17       3
657                    Baclini, Miss. Eugenie  0.75       3
658             Baclini, Miss. Helene Barbara  0.75       3
590                     West, Miss. Barbara J  0.92       2
895              Johnson, Miss. Eleanor Ileen  1.00       3
1187          Sandstrom, Miss. Beatrice Irene  1.00       3
478                     Laroche, Miss. Louise  1.00       2
1048              Nakid, Miss. Maria ("Mary")  1.00       3
540                  Quick, Miss. Phyllis May  2.00       2
866                  Hirvonen, Miss. Hildur E  2.00       3


In [33]:
survival_by_class = titanic.groupby('pclass')['survived'].mean() * 100
print(survival_by_class)

pclass
1    61.919505
2    42.960289
3    25.528914
Name: survived, dtype: float64


In [49]:
# Pattern 1: Filter → Count
num_first_class = len(titanic[titanic['pclass'] == 1])
print(f"Number of persons in first class: {num_first_class}")

# Pattern 2: Filter → Calculate statistic
avg_survivor_age = titanic[titanic['survived'] == 1]['age'].mean()
print(f"Avg Survivor Age: {avg_survivor_age}")

# Pattern 3: Filter → Select → Sort → Display
print(titanic[titanic['survived'] == 1][['name', 'age']].sort_values('age').head())

# Pattern 4: Value distribution
print(titanic['pclass'].value_counts())

# Pattern 5: Overall statistics
titanic['fare'].describe()

Number of persons in first class: 323
Avg Survivor Age: 28.91824355971897
                                         name   age
763   Dean, Miss. Elizabeth Gladys "Millvina"  0.17
1240          Thomas, Master. Assad Alexander  0.42
427                 Hamalainen, Master. Viljo  0.67
657                    Baclini, Miss. Eugenie  0.75
658             Baclini, Miss. Helene Barbara  0.75
pclass
3    709
1    323
2    277
Name: count, dtype: int64


count    1308.000000
mean       33.295479
std        51.758668
min         0.000000
25%         7.895800
50%        14.454200
75%        31.275000
max       512.329200
Name: fare, dtype: float64