In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
# `sns.load_dataset` returns the example dataset with the given name as a
# pandas DataFrame; every later cell reads from this `df`, and some of them
# add columns to it.
# NOTE(review): `load_dataset` is expected to download the data on first use —
# confirm network access before relying on Restart & Run All.
df = sns.load_dataset("titanic")

In [4]:
# Single-bracket selection with one column label returns that column as a Series.
df["fare"]

0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: fare, Length: 891, dtype: float64

In [5]:
# Passing a list of labels (note the double brackets) returns a DataFrame
# holding just those columns, in the order listed.
df[["sex", "age", "fare"]]

Unnamed: 0,sex,age,fare
0,male,22.0,7.2500
1,female,38.0,71.2833
2,female,26.0,7.9250
3,female,35.0,53.1000
4,male,35.0,8.0500
...,...,...,...
886,male,27.0,13.0000
887,female,19.0,30.0000
888,female,,23.4500
889,male,26.0,30.0000


In [6]:
# Positional slice: `.iloc[0:5]` takes the rows at positions 0-4; the stop
# position (5) is excluded, so exactly five rows come back.
df.iloc[0:5]  # first 5 rows

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
# Label-based slice: unlike `.iloc`, `.loc[0:5]` includes the end label, so this
# returns the six rows labelled 0-5, restricted to the "sex" and "age" columns.
df.loc[0:5, ["sex", "age"]]

Unnamed: 0,sex,age
0,male,22.0
1,female,38.0
2,female,26.0
3,female,35.0
4,male,35.0
5,male,


The difference between `.iloc` and `.loc` in pandas lies in how they access rows and columns in a DataFrame:

1. **`.iloc`**:
    - Stands for **integer-location based indexing**.
    - Accesses rows and columns by their **integer positions** (zero-based index).
    - Example: `df.iloc[0:5, 1:3]` selects rows 0 to 4 and columns 1 to 2 (the end position of each slice is excluded).

2. **`.loc`**:
    - Stands for **label-based indexing**.
    - Accesses rows and columns by their **labels** or boolean conditions.
    - Example: `df.loc[0:5, ['sex', 'age']]` selects rows with labels 0 to 5 and columns `'sex'` and `'age'`. Unlike `.iloc`, a `.loc` slice includes its end label, so this returns six rows.

In summary:
- Use `.iloc` when you want to select by position.
- Use `.loc` when you want to select by labels or conditions.

In [8]:
# Boolean mask: keep passengers older than 30. A missing age compares as False,
# so those rows are left out of the result.
df.loc[df["age"].gt(30)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,0,3,male,47.0,0,0,9.0000,S,Third,man,True,,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False


In [9]:
# Female passengers whose fare exceeds 50: both conditions must hold for a row
# to be kept.
df.loc[df["fare"].gt(50) & df["sex"].eq("female")]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
31,1,1,female,,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
52,1,1,female,49.0,1,0,76.7292,C,First,woman,False,D,Cherbourg,yes,False
61,1,1,female,38.0,0,0,80.0000,,First,woman,False,B,,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,1,1,female,,1,0,89.1042,C,First,woman,False,C,Cherbourg,yes,False
856,1,1,female,45.0,1,1,164.8667,S,First,woman,False,,Southampton,yes,False
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False


In [10]:
# Keep only the rows where the embarkation port is recorded (not missing).
df[df["embarked"].notna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [11]:
# Membership test: keep passengers whose `embarked` value is "S" or "C".
# Rows with any other value, or with no value at all, are excluded.
df.loc[df["embarked"].isin(["S", "C"])]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False


In [12]:
# Drop the rows whose age is missing; only the "age" column is inspected, so
# gaps in other columns do not cause a row to be removed.
df.dropna(subset=["age"])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [13]:
# Passengers younger than 10 whose `pclass` is anything other than 3.
df.loc[df["age"].lt(10) & df["pclass"].ne(3)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
43,1,2,female,3.0,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
58,1,2,female,5.0,1,2,27.75,S,Second,child,False,,Southampton,yes,False
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False
183,1,2,male,1.0,2,1,39.0,S,Second,child,False,F,Southampton,yes,False
193,1,2,male,3.0,1,1,26.0,S,Second,child,False,F,Southampton,yes,False
237,1,2,female,8.0,0,2,26.25,S,Second,child,False,,Southampton,yes,False
297,0,1,female,2.0,1,2,151.55,S,First,child,False,C,Southampton,no,False
305,1,1,male,0.92,1,2,151.55,S,First,child,False,C,Southampton,yes,False
340,1,2,male,2.0,1,1,26.0,S,Second,child,False,F,Southampton,yes,False
407,1,2,male,3.0,1,1,18.75,S,Second,child,False,,Southampton,yes,False


In [16]:
# Derived feature: the square of each passenger's age. Where the age is
# missing the result stays NaN.
df["age_squared"] = df["age"].pow(2)
df["age_squared"]

0       484.0
1      1444.0
2       676.0
3      1225.0
4      1225.0
        ...  
886     729.0
887     361.0
888       NaN
889     676.0
890    1024.0
Name: age_squared, Length: 891, dtype: float64

In [18]:
# Derived feature: fare divided by (sibsp + 1). The +1 presumably counts the
# passenger themselves; it also avoids a division by zero when `sibsp` is 0.
df["fare_per_sibsp"] = df["fare"].div(df["sibsp"].add(1))
df["fare_per_sibsp"]

0       3.62500
1      35.64165
2       7.92500
3      26.55000
4       8.05000
         ...   
886    13.00000
887    30.00000
888    11.72500
889    30.00000
890     7.75000
Name: fare_per_sibsp, Length: 891, dtype: float64

In [20]:
# Normalise the column names: lower-case them and swap spaces for underscores.
# The space is matched literally (regex=False). The column headers in the
# earlier outputs are already lower-case without spaces, so for this dataset
# the names come out unchanged.
df.columns = (
    df.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
)
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_squared,fare_per_sibsp
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,484.0,3.62500
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1444.0,35.64165
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,676.0,7.92500
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,1225.0,26.55000
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,1225.0,8.05000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,729.0,13.00000
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,361.0,30.00000
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,,11.72500
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,676.0,30.00000


In [29]:
# `rename` returns a new frame with "fare" shown as "ticket_fare"; `df` itself
# is left untouched, which the later cells rely on when they read df["fare"].
# NOTE(review): the saved output of this cell lists a `high_fare` column that
# is only created in a later cell, so it came from an out-of-order run —
# refresh it with Restart Kernel -> Run All.
df.rename(columns={"fare": "ticket_fare"})

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket_fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_squared,fare_per_sibsp,high_fare
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,484.0,3.62500,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1444.0,35.64165,True
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,676.0,7.92500,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,1225.0,26.55000,True
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,1225.0,8.05000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,729.0,13.00000,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,361.0,30.00000,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,,11.72500,True
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,676.0,30.00000,True


In [31]:
# Median of the "fare" column; the next cell uses it as the cut-off for the
# `high_fare` flag.
median_fare = df["fare"].median()
median_fare

14.4542

In [32]:
# Boolean flag: True where the fare is strictly greater than the median fare,
# False otherwise (a fare equal to the median is not flagged).
df["high_fare"] = df["fare"].gt(median_fare)
df["high_fare"]

0      False
1       True
2      False
3       True
4      False
       ...  
886    False
887     True
888     True
889     True
890    False
Name: high_fare, Length: 891, dtype: bool

## New Column: high_fare

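The `high_fare` column is `True` for passengers whose fare is strictly above the median fare (14.4542) and `False` otherwise, so it separates passengers who paid **above-median ticket prices** from the rest.  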
This makes it easy to compare survival rates, passenger class, or gender breakdowns between higher-paying and lower-paying groups.  

For example, I can now explore:

- Were high-fare passengers more likely to survive?
- Does `high_fare` correlate with 1st class tickets?
- Are certain embarkation points more common among high-fare passengers?