In [1]:
from io import StringIO
from pathlib import Path

import pandas as pd
import requests

In [2]:
# Download the passenger-list page. An explicit timeout is passed because
# requests applies none by default, so an unresponsive host would otherwise
# hang the cell (and any Restart & Run All) indefinitely.
titanic_request = requests.get(
    "https://titanicfacts.net/titanic-passenger-list",
    timeout=30,
)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as data.
titanic_request.raise_for_status()

In [3]:
# Parse the HTML tables in the downloaded page into a list of DataFrames,
# taking each table's first row as its column headers (header=0). The page
# text is wrapped in StringIO so read_html is handed a file-like object.
tables = pd.read_html(StringIO(titanic_request.text), header=0)
# Show how many tables were extracted.
len(tables)

3

In [4]:
# Stack the per-table frames into one frame, discarding the individual
# indexes in favour of a single fresh 0..n-1 index.
passengers = pd.concat(objs=tables, ignore_index=True)
# Sanity check: concatenation must neither drop nor duplicate rows.
assert sum(map(len, tables)) == len(passengers)

In [5]:
# Build a combined "Surname, First Names" column; the later cells match
# passengers against this column.
passengers["name"] = passengers["Surname"] + ", " + passengers["First Names"]
# Preview the first rows, including the new column.
passengers.head()

Unnamed: 0,Surname,First Names,Age,Boarded,Survivor (S) or Victim (†),name
0,Allen,Miss Elisabeth Walton,29,Southampton,S,"Allen, Miss Elisabeth Walton"
1,Allison,Mr Hudson Joshua Creighton,30,Southampton,†,"Allison, Mr Hudson Joshua Creighton"
2,Allison,Mrs Bessie Waldo,25,Southampton,†,"Allison, Mrs Bessie Waldo"
3,Allison,Miss Helen Loraine,2,Southampton,†,"Allison, Miss Helen Loraine"
4,Allison,Master Hudson Trevor,11m,Southampton,S,"Allison, Master Hudson Trevor"


In [6]:
# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (e.g. on a fresh checkout).
Path("./data").mkdir(parents=True, exist_ok=True)
passengers.to_parquet("./data/passengers.parquet")

In [7]:
# Spot-check one passenger by exact match on the combined name column.
passengers.loc[passengers["name"].eq("Braund, Mr Owen Harris")]

Unnamed: 0,Surname,First Names,Age,Boarded,Survivor (S) or Victim (†),name
692,Braund,Mr Owen Harris,22,Southampton,†,"Braund, Mr Owen Harris"


In [8]:
def did_they_survive(name):
    """Report whether the named passenger survived.

    Parameters
    ----------
    name : str
        Passenger name in the "Surname, First Names" form used by the
        ``passengers["name"]`` column. Every "." is removed before matching,
        so "Mr." and "Mr" are treated alike.

    Returns
    -------
    bool
        True when the passenger's "Survivor (S) or Victim (†)" value is "S",
        False otherwise. If several rows share the name, the first one is used.

    Raises
    ------
    LookupError
        If no row of ``passengers`` has exactly this (period-stripped) name.
    """
    # The names in the table preview are written without periods after
    # titles ("Mr", not "Mr."), so strip them from the query before comparing.
    normalized_name = name.replace(".", "")

    # Look up the survival marker for every row with this exact name.
    survival_markers = passengers.loc[
        passengers["name"] == normalized_name,
        "Survivor (S) or Victim (†)",
    ]

    # Give a descriptive error rather than an opaque IndexError on a miss.
    if survival_markers.empty:
        raise LookupError(f"No passenger named {normalized_name!r} in the passenger list")

    return bool(survival_markers.iloc[0] == "S")


# Example lookup: the query spells the title as "Mr." with a period, which
# did_they_survive strips before matching against the name column.
did_they_survive("Braund, Mr. Owen Harris")

False

In [14]:
from Levenshtein import distance


def get_closest_match(name):
    """Return the passenger name with the smallest edit distance to ``name``.

    Missing entries in ``passengers["name"]`` are dropped and the remaining
    values are converted to ``str`` before comparison. When several names are
    equally close, the one that appears first in the column is returned.

    Parameters
    ----------
    name : str
        The (possibly misspelled) name to look for.

    Returns
    -------
    str
        The closest name found in ``passengers["name"]``.

    Raises
    ------
    ValueError
        If the column holds no usable names after dropping missing values.
    """
    known_names = passengers["name"].dropna().astype(str).tolist()

    # Guard clause: min() over an empty list would fail with a vaguer error.
    if len(known_names) == 0:
        raise ValueError("Input series is empty or contains no valid strings")

    # Score each candidate once; min() keeps the first of any tied candidates.
    return min(known_names, key=lambda candidate: distance(name, candidate))


# Example: a query written with "Mr." (period) is resolved to the nearest
# stored spelling in the name column.
get_closest_match("Braund, Mr. Owen Harris")

'Braund, Mr Owen Harris'

In [10]:
# NOTE(review): this repeats the earlier write to the same path, and no cell
# shown in between modifies `passengers`, so it looks redundant - confirm and
# remove. Its execution count (10) also comes after a cell numbered 14, which
# means the notebook was last run out of order.
passengers.to_parquet("./data/passengers.parquet")