In [None]:
# https://blog.socialcops.com/engineering/machine-learning-python/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) in inches for every figure drawn below.
plt.rcParams['figure.figsize'] = (15.0, 8.0)

In [None]:
!wget https://raw.githubusercontent.com/davewadestein/Gap-Python-2025/refs/heads/main/Data/titanic3.xls

# Let's read in the Titanic data for a deeper analysis

In [None]:
# Load the 'titanic3' worksheet; cells containing the text 'NA' are read as missing.
data = pd.read_excel(
    'titanic3.xls',
    sheet_name='titanic3',
    index_col=None,
    na_values=['NA'],
)

## Legend
* pclass = Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* survived = Survival (0 = No; 1 = Yes)
* sibsp = number of Siblings/Spouses Aboard
* parch = number of Parents/Children Aboard
* ticket = ticket Number
* embarked (from...C = Cherbourg; Q = Queenstown; S = Southampton)
* boat = Lifeboat ID
* body = Body Identification Number
* home.dest = Home/Destination

In [None]:
# Peek at the first few rows to check that the load worked.
data.head()

### These columns are unlikely to be meaningful and have several missing values so we'll drop them.

In [None]:
# Discard the columns we will not use in the rest of the analysis.
data = data.drop(columns=['boat', 'ticket', 'cabin', 'body'])

### We can get a quick summary of how many people survived.

In [None]:
# Number of passengers for each value of `survived`.
data['survived'].value_counts()

### Because survival is encoded as 1 (and death as 0), the mean of this column is the fraction of passengers who survived.

In [None]:
# Mean of the 0/1 `survived` column.
data['survived'].mean()

### If we group by class and then compute the mean, what does this reveal?

In [None]:
# Per-class average of every numeric column.
data.groupby('pclass').mean(numeric_only=True)

### Pandas' groupby() method essentially splits the dataframe into a bunch of smaller dataframes, one for each combination of values of the grouping variables...
* we can see this by looking at the __`groups`__ attribute
* each grouping contains the rows of the dataframe
* so in the example below, rows 0, 2, 4, 6, 8, etc. are the females in 1st class

In [None]:
# Maps each (pclass, sex) pair to the index labels of the rows in that group.
data.groupby(['pclass', 'sex']).groups

### If we group by class and gender and compute the mean, what does this tell us about the "Women" part of "Women and children first"?

In [None]:
# Average of each numeric column for every (pclass, sex) combination.
class_gender_grouping = data.groupby(['pclass', 'sex']).mean(numeric_only=True)
class_gender_grouping

### Let's render it as a bar chart to make it clear...

In [None]:
# Survival rate for each (class, sex) pair as a bar chart; the trailing ';'
# hides the Axes repr.
class_gender_grouping['survived'].plot(kind='bar');

### If we investigate the passengers by age, we can investigate the "Children" part of "Women and children first".

In [None]:
# Assign each passenger to a 10-year age bin: (0, 10], (10, 20], ..., (70, 80].
# Passengers with a missing age fall into no bin.
group_by_age = pd.cut(data.age, np.arange(0, 90, 10))
# Mean of `survived` within each age bin; observed=False keeps bins that
# contain no passengers in the result.
age_grouping = data.groupby(group_by_age, observed=False)['survived'].mean(numeric_only=True)

In [None]:
# Bar chart of the mean of `survived` per age bin.
age_grouping.plot.bar();

In [None]:
# Number of passengers in each age bin.
group_by_age.value_counts()

### We have a fair number of missing values (e.g., __`age`__, __`home.dest`__)

In [None]:
# Non-null value count for each column.
data.count()

### We can fill in the missing age values with average values. Is this a good strategy for this data? What could we exploit to make a better go of it? (Hint: Look at the class breakdowns above)

In [None]:
# Fill missing values in every numeric column with that column's mean;
# non-numeric columns are left as they are.
data = data.fillna(data.mean(numeric_only=True))

### Now we are basically missing home/destination data and a few embarked results

In [None]:
# Non-null value count for each column after the numeric fill.
data.count()

### With only two results missing embarked data, we can probably just drop those samples.

In [None]:
# Show the rows whose `embarked` value is missing.
data[data['embarked'].isnull()]

### There are a substantial number of missing home/destination values. We don't know if that is an important feature, but we'd lose a lot of data if we throw those samples away, so let's fill it in with placeholders for now.

In [None]:
# Use the literal string "NA" as a placeholder for unknown home/destination.
data["home.dest"] = data["home.dest"].fillna("NA")

### Now with only the two embarked rows missing, we are in good shape, so let's drop those and then move on.

In [None]:
# Non-null value count for each column before dropping incomplete rows.
data.count()

In [None]:
# Drop every row that still contains a missing value.
data = data.dropna()

In [None]:
# Every column should now report the same non-null count.
data.count()

### We are going to turn categorical data (__`sex`__ and __`embarked`__) into numbered values using a scikit-learn __`LabelEncoder`__.

In [None]:
from sklearn import datasets, model_selection, tree, preprocessing, metrics
import sklearn.ensemble as ske

In [None]:
def preprocess_titanic_df(df, sex_classes=('female', 'male'),
                          embarked_classes=('C', 'Q', 'S')):
    """Return a model-ready copy of a Titanic dataframe.

    The ``sex`` and ``embarked`` columns are replaced by integer codes from a
    scikit-learn ``LabelEncoder`` and the free-text ``name`` and ``home.dest``
    columns are dropped. The input dataframe is not modified.

    The encoders are fitted on the fixed class lists, not on the values that
    happen to occur in ``df``. Fitting on the data itself makes the codes
    depend on the subset: a sample without any 'Q' passengers would encode
    'S' as 1 while the full data encodes it as 2, so a training set and a
    test set preprocessed separately could silently disagree.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sex``, ``embarked``, ``name`` and ``home.dest`` columns.
    sex_classes : sequence of str, optional
        Every label that may occur in ``sex``.
    embarked_classes : sequence of str, optional
        Every label that may occur in ``embarked``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with encoded ``sex``/``embarked`` and without
        ``name``/``home.dest``.

    Raises
    ------
    ValueError
        If ``sex`` or ``embarked`` holds a label missing from its class list.
    """
    processed_df = df.copy()

    # One encoder per column, each with a fixed vocabulary; LabelEncoder
    # numbers the classes in sorted order (female=0, male=1; C=0, Q=1, S=2).
    sex_encoder = preprocessing.LabelEncoder().fit(list(sex_classes))
    embarked_encoder = preprocessing.LabelEncoder().fit(list(embarked_classes))

    processed_df['sex'] = sex_encoder.transform(processed_df['sex'])
    processed_df['embarked'] = embarked_encoder.transform(processed_df['embarked'])

    return processed_df.drop(columns=['name', 'home.dest'])

### We preprocess our data and then verify that everything is suitable for a learning activity now.

In [None]:
# Encode the cleaned data and preview the result.
processed_df = preprocess_titanic_df(data)
processed_df.head()

### The survival data is going to be our target, so we drop it from the feature matrix and set it up as the target vector.

In [None]:
# Separate the label column (y) from the feature columns (X).
y = processed_df['survived']
X = processed_df.drop(columns=['survived'])

### We now split our data into training and test data, create a DecisionTreeClassifier and then see how we do on predictions on survival.

In [None]:
# Hold out 20% of the passengers for testing. random_state pins the shuffle so
# the split, and every score computed from it, is the same on each run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=0)

In [None]:
# A deliberately shallow tree (max_depth=2) so the rendered diagram stays readable.
clf_dt = tree.DecisionTreeClassifier(max_depth=2)

In [None]:
# Fit on the training split, then report mean accuracy on the held-out split.
clf_dt.fit(X_train, y_train)
clf_dt.score(X_test, y_test)

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to titanic.dot in Graphviz format. The feature names
# are listed in the column order of the feature matrix the tree was fitted on.
export_graphviz(
    clf_dt,
    out_file="titanic.dot",
    feature_names=['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'],
    class_names=['perished', 'survived'],
    rounded=True,
    filled=True,
)

In [None]:
# Render the .dot file to a PNG with the Graphviz `dot` command-line tool,
# then display the image in the notebook.
!dot -Tpng titanic.dot -o titanic.png
from IPython.display import Image
Image('titanic.png')

### Shuffling the data can sometimes improve our results.

In [None]:
# ShuffleSplit's first argument is n_splits, the number of random train/test
# re-shuffles, not the number of samples. Passing len(X) there asked for one
# split per passenger, i.e. over a thousand refits of every classifier.
# Twenty random 80/20 splits are plenty for a stable accuracy estimate.
shuffle_validator = model_selection.ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)

def test_classifier(clf):
    """Print the cross-validated accuracy of ``clf``.

    Scores ``clf`` on the notebook-level ``X`` and ``y`` with
    ``cross_val_score``, using ``shuffle_validator`` as the splitter, and
    prints the mean and standard deviation of the per-split scores.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn classifier; ``cross_val_score`` fits a clone on
        every split, so ``clf`` itself is left unfitted.
    """
    scores = model_selection.cross_val_score(clf, X, y, cv=shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))

In [None]:
# Cross-validated accuracy of the decision tree.
test_classifier(clf_dt)

### A RandomForestClassifier often does better.

In [None]:
# 50-tree random forest. random_state pins the bootstrap samples and feature
# choices so the reported accuracy is reproducible across runs.
clf_rf = ske.RandomForestClassifier(n_estimators=50, random_state=0)
test_classifier(clf_rf)

### Other classifiers might do even better...

In [None]:
# 50-stage gradient-boosted ensemble. random_state makes the reported
# accuracy reproducible across runs.
clf_gb = ske.GradientBoostingClassifier(n_estimators=50, random_state=0)
test_classifier(clf_gb)

### Now we want to make a specific point, so we'll grab the first twenty rows of each passenger class from our raw data, clean them up and use them as our test data.

In [None]:
# First twenty passengers of each class, in the order they appear in `data`.
passengers_set_1 = data.loc[data['pclass'] == 1].head(20).copy()
passengers_set_2 = data.loc[data['pclass'] == 2].head(20).copy()
passengers_set_3 = data.loc[data['pclass'] == 3].head(20).copy()

# Stack the three samples into one raw test frame, then encode it for the model.
passenger_set = pd.concat([passengers_set_1, passengers_set_2, passengers_set_3])
testing_set = preprocess_titanic_df(passenger_set)

### Normally you wouldn't train and test on the same data, but we're trying to make a point (and this is a closed system–we're not going to use our model on new, unseen data!)

In [None]:
# Everything that is not in the hand-picked passenger_set becomes training
# data. Rows are excluded by index label rather than by concat +
# drop_duplicates(keep=False): that value-based comparison would also throw
# away any passengers in `data` whose rows happen to be identical.
training_set = data.drop(index=passenger_set.index)
training_set = preprocess_titanic_df(training_set)

In [None]:
# Plain NumPy arrays for scikit-learn: labels first, then the feature matrices.
y = training_set['survived'].values
y_test = testing_set['survived'].values
X = training_set.drop(columns=['survived']).values
X_test = testing_set.drop(columns=['survived']).values

In [None]:
# Fresh 50-tree forest trained on the training arrays. random_state makes the
# fitted model, and therefore the predictions compared below, identical on
# every run.
clf_rf = ske.RandomForestClassifier(n_estimators=50, random_state=0)
clf_rf.fit(X, y)

In [None]:
# Predicted `survived` value for each row of X_test, in the same row order.
prediction = clf_rf.predict(X_test)
prediction

### Because this is historical data, we can compare predictions to what actually happened.
* Models are not destiny!
* ...some of the people our model thought would survive didn't and some of the people it thought wouldn't did.

In [None]:
# Passengers whose actual outcome differs from the prediction. The comparison
# is positional: X_test was built from passenger_set, so prediction[i] belongs
# to the i-th row of passenger_set.
passenger_set[passenger_set.survived != prediction]

In [None]:
# Mean accuracy of the forest on the hand-picked test passengers.
clf_rf.score(X_test, y_test)