In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from apyori import apriori
import ARutils
import plotly.express as px
import plotly.io as pio
 
%matplotlib inline

The survival rate is simply the number of people that survived divided by the number of people on board.
The question states

> In this exercise, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the Association Rule mining to predict which passengers survived from the tragedy

This means that we have to find which people survived, taking into account their age, sex, and passenger class. Keeping only the columns whose value is `Yes`, the first row of the data contains the following items:

> ['Age_Adult', 'Sex_Female', 'Crew_Member']

This is an `adult`, `female`, `crew member` that did NOT survive.

In [2]:
# Load the preprocessed passenger table (Yes/No flags per attribute) and
# display it as the cell's output.
csv_path = 'titanic_preprocessed.csv'
titanic_df = pd.read_csv(csv_path)
titanic_df

Unnamed: 0,Age_Adult,Age_Child,Sex_Female,Sex_Male,Survived,Crew_Member,First_class,Second_class,Third_class
0,Yes,No,Yes,No,No,Yes,No,No,No
1,Yes,No,Yes,No,No,Yes,No,No,No
2,Yes,No,Yes,No,No,Yes,No,No,No
3,Yes,No,Yes,No,Yes,Yes,No,No,No
4,Yes,No,Yes,No,Yes,Yes,No,No,No
...,...,...,...,...,...,...,...,...,...
2196,Yes,No,No,Yes,Yes,No,No,No,Yes
2197,Yes,No,No,Yes,Yes,No,No,No,Yes
2198,Yes,No,No,Yes,Yes,No,No,No,Yes
2199,Yes,No,No,Yes,Yes,No,No,No,Yes


In [3]:
items_to_print = 5
separator = '**********' * 10


def _print_preview(records):
    """Print the first `items_to_print` entries of `records`, then a separator line.

    Each entry is followed by a blank line so that long records stay readable.
    `records` must support slicing (e.g. a list).
    """
    for record in records[0:items_to_print]:
        print(record, end='\n\n')
    print(separator)


# first, convert the dataframe into a suitable format for apyori - an array of arrays
# (the preview prints one list of item names per entry)
titanic_prepared = ARutils.data_prepare(titanic_df)
_print_preview(titanic_prepared)

# next order of business is to apply the apriori algorithm to find the frequent itemsets.
# No thresholds are passed, so apyori's defaults apply.
# NOTE(review): apyori documents min_support=0.1, min_confidence=0.0, min_lift=0.0 - confirm
# against the installed version and pass them explicitly if different cut-offs are wanted.
titanic_rules = list(apriori(titanic_prepared))
print(f'Number of frequent itemsets = {len(titanic_rules)} rules')
_print_preview(titanic_rules)

# next, we create all the possible association rules from the generated list
titanic_assc_rules = ARutils.extract(titanic_rules)
print(f'Generated association rules = {len(titanic_assc_rules)}')
# preview the extracted association rules themselves (not titanic_rules again)
# NOTE(review): assumes ARutils.extract returns a sliceable sequence - confirm
_print_preview(titanic_assc_rules)

['Age_Adult', 'Sex_Female', 'Crew_Member']

['Age_Adult', 'Sex_Female', 'Crew_Member']

['Age_Adult', 'Sex_Female', 'Crew_Member']

['Age_Adult', 'Sex_Female', 'Survived', 'Crew_Member']

['Age_Adult', 'Sex_Female', 'Survived', 'Crew_Member']

****************************************************************************************************
Number of frequent itemsets = 23 rules
RelationRecord(items=frozenset({'Age_Adult'}), support=0.9504770558836892, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Age_Adult'}), confidence=0.9504770558836892, lift=1.0)])

RelationRecord(items=frozenset({'Crew_Member'}), support=0.4020899591094957, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'Crew_Member'}), confidence=0.4020899591094957, lift=1.0)])

RelationRecord(items=frozenset({'First_class'}), support=0.14766015447523853, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'First_class'}), con

In [4]:
# Tabulate the extracted rules, one row per rule.
rule_columns = ['LHS', 'RHS', 'Support', 'Confidence', 'Lift']
titanic_assc_df = pd.DataFrame(titanic_assc_rules, columns=rule_columns)

# Keep only the rules whose right-hand side contains 'Survived' and show the
# strongest associations (highest lift) first.
predicts_survival = titanic_assc_df['RHS'].apply(lambda rhs: 'Survived' in rhs)
titanic_assc_df[predicts_survival].sort_values(by='Lift', ascending=False)

Unnamed: 0,LHS,RHS,Support,Confidence,Lift
52,"[Sex_Female, Age_Adult]",[Survived],0.143571,0.743529,2.301699
33,[Sex_Female],[Survived],0.156293,0.731915,2.265745
50,[Sex_Female],"[Age_Adult, Survived]",0.143571,0.67234,2.262724
6,[],[Survived],0.323035,0.323035,1.0
23,[],"[Age_Adult, Survived]",0.297138,0.297138,1.0
32,[],"[Sex_Female, Survived]",0.156293,0.156293,1.0
35,[],"[Sex_Male, Survived]",0.166742,0.166742,1.0
48,[],"[Sex_Female, Age_Adult, Survived]",0.143571,0.143571,1.0
55,[],"[Sex_Male, Age_Adult, Survived]",0.153567,0.153567,1.0
56,[Age_Adult],"[Sex_Male, Survived]",0.153567,0.161568,0.968967


# Conclusion
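From the table above we see that the people most likely to survive are adult females: the rule `[Sex_Female, Age_Adult] -> [Survived]` has the highest lift (about 2.30) and a confidence of about 0.74, compared with an overall survival rate of about 0.32.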