# Individual Project - Titanic


In [4]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Look for the dataset relative to the working directory first, so the notebook
# runs on any machine when launched from the project folder. If that file is not
# found, fall back to the original machine-specific absolute path.
DATA_PATH = Path("Data") / "Titanic_data.csv"
if not DATA_PATH.exists():
    DATA_PATH = Path("C:\\Users\\Emmanuel Dadson\\Desktop\\Individual-Project-titanic\\Data\\Titanic_data.csv")

df_titanic = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first five passengers.
df_titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,$7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,$71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,$7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,$53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,$8.05,,S


In [6]:
# Count the missing entries in every column.
df_titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Deal with Missing Data
We will demonstrate two approaches: filling with the mean/mode, and estimating the missing values from other columns.

#### Fill with Mean/Mode
Embarked has only 2 missing values and there is no obvious way to estimate them, so we will simply fill them with the mode of the column, which is 'S'.

In [7]:
# Impute the missing Embarked entries with the column's most frequent value (its mode).
df_titanic["Embarked"] = df_titanic["Embarked"].fillna(df_titanic["Embarked"].mode()[0])

#### Fill with Estimated Value

A title is a word used with a person's name in certain contexts. It may signify veneration, an official position, or a professional or academic qualification. It is also a good indication of age: for example, "Mr" is used for adult men, while "Master" is used for young boys.

If we look at the names of the Titanic passengers, we can see that each name has the format `Last, Title. First`. We can use this information to estimate missing ages.

- First, we will use a regular expression to extract the title from each name.
- Then we will convert the titles to upper case.
- Finally, we will fill each missing age with the mean age of passengers who share the same title.

In [8]:
# Extract the title from each name: the first run of letters that is immediately
# followed by a period (e.g. "Mr.", "Miss."). The pattern is a raw string so that
# "\." reaches the regex engine as an escaped dot instead of being treated as an
# (invalid) Python string escape.
# The default expand=True yields a one-column DataFrame whose column is named 0.
title = df_titanic["Name"].str.extract(r"([A-Za-z]+\.)")

title

Unnamed: 0,0
0,Mr.
1,Mrs.
2,Miss.
3,Mrs.
4,Mr.
...,...
886,Rev.
887,Miss.
888,Miss.
889,Mr.


##### Convert title to upper case.
To ensure we get an accurate mean age for each title, we convert every title to upper case, so that differently-cased spellings of the same title fall into one group.

In [9]:
# Attach the extracted titles to the passenger table (index-aligned outer join)
# and give the unnamed column 0 a descriptive name.
new_df = df_titanic.join(title, how="outer").rename(columns={0: "title"})

# Normalise every title to upper case before it is used as a grouping key.
new_df = new_df.assign(title=new_df["title"].str.upper())

##### Fill missing age with mean age of the title

In [10]:
# Frequency of each title, most common first.
new_df.loc[:, "title"].value_counts()

title
MR.          517
MISS.        182
MRS.         125
MASTER.       40
DR.            7
REV.           6
MLLE.          2
MAJOR.         2
COL.           2
COUNTESS.      1
CAPT.          1
MS.            1
SIR.           1
LADY.          1
MME.           1
DON.           1
JONKHEER.      1
Name: count, dtype: int64

In [11]:
# Replace each missing Age with the mean Age of passengers sharing the same title.
# transform("mean") returns a Series aligned with new_df, so fillna matches row by row.
new_df["Age"] = new_df["Age"].fillna(
    new_df.groupby("title")["Age"].transform("mean")
)

### Feature Engineering
We'll create a new column, FamilySize. Two columns relate to family size: Parch is the number of parents and children aboard, and SibSp is the number of siblings and spouses aboard.

Take the name 'Asplund' as an example: the total family size is 7 (Parch + SibSp + 1), and every family member has the same Fare, which means the Fare is for the whole group. Family size will therefore be an important feature for predicting Fare. There are only 4 Asplunds out of 7 in the dataset because the dataset is only a subset of all passengers.

In [12]:
# Show every passenger whose name contains "Asplund".
new_df[new_df["Name"].str.contains("Asplund")]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
25,26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female,38.0,1,5,347077,$31.3875,,S,MRS.
182,183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9.0,4,2,347077,$31.3875,,S,MASTER.
233,234,1,3,"Asplund, Miss. Lillian Gertrud",female,5.0,4,2,347077,$31.3875,,S,MISS.
261,262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3.0,4,2,347077,$31.3875,,S,MASTER.


##### Create column 'FamilySize'
FamilySize = Parch + SibSp + 1

In [13]:
# Family size = parents/children + siblings/spouses + the passenger themself.
new_df["FamilySize"] = new_df["Parch"].add(new_df["SibSp"]).add(1)

##### Construct a regression on Fare
Construct regression model with statsmodels.

Pick Pclass, Embarked, FamilySize as independent variables.

In [14]:
#result =smf.ols("Fare ~ C(Pclass) + C(Embarked) + FamilySize", data=df_titanic).fit()
#result.summary()

import statsmodels.formula.api as smf

In [15]:
# Fare is loaded as text with a leading "$" (e.g. "$7.25"); convert it to float so
# it can serve as the regression response. The dtype guard keeps this cell
# re-runnable: once Fare is numeric, the .str accessor would raise AttributeError.
if not pd.api.types.is_numeric_dtype(new_df["Fare"]):
    new_df["Fare"] = new_df["Fare"].str.removeprefix("$").astype("float")

# Ordinary least squares: Pclass and Embarked enter as categorical factors,
# FamilySize as a numeric predictor.
result = smf.ols("Fare ~ C(Pclass) + C(Embarked) + FamilySize", data = new_df).fit()

In [16]:
# Display the regression table with 95% confidence intervals (alpha = 0.05).
result.summary(alpha=0.05)

0,1,2,3
Dep. Variable:,Fare,R-squared:,0.427
Model:,OLS,Adj. R-squared:,0.424
Method:,Least Squares,F-statistic:,131.9
Date:,"Sun, 12 May 2024",Prob (F-statistic):,1.9199999999999998e-104
Time:,23:30:25,Log-Likelihood:,-4495.8
No. Observations:,891,AIC:,9004.0
Df Residuals:,885,BIC:,9032.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,79.2989,3.543,22.381,0.000,72.345,86.253
C(Pclass)[T.2],-59.0955,3.921,-15.073,0.000,-66.790,-51.401
C(Pclass)[T.3],-68.8790,3.253,-21.174,0.000,-75.264,-62.494
C(Embarked)[T.Q],-11.8147,5.446,-2.169,0.030,-22.504,-1.126
C(Embarked)[T.S],-14.9202,3.414,-4.371,0.000,-21.620,-8.220
FamilySize,7.8256,0.789,9.919,0.000,6.277,9.374

0,1,2,3
Omnibus:,1043.506,Durbin-Watson:,2.04
Prob(Omnibus):,0.0,Jarque-Bera (JB):,118621.734
Skew:,5.718,Prob(JB):,0.0
Kurtosis:,58.357,Cond. No.,13.4


In [17]:
def conclusion():
    """
    Print a written interpretation of the linear regression model on "Fare".

    The figures quoted in the text (R-squared and coefficients) are hard-coded
    literals, not values read from a fitted model, so the text has to be edited
    by hand whenever the regression is refit and its numbers change.

    Returns:
        None. The interpretation is written to standard output.
    """
    blank = " "  # a single-space line separates the paragraphs in the printed text
    lines = [
        "conclusion".upper().center(60, " "),
        blank,
        'The coefficient of determination (R-squared) is 0.427, indicating that approximately 42.7% of the variance in the',
        'dependent variable ("Fare") is explained by the independent variables in the model,',
        'leaving 57.3% of the variability of the dependent variable not explained.',
        blank,
        'The coefficient for "Pclass" (T.2) is -59.0955, indicating that, holding other variables constant,',
        'passengers in class 2 paid approximately $59.10 less fare compared to passengers in class 1',
        blank,
        'The coefficient for "Pclass" (T.3) is -68.8790, indicating that, holding other variables constant,',
        # 68.8790 rounds to 68.88 at two decimals (the earlier text said $68.90).
        'passengers in class 3 paid approximately $68.88 less fare compared to passengers in class 1',
        blank,
        'The coefficient for family size is 7.8256, indicating that, for any additional member to a family,',
        'there is approximately $7.83 increase in fare holding all other variables constant.',
        blank,
        'Due to 57.3% variability of the dependent variable not explained, other predictor variables',
        'would be needed to sufficiently explain the dependent variable',
    ]
    print("\n".join(lines))


In [18]:
conclusion()

                         CONCLUSION                         
 
The coefficient of determination (R-squared) is 0.427, indicating that approximately 42.7% of the variance in the
dependent variable ("Fare") is explained by the independent variables in the model,
leaving 57.3% of the variability of the dependent variable not explained.
 
The coefficient for "Pclass" (T.2) is -59.0955, indicating that, holding other variables constant,
passengers in class 2 paid approximately $59.10 less fare compared to passengers in class 1
 
The coefficient for "Pclass" (T.3) is -68.8790, indicating that, holding other variables constant,
passengers in class 3 paid approximately $68.90 less fare compared to passengers in class 1
 
The coefficient for family size is 7.8256, indicating that, for any additional member to a family,
there is approximately $7.83 increase in fare holding all other variables constant.
 
Due to 57.3% variability of the dependent variable not explained, other predictor variables
