### Import Modules

In [None]:
# import modules needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import Dataset

In [None]:
# Read the comma-separated training file into the DataFrame used by the rest of the notebook
titanic_df = pd.read_csv("./train.csv", sep=",")

##### Column Descriptions

Survived: Survived (0 = No; 1 = Yes) <br>
Pclass: Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) <br>
Name: Passenger Name <br>
Sex: Passenger Sex <br>
Age: Passenger Age <br>
SibSp: Number of Siblings/Spouses Aboard <br>
Parch: Number of Parents/Children Aboard <br>
Fare: Passenger Fare <br>
Cabin: Cabin <br>
Embarked: Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

### Explore Data

In [None]:
# Preview only the first rows instead of rendering the entire DataFrame
titanic_df.head()

In [None]:
# Report the dataset dimensions: shape is a (rows, columns) tuple
n_rows, n_cols = titanic_df.shape
print(f"Dataset contains {n_rows} rows and {n_cols} columns")

In [None]:
# Show every row that is an exact duplicate of an earlier row;
# an empty result means the dataset has no duplicate rows
duplicate_mask = titanic_df.duplicated()
titanic_df.loc[duplicate_mask]

In [None]:
titanic_df.info()   # prints a summary of the columns: column names, non-null count per column and data types

In [None]:
titanic_df.describe()   # returns descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns

#### Relationship Between Columns

In [None]:
# Pairwise correlation of the columns at positions 1, 2, 3, 6, 7, 8 and 10
# (positional selection - presumably the numeric columns of the raw file; verify against the CSV layout)
corr_positions = [1, 2, 3, 6, 7, 8, 10]
titanic_df.iloc[:, corr_positions].corr()

In [None]:
# Annotated heatmap of the pairwise correlations of the positionally selected columns
selected_columns = titanic_df.iloc[:, [1, 2, 3, 6, 7, 8, 10]]
sns.heatmap(selected_columns.corr(), annot=True, cmap="YlGnBu")

### Data Cleaning

In [None]:
# Count the missing values in each column. Some columns have nulls:
# some of them will be dropped, some will have their null rows replaced
# by the column average, and columns that are not needed will also be dropped.
titanic_df.isna().sum()

Drop unnamed column

In [None]:
# Remove the 'Unnamed: 0' column in place. errors="ignore" keeps this cell
# re-runnable: without it, running the cell a second time raises a KeyError
# because the column is already gone.
titanic_df.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

Drop cabin column

In [None]:
# Remove the 'Cabin' column in place. errors="ignore" keeps this cell
# re-runnable: without it, running the cell a second time raises a KeyError
# because the column is already gone.
titanic_df.drop(columns="Cabin", inplace=True, errors="ignore")

Replace the null age rows with the mean age

In [None]:
# Mean passenger age rounded to 2 decimals (mean() skips missing values)
avg_age = np.round(titanic_df["Age"].mean(), 2)
print(avg_age)

# Fill the missing ages with the mean and assign the column back to the frame.
# Calling fillna(inplace=True) on `titanic_df.Age` is chained assignment: it acts
# on an intermediate Series, emits a warning on pandas 2.x and does not update
# the DataFrame at all once Copy-on-Write is enabled.
titanic_df["Age"] = titanic_df["Age"].fillna(avg_age)

Drop null rows under Embarked column

In [None]:
# Drop only the rows whose 'Embarked' value is missing; without `subset`,
# dropna would remove every row that has a null in any column.
titanic_df.dropna(subset=["Embarked"], inplace=True)

Rename 'Survived' column to 'Survival'

In [None]:
# Rename the target column in place. `inplace` must be a real boolean:
# pandas validates the argument and rejects the string "True" with a ValueError.
titanic_df.rename(columns={"Survived": "Survival"}, inplace=True)

In [None]:
# Re-count the missing values per column to check whether any nulls are left after cleaning
titanic_df.isna().sum()

### Check Correlation of Continuous Data

In [None]:
# Pairwise correlation of the columns at positions 0, 1, 2, 5, 6, 7 and 9 of the cleaned frame
# (positions differ from the earlier correlation cell because columns were dropped in between)
clean_corr_positions = [0, 1, 2, 5, 6, 7, 9]
titanic_df.iloc[:, clean_corr_positions].corr()

In [None]:
# Annotated heatmap of the pairwise correlations of the positionally selected columns of the cleaned frame
cleaned_selection = titanic_df.iloc[:, [0, 1, 2, 5, 6, 7, 9]]
sns.heatmap(cleaned_selection.corr(), annot=True, cmap="YlGnBu")

### Data Visualization

In [None]:
#survive_barchart = titanic_df.groupby(["Survival Status"])["PassengerId"].count().plot(kind="bar")

In [None]:
# Number of passengers per survival outcome (0 = did not survive, 1 = survived).
# The counted column is renamed so the table and the axis describe a count,
# not a passenger id, and the result no longer uses the name `copy`
# (which shadows the standard-library module of the same name).
survival_counts = (
    titanic_df.groupby("Survival", as_index=False)["PassengerId"]
    .count()
    .rename(columns={"PassengerId": "Passengers"})
    .sort_values(by="Survival", ascending=False)
)

ax = sns.barplot(x="Survival", y="Passengers", data=survival_counts)
ax.bar_label(ax.containers[0], fmt="%d")  # counts are whole numbers, so no decimals
ax.set_ylabel("Number of passengers")
ax.set_title("Bar Chart of Survivors Against Non-survivors")

survival_counts

Group by different passenger classes

In [None]:
# Passenger count for each passenger class
class_groups = titanic_df.groupby("Pclass")
psg_class = class_groups["Pclass"].count()
psg_class

Plot a pie chart showing different passenger classes

In [None]:
# Pie chart of the share of passengers in each class.
# Labels are built from the grouped index so every wedge is paired with its
# own class value instead of relying on a hard-coded positional list.
class_labels = [f"Class {pclass}" for pclass in psg_class.index]

plt.figure(figsize=(7, 7))
plt.title("Passengers by Class")
plt.pie(psg_class.values, labels=class_labels,
        autopct='%1.1f%%', textprops={'fontsize': 13})
plt.show()

Group by the sex of passengers

In [None]:
# Passenger count for each value of the 'Sex' column
sex_groups = titanic_df.groupby("Sex")
psg_sex = sex_groups["Sex"].count()
psg_sex

Plot pie chart showing different passenger genders

In [None]:
# Pie chart of the share of passengers of each sex.
# groupby returns its groups sorted by key, so psg_sex follows the sorted 'Sex'
# values (e.g. "female" before "male"), not a hand-written Male/Female order.
# Taking the labels from the index guarantees each wedge gets its own label.
sex_labels = [str(sex).capitalize() for sex in psg_sex.index]

plt.figure(figsize=(7, 7))
plt.title('Passengers by Gender')
plt.pie(psg_sex.values, labels=sex_labels,
        autopct='%1.1f%%', textprops={'fontsize': 13})
plt.show()

Group by port of embarkation

In [None]:
# Passenger count for each port of embarkation
embark_groups = titanic_df.groupby("Embarked")
psg_embark = embark_groups["Embarked"].count()
psg_embark

Plot pie chart showing different passengers' port of embarkation

In [None]:
# Pie chart of the share of passengers per port of embarkation.
# Port codes are translated through a lookup keyed on the grouped index, so the
# labels stay correct even if a port is absent; unknown codes are shown as-is.
port_names = {"C": "Cherbourg", "Q": "Queenstown", "S": "Southampton"}
port_labels = [port_names.get(code, code) for code in psg_embark.index]

plt.figure(figsize=(7, 7))
plt.title('Passengers by Port of Embarkation')
plt.pie(psg_embark.values, labels=port_labels,
        autopct='%1.1f%%', textprops={'fontsize': 13})
plt.show()

Check effect of sex on survival

In [None]:
# Number of survivors per sex: summing the 0/1 'Survival' flag counts the survivors
survival_by_sex_groups = titanic_df.groupby('Sex')
survival_sex = survival_by_sex_groups['Survival'].sum()
survival_sex

Plot bar chart showing survivors by sex

In [None]:
# Bar chart of the number of survivors per sex, with each count written on its bar
plt.figure(figsize=(7, 9))
plt.bar(survival_sex.index, survival_sex.values)
plt.title('Survivors by Sex')

# Shared text styling for the value labels
label_style = dict(fontsize=12, color='white',
                   horizontalalignment='center', verticalalignment='center')
for position, survivors in enumerate(survival_sex.values):
    # Place the label 20 units below the top of the bar
    plt.text(position, survivors - 20, str(survivors), **label_style)

plt.show()

Check effect of passenger class on survival

In [None]:
# Number of survivors per passenger class (sum of the 0/1 'Survival' flag)
survival_by_class_groups = titanic_df.groupby('Pclass')
psg_survival = survival_by_class_groups['Survival'].sum()
psg_survival

In [None]:
# Percentage of survivors in each class: survivors / passengers * 100
survival_by_class = titanic_df.groupby(['Pclass'])['Survival']
survivors_per_class = survival_by_class.sum()
passengers_per_class = survival_by_class.count()
percent_psg_survival = survivors_per_class / passengers_per_class * 100
percent_psg_survival

In [None]:
# Replace the numeric class index with readable labels for plotting
percent_psg_survival.index = [f"Class {number}" for number in (1, 2, 3)]

In [None]:
# Bar chart of the survival percentage per class, with each value written above its bar
plt.figure(figsize=(7, 9))
plt.bar(percent_psg_survival.index, percent_psg_survival.values)
plt.title('Percentage of Survivors by Passenger Class')

# Labels show the percentages rounded to two decimals
rounded_percentages = np.round(percent_psg_survival.values, 2)
for position, percentage in enumerate(rounded_percentages):
    # Offset by 1 so the text sits just above the top of the bar
    plt.text(position, percentage + 1, str(percentage), fontsize=12, color='black',
             horizontalalignment='left', verticalalignment='baseline')

plt.show()

In [None]:
# Passenger counts per survival outcome, split by sex, with one panel per class
grid = sns.catplot(x="Survival", hue="Sex", col="Pclass", kind="count", data=titanic_df)

# catplot draws several axes, so the title belongs on the figure:
# plt.title would only title the last panel.
grid.figure.suptitle('Class wise segregation of passengers based on sex and survival status',
                     fontsize=16, y=1.05)

# Label every bar with its own count (the overall survivors-by-sex totals
# do not correspond to any bar in these panels).
for panel_ax in grid.axes.flat:
    for bars in panel_ax.containers:
        panel_ax.bar_label(bars)

plt.show()

### Insights

The dataset analyzed was only a sample of the total population of passengers aboard The Titanic. <br>
The factors shown to influence a passenger's chances of survival are the passenger's sex and class. <br>
Female passengers had a higher chance of survival than their male counterparts. <br>
First class passengers had the highest chances of survival of all three classes of passengers. <br>
Though the third class passengers recorded more survivors than the second class passengers, percentage of survivors was greater in second class at 42.28% against third class which was at 24.24%.<br>
This can be attributed to the high number of passengers in the third class at 491, and second class passengers at 184.