# Introduction to Quality Data & Engineering with Python
## Lecture 4 - EDA, Data Cleaning & Preparation Tutorial

### Task 1: Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

### Task 2: Load the Dataset

https://tubcloud.tu-berlin.de/s/y4FtN6zg8oTKZRJ/download/titanic.csv

In [2]:
df = pd.read_csv("https://tubcloud.tu-berlin.de/s/y4FtN6zg8oTKZRJ/download/titanic.csv")

| Feature     | Description                                        |
|-------------|----------------------------------------------------|
| survived    | Whether the passenger survived (0 = No, 1 = Yes)   |
| pclass      | Ticket class (1 = First, 2 = Second, 3 = Third)    |
| sex         | Gender of the passenger (male, female)             |
| age         | Age of the passenger in years                      |
| sibsp       | Number of siblings/spouses aboard the Titanic      |
| parch       | Number of parents/children aboard the Titanic      |
| fare        | Fare paid by the passenger                          |
| embarked    | Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton) |
| class       | Ticket class (as a string: First, Second, Third)   |
| who         | Person's title (man, woman, child)                 |
| deck        | Deck where the passenger's cabin was located       |
| embark_town | Town where the passenger embarked                  |
| alive       | Whether the passenger survived (yes, no)            |
| alone       | Whether the passenger traveled alone (True, False) |


### Task 3: Explore the Dataset

### Task 3.1: Display the first 5 rows of your DataFrame

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Task 3.1: Display the last 5 rows of your DataFrame

In [4]:
df.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


### Task 3.2: Display summary statistics of numerical columns

In [5]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Task 3.3: Check data types and missing values

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


### Task 3.4: Check the number of women and men in the dataset

In [13]:
df.value_counts("sex")

sex
male      577
female    314
dtype: int64

### Task 3.5: Check the number of unique values in the 'sex' column of the DataFrame

In [19]:
df["sex"].unique()

array(['male', 'female'], dtype=object)

### Task 3.6: Show all the passengers' data whose embarked value is not 'S'

In [20]:
df[df["embarked"] != "S"]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
16,0,3,male,2.0,4,1,29.1250,Q,Third,child,False,,Queenstown,no,False
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### Task 3.7: Show all information for passengers who are women and under 30 years old

In [22]:
df[(df["sex"] != "F") & ((df["age"] < 30))]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.5000,S,Second,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


### Task 4: Data Cleaning

### Task 4.1: Check for missing values

In [24]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### Task 4.2: Handle missing values in Age column

hint: Fill missing values with median.

In [27]:
df["age"].fillna(df["age"].mean(), inplace=True)

In [28]:
df.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### Task 4.3: Handle missing values in "embarked" column

hint: Fill missing values with mode.

In [29]:
df["embarked"].fillna(df["embarked"].mode(), inplace=True)

In [30]:
df.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### Task 4.4: Handle missing values in "deck" column

hint: Fill missing values with mode.

In [32]:
df["deck"].fillna(df["deck"].mode(), inplace=True)

In [33]:
df.isna().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           687
embark_town      2
alive            0
alone            0
dtype: int64

### Task 4.5: Drop the rest of NaN rows.

### Task 4.7: Change data type of 'embarked' column to category

### Task 4.6: Drop "who" column

### Task 4.8: Detect outliers in "Fare" column using box plot

### Task 4.9: Remove outliers using IQR method

### Task 5: Data Visualization

### Task 5.1: Create a bar plot for class vs. fare

### Task 5.2: Create a bar plot for "survived"

### Task 5.3: Create histogram for 'Age'

### Task 5.4: Create a boxplot for using "pclass" and "age"

### Task 5.5: Create a heatmap for numerical columns

### Task 6: Data Preparation

### Task 6.1: Create a new feature 'FamilySize' by combining 'SibSp' and 'Parch'

### Task 6.2: Convert DataFrame column names to lowercase