## __1. Data Overview Class__

In [1]:
# titanic-survival-project-classes.ipynb

import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt

# Data Overview Class
class DataOverview:
    """Convenience wrapper exposing quick summaries of a CSV-backed DataFrame."""

    def __init__(self, file_path):
        """Read the CSV at ``file_path`` into ``self.dataframe``."""
        self.dataframe = pd.read_csv(file_path)

    def show_info(self):
        """Print the DataFrame summary; returns whatever ``DataFrame.info`` returns."""
        frame = self.dataframe
        return frame.info()

    def show_head(self, n=5):
        """Return the first ``n`` rows."""
        frame = self.dataframe
        return frame.head(n)

    def show_tail(self, n=5):
        """Return the last ``n`` rows."""
        frame = self.dataframe
        return frame.tail(n)

    def show_description(self):
        """Return the descriptive statistics produced by ``DataFrame.describe``."""
        frame = self.dataframe
        return frame.describe()

    def show_missing_values(self):
        """Draw the missingno bar chart for the DataFrame and display it."""
        label_size = 10
        msno.bar(self.dataframe, color='skyblue', figsize=(8, 6))
        # NOTE(review): msno.bar appears to chart per-column non-null counts,
        # so the "missing" wording below may be misleading -- confirm.
        plt.title('Missing Values Count', fontsize=12)
        plt.xlabel('Columns', fontsize=label_size)
        plt.ylabel('Count of Missing Values', fontsize=label_size)
        # Tilt the column names so long labels do not overlap.
        plt.xticks(rotation=45, fontsize=label_size)
        plt.yticks(fontsize=label_size)
        plt.show()

    def show_duplicates(self):
        """Report duplicated rows.

        Prints a short message either way. Returns the duplicated rows as a
        DataFrame, or ``None`` when there are none.
        """
        duplicate_mask = self.dataframe.duplicated()
        duplicate_rows = self.dataframe[duplicate_mask]
        if duplicate_rows.empty:
            print("No duplicate rows found.")
            return None
        print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")
        return duplicate_rows

    def show_unique_values(self):
        """Return the number of distinct values in each column."""
        return self.dataframe.nunique()

    def show_null_values(self):
        """Return the number of null entries in each column."""
        null_mask = self.dataframe.isna()
        return null_mask.sum()

In [2]:
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

class DataCleaning:
    """Load train/test CSV files and clean them with train-derived statistics."""

    # Columns removed from both datasets by ``drop_columns``.
    _DROPPED_COLUMNS = ('Cabin', 'Ticket')

    def __init__(self, train_file_path, test_file_path):
        """Read both CSV files.

        Args:
            train_file_path: Path of the training CSV.
            test_file_path: Path of the test CSV.
        """
        self.train_data = pd.read_csv(train_file_path)
        self.test_data = pd.read_csv(test_file_path)

    def drop_columns(self):
        """Drop the 'Cabin' and 'Ticket' columns from both datasets in place.

        Raises:
            KeyError: If either column is absent from a dataset.
        """
        for frame in (self.train_data, self.test_data):
            frame.drop(columns=list(self._DROPPED_COLUMNS), inplace=True)

    @staticmethod
    def _most_frequent(series):
        """Return the most frequent non-null value of ``series``, or ``None``.

        ``Series.mode`` returns tied values sorted, so the smallest one wins.
        """
        modes = series.mode(dropna=True)
        return None if modes.empty else modes.iloc[0]

    def impute_missing_values(self):
        """Fill gaps in 'Fare', 'Embarked' and 'Age' in both datasets, in place.

        'Fare' and 'Age' use the training median; 'Embarked' uses the most
        frequent training value. The statistics are always taken from the
        training data, so a column that is complete in train but has gaps in
        test is still filled. A column with no observed training values has
        no statistic and is left untouched.

        Raises:
            KeyError: If one of the three columns is absent from a dataset.
        """
        train = self.train_data
        # Learn every fill value up front, regardless of whether the training
        # column itself contains nulls.
        fill_values = {
            'Fare': train['Fare'].median(),
            'Embarked': self._most_frequent(train['Embarked']),
            'Age': train['Age'].median(),
        }

        for frame in (self.train_data, self.test_data):
            for column, value in fill_values.items():
                has_gaps = frame[column].isnull().any()
                if has_gaps and not pd.isna(value):
                    frame[column] = frame[column].fillna(value)

    def get_cleaned_data(self):
        """Return the ``(train_data, test_data)`` pair in its current state."""
        return self.train_data, self.test_data
