In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def info(obj, search=''):
    """List the attribute names of `obj`, optionally filtered by a substring.

    Names containing a double underscore are skipped. The filter is a
    case-insensitive substring test. If nothing matches, a notice is printed
    and every (non-dunder) name is returned instead. If `obj` exposes no such
    names at all, the call is delegated to the built-in `help`.

    Returns a NumPy array of names (or whatever `help(obj)` returns).
    """
    public_names = [name for name in dir(obj) if '__' not in name]
    if not public_names:
        return help(obj)

    needle = search.lower()
    hits = np.array([name for name in public_names if needle in name.lower()])
    if hits.size > 0:
        return hits

    print('No mathcing results')
    return np.array(public_names)
    

def subset_data(df, *names_like):
    """Return the columns of `df` whose label contains any of `names_like`.

    Matching is a plain, case-sensitive `name in column_label` test; the
    selected columns keep the order they have in `df`.
    """
    selected = []
    for col in df:
        # Evaluate every candidate (no short-circuit), then keep the column on any hit.
        hits = [name for name in names_like if name in col]
        if len(hits) > 0:
            selected.append(col)
    return df[selected]


def numeric_features(df):
    """Return the labels of the numeric columns of `df` as a list.

    Columns are picked by dtype via `select_dtypes(include='number')`.
    `describe()` is deliberately not used for this: when a frame has no
    numeric columns, `describe()` summarises the object/categorical columns
    instead, so their labels would be reported as "numeric"; it also computes
    summary statistics that are thrown away here.
    """
    return df.select_dtypes(include='number').columns.to_list()


def outliers_mask(df):
    """Flag outliers in the numeric columns of `df` with the 1.5 * IQR rule.

    For each numeric column (as reported by `numeric_features`) a value is an
    outlier when it lies below `Q1 - 1.5 * IQR` or above `Q3 + 1.5 * IQR`.

    Returns a boolean DataFrame with the same index as `df` and one column per
    numeric column; True marks an outlier. Missing values compare as False.
    """
    numeric = df[numeric_features(df)]
    q1, q3 = numeric.quantile(0.25), numeric.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    # The fence is a multiple of the IQR (the original added 1.5 to it instead).
    upper_fence = q3 + 1.5 * iqr
    return (numeric < lower_fence) | (numeric > upper_fence)


def outliers_replace(df, value):
    """Return a deep copy of `df` in which every flagged outlier is set to `value`.

    Outliers are the cells marked True by `outliers_mask`. Only the columns
    covered by that mask are written to, so a frame that also holds text
    columns can take any replacement value (boolean-frame assignment on a
    mixed-dtype frame is rejected by some pandas versions unless the value is NaN).
    The input frame is left untouched.
    """
    result = df.copy(deep=True)
    flagged = outliers_mask(result)
    masked_columns = list(flagged.columns)
    if not masked_columns:
        # Nothing was examined, so there is nothing to replace.
        return result
    result[masked_columns] = result[masked_columns].mask(flagged, value)
    return result
    
def outliers_count(df):
    """Count the outliers flagged by `outliers_mask`, per column."""
    flagged = outliers_mask(df)
    # Summing booleans counts the True cells in each column.
    per_column_total = flagged.sum()
    return per_column_total


def outliers_percentage(df):
    """Express the per-column outlier counts as a percentage of the row count.

    Uses `outliers_count` for the counts and rounds to two decimals.
    """
    n_rows = len(df)
    share = outliers_count(df) / n_rows * 100
    return round(share, 2)


def outliers_drop(df):
    """Return a copy of `df` without the rows that contain at least one outlier.

    A row is removed when `outliers_mask` flags any of its cells. Rows are
    selected directly from the mask rather than by writing NaN over the
    outliers and calling `dropna()`: that approach also discarded rows whose
    only problem was a pre-existing missing value, and it upcast integer
    columns to float. The input frame is left untouched.
    """
    has_outlier = outliers_mask(df).any(axis=1)
    return df.loc[~has_outlier].copy()


def outliers_describe(df):
    """Print a short outlier report for `df`.

    The report shows the total number of rows, the number of rows holding at
    least one outlier (per `outliers_mask`), that number as a percentage of all
    rows, and the per-column outlier counts and percentages produced by
    `outliers_count` and `outliers_percentage`.

    "Outlier Rows" is counted from the mask itself; previously the number of
    rows *remaining* after dropping was printed under that label, which also
    made the "reduction" figure the retained share.
    """
    total_rows = df.shape[0]
    outlier_rows = int(outliers_mask(df).any(axis=1).sum())
    # Guard the empty frame: there is nothing to reduce and nothing to divide by.
    reduction_pct = round(outlier_rows / total_rows * 100, 2) if total_rows else 0.0
    print(f'''
    Total Rows: {total_rows}
    Outlier Rows: {outlier_rows}
    Overall data reduction: {reduction_pct} %
    
    Count Outliers:\n\n{outliers_count(df)}
    \n________________________________________________________________________
    
    Percentage Outliers:\n\n{outliers_percentage(df)}''')
    
    
def shift_column(df, column_name, loc=None):
    """Return a deep copy of `df` with `column_name` moved to position `loc`.

    Parameters
    ----------
    df : DataFrame; left untouched.
    column_name : label of the column to move.
    loc : zero-based target position among the columns after the move.
        `None` (the default) moves the column to the end; `0` moves it to
        the front.

    Raises
    ------
    KeyError if `column_name` is not a column of `df`.
    """
    result = df.copy(deep=True)
    column_to_move = result.pop(column_name)
    if loc is None:  # explicit None check so that loc=0 is honoured
        # After the pop, this index is one past the last remaining column.
        loc = len(result.columns)
    # insert() needs the values as its third argument.
    result.insert(loc, column_name, column_to_move)
    return result


def plot_heatmap(df):
    """Draw a 12x12-inch heatmap of the pairwise correlations in `df`.

    Only numeric and boolean columns enter the correlation matrix. They are
    selected explicitly because `DataFrame.corr()` stopped silently ignoring
    non-numeric columns in pandas 2.0 and raises on them instead.
    """
    numeric = df.select_dtypes(include=['number', 'bool'])
    fig, ax = plt.subplots(figsize=(12, 12))
    heatmap = sns.heatmap(numeric.corr(), ax=ax)
    heatmap.set_title('Correlation Heatmap',
                      fontdict={'fontsize': 12},
                      pad=12)

    
def plot_model_performance(actual, predictions, title=''):
    """Plot predictions against actual values on the current pyplot figure.

    Predictions are drawn as points; the actual values are drawn against
    themselves as a red line, i.e. the line a perfect model would sit on.
    A title is set only when `title` is truthy. The figure is shown at the end.
    """
    # Draw order is kept: the points first, then the red reference line.
    series_to_draw = ((predictions, '.'), (actual, 'r'))
    for y_values, fmt in series_to_draw:
        plt.plot(actual, y_values, fmt)

    plt.ylabel('Predicted')
    plt.xlabel('Actual')
    if title:
        plt.title(title)
    plt.show()
    

def regression_model_metrics(actual, predictions, name=''):
    """Print MSE, RMSE and R2 (as a percentage) for a set of predictions.

    Each metric is computed at full precision and rounded to two decimals only
    when printed. Previously RMSE was the square root of the already-rounded
    MSE (an MSE of 0.0025 was reported with an RMSE of 0.0), and the R2
    percentage was derived from an R2 already rounded to two decimals.

    `name` is an optional label that prefixes every printed line.
    """
    mse = mean_squared_error(actual, predictions)
    rmse = np.sqrt(mse)
    r2_percent = r2_score(actual, predictions) * 100
    print(f'\
    {name} MSE: {round(mse, 2)} \n\
    {name} RMSE: {round(rmse, 2)} \n\
    {name} R2_score: {round(r2_percent, 2)} %')
    

def list_correlations(df, threshold=0.6, ascending=False):
    """List the column pairs whose absolute correlation exceeds `threshold`.

    Parameters
    ----------
    df : DataFrame; only its numeric and boolean columns are correlated
        (`DataFrame.corr()` raises on other dtypes from pandas 2.0 on).
    threshold : pairs are kept when |correlation| is strictly greater.
    ascending : sort order of the returned values.

    Returns
    -------
    Series of absolute correlations indexed by (column, column) pairs.
    Self-correlations are excluded; each pair appears in both orders.
    """
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr().abs()
    # Exclude the diagonal with a mask instead of writing into `corr.values`,
    # which is not guaranteed to be a writable view of the frame.
    off_diagonal = ~np.eye(len(corr), dtype=bool)
    strong = corr.where((corr > threshold) & off_diagonal)
    return strong.unstack().dropna().sort_values(ascending=ascending)

In [3]:
# Read both splits with their first CSV column as the index, then stack them
# (train rows first) into one frame for joint exploration.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=[0]) for csv_path in ('train.csv', 'test.csv')
)

df = pd.concat([df_train, df_test])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 122.7+ KB


## Data Exploration:
--- 
1. Null values:
    * Age = 263
    * Cabin = 1014 
        * 77% of Cabin data is Null values [Drop / find some utility]  
    * Fare has 1 null value (in the test set)
2. Text Data:
    * Name
    * Sex
    * Ticket
    * Cabin
    * Embarked
3. Numeric Data:
    * Survived
    * Pclass
    * Age
    * SibSp
    * Parch
    * Fare 
---    
#### Thoughts after running df.describe(): ####  
    * Survived mean 0.39, unbalanced Survived ratio?
    * Pclass is skewed more observations with values of 3
    * Age min 0.17 ? 
    * Age mean 30, fairly low considering max value is 80
    * SibSp, Parch and Fare are skewed to the right (see df.SibSp.hist())
       
#### Value Counts on objects: ####
    * Sex: double the number of males compared to females on board...
    * Name duplicates [the duplicates are from the test data and won't be included during model training]:  
        * Connolly, Miss. Kate
        * Kelly, Mr. James
    * Tickets lots of values dummy encoding will just make tons of unneeded features...
        * Maybe extract only the numbers and store as int?
        * What about values that don't contain any number?
    * Cabin 186 observations to dummy encode? 
        * Is there a better solution
    * Embarked has 3 distinct values...
        * Good for dummy encoding; however, 'S' has triple the number of occurrences of the other 2 values combined

In [161]:
df_clean = df.copy()

# Encode sex as an integer (male=1, female=0).
# The result is assigned back to the column. The previous
# `df_clean.Sex.replace(..., inplace=True)` mutated a Series obtained through
# attribute access, which pandas does not guarantee to write through to the
# frame (it is a no-op under copy-on-write).
# Note: any label other than 'male'/'female' becomes NaN.
df_clean['Sex'] = df_clean['Sex'].map({'male': 1, 'female': 0})

# Extract titles: the text after the first ', ' in the name, up to the next '.'
df_clean['Title'] = [name.split(', ')[1].split('.')[0] for name in df_clean['Name']]

# Group uncommon titles under a single 'uncommon' label
uncommon_titles = ['Rev', 'Dr', 'Col', 'Major', 'Mlle', 'Ms', 'Sir', 'Capt', 'Mme',
                   'Jonkheer', 'Lady', 'the Countess', 'Don', 'Dona']
is_uncommon = df_clean['Title'].isin(uncommon_titles)
df_clean['Title'] = df_clean['Title'].where(~is_uncommon, 'uncommon')

# df_clean.drop('Name', axis=1, inplace=True)  # No more 'value' to be extracted from name
df_clean = df_clean.drop(columns='Cabin')  # 77% NaN anyway we can impute data based on fare and title?

# Rows with a missing age, kept as an independent copy for inspection
null_age = df_clean[df_clean['Age'].isnull()].copy()

#  TODO: Impute median age based on Pclass and Sex
#  NOTE(review): as written, the draft below would overwrite every Age in a
#  group, not only the missing ones -- restrict it to null ages before enabling.
# for pclass in df_clean.Pclass.unique():
#         for gender in df_clean.Sex.unique():
#             subset_sex_pclass = df_clean[(df_clean.Sex == gender) & (df_clean.Pclass == pclass)]
#             impute_value = round(subset_sex_pclass.Age.median(), 1)

#             for index, row in subset_sex_pclass.iterrows():
#                 df_clean.loc[index, 'Age'] = impute_value
                


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6,0.0,3,"Moran, Mr. James",1,,0,0,330877,8.4583,Q,Mr
18,1.0,2,"Williams, Mr. Charles Eugene",1,,0,0,244373,13.0000,S,Mr
20,1.0,3,"Masselmani, Mrs. Fatima",0,,0,0,2649,7.2250,C,Mrs
27,0.0,3,"Emir, Mr. Farred Chehab",1,,0,0,2631,7.2250,C,Mr
29,1.0,3,"O'Dwyer, Miss. Ellen ""Nellie""",0,,0,0,330959,7.8792,Q,Miss
...,...,...,...,...,...,...,...,...,...,...,...
1300,,3,"Riordan, Miss. Johanna Hannah""""",0,,0,0,334915,7.7208,Q,Miss
1302,,3,"Naughton, Miss. Hannah",0,,0,0,365237,7.7500,Q,Miss
1305,,3,"Spector, Mr. Woolf",1,,0,0,A.5. 3236,8.0500,S,Mr
1308,,3,"Ware, Mr. Frederick",1,,0,0,359309,8.0500,S,Mr
