In [2]:
import pandas as pd
import seaborn as sns

# Load the Titanic passenger dataset via seaborn and preview the first five rows.
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
df.info()                    # Per-column dtype and non-null count; shows which columns have missing values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [4]:
df.describe()                 # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Summary (count, unique, top, freq) of the object-dtype columns only;
# the category-dtype columns `class` and `deck` are not covered by include='object'.
df.describe(include='object')


Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


In [6]:
# Reference lookup: print the docstring of DataFrame.describe (not needed by later cells).
help(df.describe)

Help on method describe in module pandas.core.generic:

describe(percentiles=None, include=None, exclude=None) -> 'Self' method of pandas.core.frame.DataFrame instance
    Generate descriptive statistics.
    
    Descriptive statistics include those that summarize the central
    tendency, dispersion and shape of a
    dataset's distribution, excluding ``NaN`` values.
    
    Analyzes both numeric and object series, as well
    as ``DataFrame`` column sets of mixed data types. The output
    will vary depending on what is provided. Refer to the notes
    below for more detail.
    
    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should
        fall between 0 and 1. The default is
        ``[.25, .5, .75]``, which returns the 25th, 50th, and
        75th percentiles.
    include : 'all', list-like of dtypes or None (default), optional
        A white list of data types to include in the result. Ignore

In [7]:
# Median passenger age; this is the value replace_nulls later substitutes for missing ages.
df['age'].median()

np.float64(28.0)

In [8]:
# Preview the object- and category-dtype columns, the same selection
# get_col_tyoes classifies as string columns.
df.select_dtypes(include=['object','category']).head()

Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive
0,male,S,Third,man,,Southampton,no
1,female,C,First,woman,C,Cherbourg,yes
2,female,S,Third,woman,,Southampton,yes
3,female,S,First,woman,C,Southampton,yes
4,male,S,Third,man,,Southampton,no


In [9]:
def get_col_types(df):
    """Split the columns of ``df`` into string-like columns and all the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified. It is not modified.

    Returns
    -------
    tuple[list, list]
        ``(string_cols, num_cols)``. ``string_cols`` lists the columns whose
        dtype is ``object`` or ``category``. ``num_cols`` lists every other
        column, in the frame's column order. The second list is built by
        exclusion, so it also contains non-string columns that are not
        strictly numeric (for example bool columns).
    """
    string_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    # Set lookup keeps the exclusion pass linear in the number of columns.
    string_set = set(string_cols)
    num_cols = [col for col in df.columns if col not in string_set]
    return string_cols, num_cols


# Backward-compatible alias: existing cells call the original, misspelled name.
get_col_tyoes = get_col_types

In [10]:
def replace_nulls(df,string_cols,num_cols):
    """Return a copy of ``df`` with missing values imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; all filling happens on a copy so
        the raw data stays available to the caller.
    string_cols : list
        Columns filled with their most frequent value (first mode).
    num_cols : list
        Columns filled with their column median.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``. A column in
        ``string_cols`` that has no non-null value at all has no mode and is
        returned unchanged instead of raising.
    """
    filled = df.copy()

    if len(num_cols) > 0:
        filled[num_cols] = filled[num_cols].fillna(filled[num_cols].median())

    for col in string_cols:
        modes = filled[col].mode()
        # mode() is empty when every value is null; nothing sensible to fill with.
        if modes.empty:
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

In [11]:
def features_eng(df):
    """Add the ``family_size`` and ``is_alone`` columns to ``df`` and return it.

    The frame is modified in place (existing callers rely on that) and is
    also returned, so the call can be chained or assigned like
    ``replace_nulls``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``sibsp`` and ``parch`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with two extra columns:
        ``family_size`` (``sibsp + parch + 1``) and ``is_alone`` (1 when
        ``family_size`` equals 1, otherwise 0).

    Notes
    -----
    On the Titanic frame used in this notebook ``is_alone`` correlates with
    ``survived`` exactly like the dataset's own ``alone`` column, so the flag
    appears to duplicate it.
    """
    # Create a 'family_size' column (the +1 presumably counts the passenger themself).
    df['family_size'] = df['sibsp'] + df['parch'] + 1

    # Binary flag: was the person alone?
    df['is_alone'] = (df['family_size'] == 1).astype(int)

    return df
    
    

In [12]:
# Split column names into string-like (object/category) and all remaining columns.
str_cols,num_cols=get_col_tyoes(df)


In [13]:
# Impute missing values: column medians for num_cols, most frequent value for str_cols.
df_clean = replace_nulls(df,str_cols,num_cols)

In [14]:
# Add the family_size and is_alone columns to df_clean (features_eng modifies the frame it receives).
features_eng(df_clean)

In [15]:

# Correlation of each numeric/bool column with `survived`, sorted from most
# positive to most negative. `alone` and `is_alone` score identically, which
# indicates the engineered flag duplicates the dataset's existing `alone` column.
df_clean.corr(numeric_only=True)['survived'].sort_values(ascending=False)


survived       1.000000
fare           0.257307
parch          0.081629
family_size    0.016639
sibsp         -0.035322
age           -0.064910
alone         -0.203367
is_alone      -0.203367
pclass        -0.338481
adult_male    -0.557080
Name: survived, dtype: float64

In [16]:
# Survival rate by sex (rows) and passenger class (columns): the mean of the
# 0/1 `survived` flag is the share of passengers in that cell who survived.
# Reads df_clean like the other post-cleaning cells instead of the raw `df` name.
pd.pivot_table(df_clean, values='survived', index='sex', columns='pclass', aggfunc='mean')


pclass,1,2,3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [17]:
# One-hot encode `sex` and `embarked`; drop_first=True drops the first dummy of
# each column to avoid a redundant indicator. The other object/category columns
# are left unencoded.
encoded_df = pd.get_dummies(df_clean, columns=['sex', 'embarked'], drop_first=True)


In [19]:
# Reference lookup: print the docstring of pd.pivot_table (not needed by later cells).
help(pd.pivot_table)

Help on function pivot_table in module pandas.core.reshape.pivot:

pivot_table(data: 'DataFrame', values=None, index=None, columns=None, aggfunc: 'AggFuncType' = 'mean', fill_value=None, margins: 'bool' = False, dropna: 'bool' = True, margins_name: 'Hashable' = 'All', observed: 'bool | lib.NoDefault' = <no_default>, sort: 'bool' = True) -> 'DataFrame'
    Create a spreadsheet-style pivot table as a DataFrame.
    
    The levels in the pivot table will be stored in MultiIndex objects
    (hierarchical indexes) on the index and columns of the result DataFrame.
    
    Parameters
    ----------
    data : DataFrame
    values : list-like or scalar, optional
        Column or columns to aggregate.
    index : column, Grouper, array, or list of the previous
        Keys to group by on the pivot table index. If a list is passed,
        it can contain any of the other types (except list). If an array is
        passed, it must be the same length as the data and will be used in
        the 

In [23]:
# Number of passengers in each sex/class cell (aggfunc='count' counts the
# non-null `survived` values), kept as pivot_counts for the follow-up cells.
pivot_counts = pd.pivot_table(df_clean,values='survived', index='sex', columns='pclass',aggfunc='count')
pivot_counts

pclass,1,2,3
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,94,76,144
male,122,108,347


In [34]:
# Scratch: a step of -1 with no bounds walks the string backwards, i.e. reverses it.
s = 'vishnu'
s[::-1]

'unhsiv'

In [43]:
# Scratch: start at index 4 ('n') and step back two characters at a time -> 'nsv'.
s[4::-2]

'nsv'

In [24]:
# Passengers per class: column totals of the sex-by-class count table.
class_totals = pivot_counts.sum(axis=0)
# Share of each sex within its class; axis=1 aligns the totals with the pclass columns.
sex_share_by_class = pivot_counts.div(class_totals, axis=1)
class_totals

pclass
1    216
2    184
3    491
dtype: int64