In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

# Load seaborn's example Titanic passenger dataset as a DataFrame.
# The crosstab and significance-test cells below all read from `titanic`.
titanic = sns.load_dataset('titanic')

In [2]:
# Preview the first row to see which columns the dataset provides
titanic.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False


In [3]:
# Contingency table: passenger class (rows) against survival flag (columns).
# margins=True appends a 'Total' row and column holding the marginal counts.
survival_class = pd.crosstab(
    titanic['class'],
    titanic['survived'],
    margins=True,
    margins_name='Total',
)

survival_class

survived,0,1,Total
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,80,136,216
Second,97,87,184
Third,372,119,491
Total,549,342,891


In [4]:
# Show the first three rows; `class`, `sex` and `survived` feed the next crosstab
titanic.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [5]:
# Contingency table with a two-level row index (class, then sex) against
# the survival flag; 'Total' margins are appended as in the previous table.
survival_gender_class = pd.crosstab(
    index=[titanic['class'], titanic['sex']],
    columns=titanic['survived'],
    margins=True,
    margins_name='Total',
)

survival_gender_class

Unnamed: 0_level_0,survived,0,1,Total
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,female,3,91,94
First,male,77,45,122
Second,female,6,70,76
Second,male,91,17,108
Third,female,72,72,144
Third,male,300,47,347
Total,,549,342,891


In [6]:
# Column-normalised crosstab: every survival column sums to 1, so each cell is
# the share of that outcome (died / survived) contributed by a passenger class.
normalized = pd.crosstab(
    titanic['class'],
    titanic['survived'],
    normalize='columns',
)
normalized

survived,0,1
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,0.145719,0.397661
Second,0.176685,0.254386
Third,0.677596,0.347953


In [7]:
# Row-normalised crosstab: every class row sums to 1, so each cell is the
# fraction of that class that died (0) or survived (1).
normalized_row = pd.crosstab(
    titanic['class'],
    titanic['survived'],
    normalize='index',
)

normalized_row

survived,0,1
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,0.37037,0.62963
Second,0.527174,0.472826
Third,0.757637,0.242363


In [8]:
from scipy.stats import fisher_exact

# Build the sex x survived contingency table.
# The odds ratio reported below is only meaningful for a 2x2 table, so check
# the shape explicitly rather than slicing with .iloc[:2, :2], which would
# silently drop any extra category and test an arbitrary sub-table.
table = pd.crosstab(titanic['sex'], titanic['survived'])
assert table.shape == (2, 2), (
    f"Fisher's exact test expects a 2x2 table, got shape {table.shape}"
)

# Apply Fisher's Exact Test.
# The first return value is the sample odds ratio of the table,
# (table[0, 0] * table[1, 1]) / (table[0, 1] * table[1, 0]).
odds_ratio, p = fisher_exact(table)

print("Odds Ratio:", odds_ratio)
print("p-value:", p)

Odds Ratio: 0.08096731594585672
p-value: 6.463921564583144e-60


In [9]:
from scipy.stats import chi2_contingency

# Reload the Titanic data so this cell starts from a fresh DataFrame
titanic = sns.load_dataset('titanic')

# Chi-squared test of independence on the sex x survived contingency table.
# Returns (statistic, p-value, degrees of freedom, expected frequencies); the
# expected-frequency array is discarded. For a 2x2 table (1 degree of freedom)
# SciPy applies Yates' continuity correction by default.
chi2, p, dof, _ = chi2_contingency(
    pd.crosstab(titanic['sex'], titanic['survived'])
)

print("Chi-Squared Statistic:", chi2)
print("p-value:", p)

Chi-Squared Statistic: 260.71702016732104
p-value: 1.197357062775565e-58


In [10]:
import statsmodels.stats.api as sms

# Load the penguins example dataset
penguins = sns.load_dataset('penguins')

# Keep only rows where both the species label and the flipper length are present
penguins = penguins.dropna(subset=['species', 'flipper_length_mm'])

# Two-sample t-test on flipper length, Adelie vs. Gentoo.
# sms.ttest_ind returns (t statistic, p-value, degrees of freedom).
t_stat, p, df = sms.ttest_ind(
    penguins.loc[penguins['species'] == 'Adelie', 'flipper_length_mm'],
    penguins.loc[penguins['species'] == 'Gentoo', 'flipper_length_mm'],
)

print("T-statistic:", t_stat)
print("p-value:", p)

T-statistic: -34.41495797176763
p-value: 4.211309078100972e-101


In [12]:
from scipy.stats import pearsonr

# Keep only penguins that have both a bill length and a flipper length
data = penguins.dropna(subset=['bill_length_mm', 'flipper_length_mm'])

# Pearson correlation between bill length and flipper length;
# pearsonr returns the coefficient followed by its p-value.
r_p, p = pearsonr(data['bill_length_mm'], data['flipper_length_mm'])

print("Pearson corr. coef.:", r_p)
print("p-value:", p)

Pearson corr. coef.: 0.6561813407464278
p-value: 1.7439736176207624e-43


In [13]:
from scipy.stats import spearmanr

# Spearman rank correlation on the same two columns of `data`
# (the NA-free frame prepared in the Pearson cell).
r_s, p = spearmanr(data['bill_length_mm'], data['flipper_length_mm'])

print("Spearman corr. coef.:", r_s)
print("p-value:", p)

Spearman corr. coef.: 0.6727719416255543
p-value: 2.0669356276079203e-46


In [14]:
from sklearn.datasets import load_iris
import numpy as np

# Load Iris as a DataFrame; `iris.frame` holds the feature columns plus `target`
iris = load_iris(as_frame=True)
iris_df = iris.frame

# Bin edges and names for petal length. pd.cut uses right-closed intervals by
# default, so the categories are (0, 2], (2, 4], (4, 6] and (6, 8].
bins = [0, 2, 4, 6, 8]
labels = ['Very Short', 'Short', 'Medium', 'Long']

# Attach the binned petal length to the frame as a categorical column
iris_df['petal_length_category'] = pd.cut(
    iris_df['petal length (cm)'],
    bins=bins,
    labels=labels,
)

# Crosstab: species code (`target`) vs. petal length category, with totals
species_petal = pd.crosstab(
    index=iris_df['target'],
    columns=iris_df['petal_length_category'],
    margins=True,
    margins_name='Total',
)

# Transpose for display so the length categories become the rows
species_petal.T

target,0,1,2,Total
petal_length_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Very Short,50,0,0,50
Short,0,16,0,16
Medium,0,34,41,75
Long,0,0,9,9
Total,50,50,50,150
