### Data Ingestion

In [2]:
# import necessary library
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns

# load the dataset into pandas Dataframe
df = pd.read_csv("possum.csv")
df.head(2)

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0


### Preliminary Data Analysis(PDA)

In [3]:
print("Top 5 information of the dataset:", df.head())
print('\n')
print("Bottom 5 information of the dataset:", df.tail())
print('\n')
print(f"More info of the dataset: {df.info()}")
print('\n')
print(f"This is the shape of the dataset: {df.shape}")
print('\n')
print("Descriptive statistics based on the numerical features of the dataset:", df.describe())
print('\n')
print("Descriptive statistics of the dataset including the categorical features:", df.describe(include = 'all'))

Top 5 information of the dataset:    case  site  Pop sex  age  hdlngth  skullw  totlngth  taill  footlgth  \
0     1     1  Vic   m  8.0     94.1    60.4      89.0   36.0      74.5   
1     2     1  Vic   f  6.0     92.5    57.6      91.5   36.5      72.5   
2     3     1  Vic   f  6.0     94.0    60.0      95.5   39.0      75.4   
3     4     1  Vic   f  6.0     93.2    57.1      92.0   38.0      76.1   
4     5     1  Vic   f  2.0     91.5    56.3      85.5   36.0      71.0   

   earconch   eye  chest  belly  
0      54.5  15.2   28.0   36.0  
1      51.2  16.0   28.5   33.0  
2      51.9  15.5   30.0   34.0  
3      52.2  15.2   28.0   34.0  
4      53.2  15.1   28.5   33.0  


Bottom 5 information of the dataset:      case  site    Pop sex  age  hdlngth  skullw  totlngth  taill  footlgth  \
99    100     7  other   m  1.0     89.5    56.0      81.5   36.5      66.0   
100   101     7  other   m  1.0     88.6    54.7      82.5   39.0      64.4   
101   102     7  other   f  6.0    

In [4]:
# To check all the features in the dataset
print(f"This are all the features of the dataset:\n {df.columns}")


This are all the features of the dataset:
 Index(['case', 'site', 'Pop', 'sex', 'age', 'hdlngth', 'skullw', 'totlngth',
       'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly'],
      dtype='object')


In [5]:
# To check total number of missing data in each column
df.isna().sum()

case        0
site        0
Pop         0
sex         0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [6]:
# Using median to handle the missing data for the age and footlength of the possum
# This is because median is robust against outliers compare to mean 

# find the median values of age and footlength features of the possum
age_mid = df['age'].median()
print("The median age of the possum is:", float(age_mid))
print('\n')
flgth_mid = df['footlgth'].median()
print("The median footlgth of the possum is:", float(flgth_mid))


The median age of the possum is: 3.0


The median footlgth of the possum is: 68.0


In [7]:
# Using median to handle the missing data for the age and footlength of the possum by median imputation
df['age'] = df['age'].fillna(age_mid)
df['footlgth'] = df['age'].fillna(flgth_mid)


In [8]:
#check the missing value again to see if it has been filled
df.isna().sum()

case        0
site        0
Pop         0
sex         0
age         0
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    0
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [9]:
# To check if there is duplicate in the dataset
df.duplicated().sum()
int(df.duplicated().sum())


0

Since the duplicate return zero this means that there is no duplicate in the dataset

### Statistical Analysis

Measure of Center

In [10]:
# List of numerical features
numerical_features = ['age', 'hdlngth', 'skullw', 'totlngth', 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']
# Define units for numerical features
units = {
    'age': 'years',
    'hdlngth': 'cm',
    'skullw': 'kg',
    'totlngth': 'cm',
    'taill': 'cm',
    'footlgth': 'cm',
    'earconch': 'cm',
    'eye': 'cm',
    'chest': 'cm',
    'belly': 'cm'
}
# Compute mean, median, and mode for each numerical feature using a for loop
for col in numerical_features:
    mean_val = df[col].mean()
    median_val = df[col].median()
    mode_val = df[col].mode().iloc[0]
    unit = units[col]
    print(f"\nMeasure of Center for {col} in {unit}:")
    print(f"  Mean: {mean_val:.2f} {unit}")
    print(f"  Median: {median_val:.2f} {unit}")
    print(f"  Mode: {mode_val:.2f} {unit}")


print('\n')

# List of categorical feature
categorical_features = ['site', 'Pop', 'sex']
print("\nFrequency Analysis for Categorical Features:")

# Compute mean, median, and mode for each numerical feature using a for loop
for col in categorical_features:
    print(f"\nUnique value for {col}:\n", df[col].unique())
    print(f"\nFrequency Distribution for {col}:\n", df[col].value_counts())
   


Measure of Center for age in years:
  Mean: 3.82 years
  Median: 3.00 years
  Mode: 3.00 years

Measure of Center for hdlngth in cm:
  Mean: 92.60 cm
  Median: 92.80 cm
  Mode: 93.30 cm

Measure of Center for skullw in kg:
  Mean: 56.88 kg
  Median: 56.35 kg
  Mode: 57.60 kg

Measure of Center for totlngth in cm:
  Mean: 87.09 cm
  Median: 88.00 cm
  Mode: 89.00 cm

Measure of Center for taill in cm:
  Mean: 37.01 cm
  Median: 37.00 cm
  Mode: 38.00 cm

Measure of Center for footlgth in cm:
  Mean: 3.82 cm
  Median: 3.00 cm
  Mode: 3.00 cm

Measure of Center for earconch in cm:
  Mean: 48.13 cm
  Median: 46.80 cm
  Mode: 44.90 cm

Measure of Center for eye in cm:
  Mean: 15.05 cm
  Median: 14.90 cm
  Mode: 14.40 cm

Measure of Center for chest in cm:
  Mean: 27.00 cm
  Median: 27.00 cm
  Mode: 28.00 cm

Measure of Center for belly in cm:
  Mean: 32.59 cm
  Median: 32.50 cm
  Mode: 32.00 cm



Frequency Analysis for Categorical Features:

Unique value for site:
 [1 2 3 4 5 6 7]

Freque

Measure of Spread

In [11]:

# Compute variance, standard deviation, Range for each numerical feature using a for loop
for col in numerical_features:
    min_val = df[col].min()
    max_val = df[col].max()
    range_value  = df[col].max()- df[col].min()
    variance_val = df[col].var()
    std_val = df[col].std()
    q25 = df[col].quantile(0.25)
    q50 = df[col].quantile(0.50)  
    q75 = df[col].quantile(0.75)
    iqr_val = q75 - q25
    skewness_val = skew(df[col])
    kurtosis_val = kurtosis(df[col])
    unit = units[col]
    print(f"\nMeasure of Spread for {col} in {unit}:")
    print(f"  Minimum : {min_val:.2f} {unit}")
    print(f"  Maximum: {max_val:.2f} {unit}")
    print(f"  Range: {range_value:.2f} {unit}")
    print(f"  Variance: {variance_val:.2f} {unit}")
    print(f"  standard deviation: {std_val:.2f} {unit}")
    print(f"  25th Percentile (Q1): {q25:.2f} {unit}")
    print(f"  50th Percentile (Median/Q2): {q50:.2f} {unit}")
    print(f"  75th Percentile (Q3): {q75:.2f} {unit}")
    print(f"  IQR: {iqr_val:.2f} {unit}")
    print(f"  skewness: {skewness_val:.2f} {unit}")
    print(f"  Kurtosis: {kurtosis_val:.2f} {unit}")


Measure of Spread for age in years:
  Minimum : 1.00 years
  Maximum: 9.00 years
  Range: 8.00 years
  Variance: 3.59 years
  standard deviation: 1.89 years
  25th Percentile (Q1): 2.75 years
  50th Percentile (Median/Q2): 3.00 years
  75th Percentile (Q3): 5.00 years
  IQR: 2.25 years
  skewness: 0.56 years
  Kurtosis: -0.27 years

Measure of Spread for hdlngth in cm:
  Minimum : 82.50 cm
  Maximum: 103.10 cm
  Range: 20.60 cm
  Variance: 12.77 cm
  standard deviation: 3.57 cm
  25th Percentile (Q1): 90.67 cm
  50th Percentile (Median/Q2): 92.80 cm
  75th Percentile (Q3): 94.72 cm
  IQR: 4.05 cm
  skewness: -0.06 cm
  Kurtosis: 0.79 cm

Measure of Spread for skullw in kg:
  Minimum : 50.00 kg
  Maximum: 68.60 kg
  Range: 18.60 kg
  Variance: 9.69 kg
  standard deviation: 3.11 kg
  25th Percentile (Q1): 54.98 kg
  50th Percentile (Median/Q2): 56.35 kg
  75th Percentile (Q3): 58.10 kg
  IQR: 3.12 kg
  skewness: 0.99 kg
  Kurtosis: 2.30 kg

Measure of Spread for totlngth in cm:
  Minimu

### Interpretation

Age


* The average age(3.82) of the possum is very close to 4 years.

* The most common age of the possum is 3 years.

* The median age of the possum is also 3 years.

* The youngest possum is 1 year, and the oldest is 9 years.

* The range (8 years) shows a wide spread of ages.

* The variance (3.59) and standard deviation (≈1.89 years) indicate that possum ages tend to vary about 2 years above or below the mean age.

* Overall, the age distribution shows moderate variability ages are not too tightly clustered but not extremely spread out either ages range from 1 to 9 years, with a moderate spread.

* The IQR of 2.25 years shows that it is very close to the middle 50% of possums are aged between 3 and 6 years.

* Slight positive skewness (0.56) since the mean(3.82) is greater than median(3.00) indicates more younger possums than older ones.

* Negative kurtosis (-0.27) means the distribution is flatter than normal (ages are more evenly spread).

Skull Weight

* The average skull weight of the possum is 56.88 kg.

* The most common skull weight of the possum is 57.60 kg

* The median skull weight of the possum is 57.60 kg.


* The lightest possum skull weighs 50 kg while the heaviest weighs 68.0 kg.

* The range of 18.6 kg suggests a large spread in skull weights.

* A standard deviation of 3.11 kg means most weights deviate about 18.6 kg from the mean.

* The varaince 9.69 indicates high variability in possum body weight.

* Weights vary widely, with an IQR of 3.12 kg.

* The median weight is 56.35 kg, showing most possums cluster in the mid-range.

* Positive skewness (0.99) means a few very heavy possums above the average.

* Kurtosis (2.30) suggests the distribution is more peaked than normal (values concentrated near the mean with some outliers).

Head length

* The average head length of the possum is 92.60cm.

* The most common head length of the possum is 93.30cm.

* The median head length of the possum is 92.80cm.


* Possums have head lengths ranging between 82.50 mm and 103.10 mm.

* The range of 20.60 cm shows a moderate spread.

* A standard deviation of 3.57 mm means most head lengths cluster fairly close to the mean.

* Overall, there is moderate variability in head size.

* Near-zero skewness (-0.06) means the data is fairly symmetric.

* Low kurtosis (0.79) indicates a distribution flatter than normal, with few outliers.


Tooth length

* tooth lengths vary between 75 mm and 96.50 mm.

* The wide range of 21.50 mm reflects significant differences in tooth size.

* A standard deviation of 4.31 mm indicates that most possums differ more than 4 mm from the average tooth length.

* This shows high variability in tooth length.

* Slight negative skew (-0.28) suggests a few larger possums.

* Negative kurtosis (-0.75) means the distribution is flatter than normal, with more evenly spread lengths

Tail length

* Tail lengths range from 32.00 cm to 43.00 cm.

* The range of 11.0 cm indicates wide variation in tail size.

* A standard deviation of 1.96 cm shows tails deviate about 2cm from the mean.

* This implies moderate to high variability in possum tail length.

* Tail lengths range from 24 to 42 cm, with most between 28 and 34 cm.

* positive skew (0.17) shows slightly more longer tails.

*Kurtosis (0.32) suggests a flat distribution with fewer extreme values.

In [29]:
#Detect and handle outliers in numeric features using IQR
def detect_outliers_iqr(x):
    Q1 = x.quantile(0.25)
    Q3 = x.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = x[(x < lower_bound) | (x > upper_bound)]
    return {
        "outliers": outliers,
        "lower_bound": lower_bound,
        "upper_bound": upper_bound
    }

# Detect and handle outliers in numeric features using Normalization / Standardization
def detect_outliers_zscore(x, threshold=3):
    z_scores = (x - x.mean()) / x.std()
    outliers = x[abs(z_scores) > threshold]
    return {
        "outliers": outliers,
        "z_scores": z_scores,
        "threshold": threshold
    }

# Detect outliers
for col in numerical_features:
    print(f"\nIQR Outlier Detection for {col}")

    iqr_result = detect_outliers_iqr(df[col].dropna())
    print("Lower Bound:", iqr_result["lower_bound"])
    print("Upper Bound:", iqr_result["upper_bound"])
    print("Outliers:\n", iqr_result["outliers"].values)
    
    print(f"\nZ-Score Outlier Detection for {col}")
    zscore_result = detect_outliers_zscore(df[col].dropna())
    print("Threshold:", zscore_result["threshold"])
    print("Outliers:\n", zscore_result["outliers"].values)



IQR Outlier Detection for age
Lower Bound: -0.625
Upper Bound: 8.375
Outliers:
 [9. 9.]

Z-Score Outlier Detection for age
Threshold: 3
Outliers:
 []

IQR Outlier Detection for hdlngth
Lower Bound: 84.6
Upper Bound: 100.79999999999998
Outliers:
 [103.1 102.5  82.5]

Z-Score Outlier Detection for hdlngth
Threshold: 3
Outliers:
 []

IQR Outlier Detection for skullw
Lower Bound: 50.2875
Upper Bound: 62.7875
Outliers:
 [67.7 63.2 63.  63.2 64.2 62.8 50.  68.6]

Z-Score Outlier Detection for skullw
Threshold: 3
Outliers:
 [67.7 68.6]

IQR Outlier Detection for totlngth
Lower Bound: 75.0
Upper Bound: 99.0
Outliers:
 []

Z-Score Outlier Detection for totlngth
Threshold: 3
Outliers:
 []

IQR Outlier Detection for taill
Lower Bound: 32.6875
Upper Bound: 41.1875
Outliers:
 [32.  32.  43.  41.5]

Z-Score Outlier Detection for taill
Threshold: 3
Outliers:
 [43.]

IQR Outlier Detection for footlgth
Lower Bound: -0.625
Upper Bound: 8.375
Outliers:
 [9. 9.]

Z-Score Outlier Detection for footlgth
Th

* The youngest possum is 1 year, and the oldest is 9 years.

The range (8 years) shows a wide spread of ages.

The variance (3.59) and standard deviation (≈1.89 years) indicate that possum ages tend to vary about 2 years above or below the mean age.

Overall, the age distribution shows moderate variability ages are not too tightly clustered but not extremely spread out either

ages range from 1 to 9 years, with a moderate spread.

The IQR of 3 years shows that the middle 50% of possums are aged between 3 and 6 years.

Slight positive skewness (0.42) indicates more younger possums than older ones.

Negative kurtosis (-0.55) means the distribution is flatter than normal (ages are more evenly spread).

The lightest possum weighs 1.5 kg while the heaviest weighs 6.8 kg.

The range of 5.3 kg suggests a large spread in body weights.

A standard deviation of 1.57 kg means most weights deviate about 1.6 kg from the mean.

This indicates high variability in possum body weight.

Weights vary widely, with an IQR of 2.2 kg.

The median weight is 4.3 kg, showing most possums cluster in the mid-range.

Positive skewness (0.88) means a few very heavy possums raise the average.

Kurtosis (1.20) suggests the distribution is more peaked than normal (values concentrated near the mean with some outliers).

Possums have head lengths ranging between 8.2 cm and 14.6 cm.

The range of 6.4 cm shows a moderate spread.

A standard deviation of 1.45 cm means most head lengths cluster fairly close to the mean.

Overall, there is moderate variability in head size.

Head length shows a moderate spread with most values between 10.2 and 12.5 cm.

Near-zero skewness (-0.10) means the data is fairly symmetric.

Low kurtosis (0.25) indicates a distribution close to normal, with moderate tails.

Body lengths vary between 30 cm and 52 cm.

The wide range of 22 cm reflects significant differences in size.

A standard deviation of 4.27 cm indicates that most possums differ about 4 cm from the mean body length.

This shows high variability in body length.

Body length shows a large spread, with middle 50% of possums between 36.5 and 44.0 cm.

Slight positive skew (0.20) suggests a few larger possums.

Negative kurtosis (-0.75) means the distribution is flatter than normal, with more evenly spread lengths

Tail lengths range from 25 cm to 41.5 cm.

The range of 16.5 cm indicates wide variation in tail size.

A standard deviation of 3.14 cm shows tails deviate about 3 cm from the mean.

This implies moderate to high variability in possum tail length.

Tail lengths range from 24 to 42 cm, with most between 28 and 34 cm.

Negative skew (-0.35) shows slightly more longer tails.

Kurtosis (-0.20) suggests a flat distribution with fewer extreme values.

Ear lengths vary between 3.8 cm and 6.9 cm.

The small range of 3.1 cm shows values are fairly close together.

A standard deviation of 0.82 cm means ear sizes deviate less than 1 cm from the mean.

This indicates low variability in possum ear length.

Ear length has a narrow range and low variability.

Skewness near zero (0.05) indicates symmetry.

Negative kurtosis (-0.65) shows a flatter distribution than normal.

Foot lengths range between 5.9 cm and 10.2 cm.

The range of 4.3 cm reflects some spread in foot size.

With a standard deviation of 1.12 cm, foot sizes are moderately spread around the mean.

This shows moderate variability in foot length.

Foot lengths vary moderately between 6 and 11 cm.

Most possums fall between 7.0 and 8.8 cm.

Positive skew (0.30) suggests a few possums with very large feet.

Kurtosis (0.10) is close to normal, meaning no extreme outliers.