## Project Description
The aim of this project is to perform statistical analysis of our data. The dataset contains details on the sizes of different body parts of a Possum, our objective is to understand and draw insights from our data by utilizing various statistical techniques.

## Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [3]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## Data Ingestion

In [4]:
! pip install kagglehub --quiet
import kagglehub
from pathlib import Path
path = kagglehub.dataset_download("abrambeyer/openintro-possum")
possum = Path('possum.csv')
path_full = path / possum

print("Path to dataset files:", path)

df = pd.read_csv(path_full)
df.head(50)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\user47\.cache\kagglehub\datasets\abrambeyer\openintro-possum\versions\1


Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
5,6,1,Vic,f,1.0,93.1,54.8,90.5,35.5,73.2,53.6,14.2,30.0,32.0
6,7,1,Vic,m,2.0,95.3,58.2,89.5,36.0,71.5,52.0,14.2,30.0,34.5
7,8,1,Vic,f,6.0,94.8,57.6,91.0,37.0,72.7,53.9,14.5,29.0,34.0
8,9,1,Vic,f,9.0,93.4,56.3,91.5,37.0,72.4,52.9,15.5,28.0,33.0
9,10,1,Vic,f,6.0,91.8,58.0,89.5,37.5,70.9,53.4,14.4,27.5,32.0


## Preliminary Data Analysis

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [6]:
#Missing Values
df.isna().sum()

case        0
site        0
Pop         0
sex         0
age         2
hdlngth     0
skullw      0
totlngth    0
taill       0
footlgth    1
earconch    0
eye         0
chest       0
belly       0
dtype: int64

In [7]:
#Check for duplicates
df.duplicated().sum()

np.int64(0)

In [8]:
df.describe(include='all')

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
count,104.0,104.0,104,104,102.0,104.0,104.0,104.0,104.0,103.0,104.0,104.0,104.0,104.0
unique,,,2,2,,,,,,,,,,
top,,,other,m,,,,,,,,,,
freq,,,58,61,,,,,,,,,,
mean,52.5,3.625,,,3.833333,92.602885,56.883654,87.088462,37.009615,68.459223,48.130769,15.046154,27.0,32.586538
std,30.166206,2.349086,,,1.909244,3.573349,3.113426,4.310549,1.959518,4.395306,4.10938,1.050374,2.045597,2.761949
min,1.0,1.0,,,1.0,82.5,50.0,75.0,32.0,60.3,40.3,12.8,22.0,25.0
25%,26.75,1.0,,,2.25,90.675,54.975,84.0,35.875,64.6,44.8,14.4,25.5,31.0
50%,52.5,3.0,,,3.0,92.8,56.35,88.0,37.0,68.0,46.8,14.9,27.0,32.5
75%,78.25,6.0,,,5.0,94.725,58.1,90.0,38.0,72.5,52.0,15.725,28.0,34.125


In [9]:
print("Location")
print(df['Pop'].value_counts())
print('\n')
print(df['sex'].value_counts())

Location
Pop
other    58
Vic      46
Name: count, dtype: int64


sex
m    61
f    43
Name: count, dtype: int64


## Data Cleaning

In [10]:
df.dropna(axis = 0, inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      101 non-null    int64  
 1   site      101 non-null    int64  
 2   Pop       101 non-null    object 
 3   sex       101 non-null    object 
 4   age       101 non-null    float64
 5   hdlngth   101 non-null    float64
 6   skullw    101 non-null    float64
 7   totlngth  101 non-null    float64
 8   taill     101 non-null    float64
 9   footlgth  101 non-null    float64
 10  earconch  101 non-null    float64
 11  eye       101 non-null    float64
 12  chest     101 non-null    float64
 13  belly     101 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.8+ KB


In [11]:
df_copy = df.copy()
df = df.set_index('case')

## Exploratory Data Analysis

In [12]:
df_numerical = df.iloc[0:, 3:]
df_numerical.head()

Unnamed: 0_level_0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
2,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
3,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
4,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
5,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


### Measures of Center

In [13]:
def create_df(data):
    central_measures = ['mean', 'median']
    dic_central = {}
    for i in central_measures:
        dic_central[i] = data.aggregate(i)
    dic_central

    data_central = pd.DataFrame(dic_central, index = data.columns)

    data_central['mode'] = [stats.mode(df[feature])[0] for feature in data_central.index]
    return data_central

data_central = create_df(df_numerical)
data_central




Unnamed: 0,mean,median,mode
age,3.821782,3.0,3.0
hdlngth,92.730693,92.9,93.3
skullw,56.960396,56.4,57.6
totlngth,87.269307,88.0,89.0
taill,37.049505,37.0,38.0
footlgth,68.39802,67.9,73.2
earconch,48.133663,46.8,44.9
eye,15.050495,14.9,14.5
chest,27.064356,27.0,28.0
belly,32.638614,32.5,32.0


From the information above, we can conclude that the
* Possums in our sample data has an average age of 3.8, median value of 3 and many of them were aged 3.
* Similarly, the average head length of Possums is 92.73, median value of 92.9  and the value with the highest frequency is 93.3.

### Some Observations
* Most of features are normal distributions as their mean, median and mode are nearly equal
* From our data, we notice that Possum have a bigger head length than their total length. Hence from our measures of center alone, we can conclude that their head takes a large percentage of their body size.

Next, we want to compare their sizes across gender and locations


### Gender

In [14]:
df_male = df[df['sex'] == 'm']
df_male_num = df_male.iloc[0:, 3:]

df_female = df[df['sex'] == 'f']
df_female_num = df_female.iloc[0:, 3:]

df_male_central = create_df(df_male_num)
df_female_central = create_df(df_female_num)

print('Males Only')
print(df_male_central)
print('\n')
print('Females Only')
print(df_female_central)

Males Only
               mean  median  mode
age        3.728814     3.0   3.0
hdlngth   93.081356    93.3  93.3
skullw    57.232203    56.6  57.6
totlngth  86.732203    86.0  89.0
taill     37.000000    36.5  38.0
footlgth  67.889831    66.5  73.2
earconch  47.677966    46.2  44.9
eye       15.238983    15.0  14.5
chest     26.864407    27.0  28.0
belly     32.423729    32.0  32.0


Females Only
               mean  median  mode
age        3.952381    3.50   3.0
hdlngth   92.238095   92.50  93.3
skullw    56.578571   56.35  57.6
totlngth  88.023810   88.75  89.0
taill     37.119048   37.75  38.0
footlgth  69.111905   70.45  73.2
earconch  48.773810   50.80  44.9
eye       14.785714   14.75  14.5
chest     27.345238   28.00  28.0
belly     32.940476   33.00  32.0


In [15]:
df_male_central.rename(columns = {i: f'males_{i}' for i in df_male_central.columns}, inplace = True)
df_female_central.rename(columns = {i: f'females_{i}' for i in df_female_central.columns}, inplace = True)
df_sex_central = pd.concat([df_male_central, df_female_central], axis = 1)
df_sex_central

Unnamed: 0,males_mean,males_median,males_mode,females_mean,females_median,females_mode
age,3.728814,3.0,3.0,3.952381,3.5,3.0
hdlngth,93.081356,93.3,93.3,92.238095,92.5,93.3
skullw,57.232203,56.6,57.6,56.578571,56.35,57.6
totlngth,86.732203,86.0,89.0,88.02381,88.75,89.0
taill,37.0,36.5,38.0,37.119048,37.75,38.0
footlgth,67.889831,66.5,73.2,69.111905,70.45,73.2
earconch,47.677966,46.2,44.9,48.77381,50.8,44.9
eye,15.238983,15.0,14.5,14.785714,14.75,14.5
chest,26.864407,27.0,28.0,27.345238,28.0,28.0
belly,32.423729,32.0,32.0,32.940476,33.0,32.0


In [16]:
df_sex_central['mean_sex_diff'] = df_sex_central['males_mean'] - df_sex_central['females_mean']
df_sex_central['median_sex_diff'] = df_sex_central['males_median'] - df_sex_central['females_median']
df_sex_central

Unnamed: 0,males_mean,males_median,males_mode,females_mean,females_median,females_mode,mean_sex_diff,median_sex_diff
age,3.728814,3.0,3.0,3.952381,3.5,3.0,-0.223567,-0.5
hdlngth,93.081356,93.3,93.3,92.238095,92.5,93.3,0.843261,0.8
skullw,57.232203,56.6,57.6,56.578571,56.35,57.6,0.653632,0.25
totlngth,86.732203,86.0,89.0,88.02381,88.75,89.0,-1.291606,-2.75
taill,37.0,36.5,38.0,37.119048,37.75,38.0,-0.119048,-1.25
footlgth,67.889831,66.5,73.2,69.111905,70.45,73.2,-1.222074,-3.95
earconch,47.677966,46.2,44.9,48.77381,50.8,44.9,-1.095843,-4.6
eye,15.238983,15.0,14.5,14.785714,14.75,14.5,0.453269,0.25
chest,26.864407,27.0,28.0,27.345238,28.0,28.0,-0.480831,-1.0
belly,32.423729,32.0,32.0,32.940476,33.0,32.0,-0.516747,-1.0


### Observations
From the table above, we make the following observations:
* Females are older than males on an average
* Females are bigger than their male counterparts except  in head length.
* Males have a higher skull width than females.
* Males have bigger eyes than females.

### Location

In [17]:
df_vic = df[df['Pop'] == 'Vic']
df_vic_num = df_vic.iloc[0:, 3:]

df_other = df[df['Pop'] == 'other']
df_other_num = df_other.iloc[0:, 3:]

df_vic_central = create_df(df_vic_num)
df_other_central = create_df(df_other_num)

print('Possums caught in Vic')
print(df_vic_central)
print('\n')
print('Possums caught in other locations')
print(df_other_central)

Possums caught in Vic
               mean  median  mode
age        4.000000     3.0   3.0
hdlngth   92.897674    93.3  93.3
skullw    56.818605    56.3  57.6
totlngth  87.918605    89.0  89.0
taill     35.953488    36.0  38.0
footlgth  72.437209    72.8  73.2
earconch  52.476744    52.2  44.9
eye       14.869767    14.9  14.5
chest     27.627907    28.0  28.0
belly     32.790698    33.0  32.0


Possums caught in other locations
               mean  median  mode
age        3.689655     3.0   3.0
hdlngth   92.606897    92.4  93.3
skullw    57.065517    56.4  57.6
totlngth  86.787931    86.5  89.0
taill     37.862069    38.0  38.0
footlgth  65.403448    65.2  73.2
earconch  44.913793    44.9  44.9
eye       15.184483    15.0  14.5
chest     26.646552    26.0  28.0
belly     32.525862    32.5  32.0


In [18]:
df_vic_central.rename(columns = {i: f'vic_{i}' for i in df_vic_central.columns}, inplace = True)
df_other_central.rename(columns = {i: f'other_{i}' for i in df_other_central.columns}, inplace = True)
df_location_central = pd.concat([df_vic_central, df_other_central], axis = 1)
df_location_central

Unnamed: 0,vic_mean,vic_median,vic_mode,other_mean,other_median,other_mode
age,4.0,3.0,3.0,3.689655,3.0,3.0
hdlngth,92.897674,93.3,93.3,92.606897,92.4,93.3
skullw,56.818605,56.3,57.6,57.065517,56.4,57.6
totlngth,87.918605,89.0,89.0,86.787931,86.5,89.0
taill,35.953488,36.0,38.0,37.862069,38.0,38.0
footlgth,72.437209,72.8,73.2,65.403448,65.2,73.2
earconch,52.476744,52.2,44.9,44.913793,44.9,44.9
eye,14.869767,14.9,14.5,15.184483,15.0,14.5
chest,27.627907,28.0,28.0,26.646552,26.0,28.0
belly,32.790698,33.0,32.0,32.525862,32.5,32.0


### Observations
From the table above, we make the following observations:
* Possums caught in Victoria are generally older than Possums caught in other locations.
* Possums caught in Victoria are generally bigger than Possums caught in other locatons.
* Males have a higher skull width than females.
* Males have bigger eyes than females.

## Measures of Spread

In [19]:
def create_spread(data):
    data['skew'] = [stats.skew(df[feature]) for feature in data.index]
    data['kurtosis'] = [stats.kurtosis(df[feature]) for feature in data.index]
    data['Q1'] = [np.quantile(df[feature], 0.25) for feature in data_central.index]
    data['Q3'] = [np.quantile(df[feature], 0.75) for feature in data_central.index]
    data['IQR'] = data_central['Q3'] - data['Q1']
    data['Range'] = [max(df[feature]) - min(df[feature]) for feature in data.index]
    data['std'] = [np.std(df[feature]) for feature in data.index]
    data['var'] = [np.var(df[feature]) for feature in data.index]
    data['CV'] = data['std'] / data['mean']
    return data

data_spread = create_spread(data_central)
data_spread



Unnamed: 0,mean,median,mode,skew,kurtosis,Q1,Q3,IQR,Range,std,var,CV
age,3.821782,3.0,3.0,0.547475,-0.317807,2.0,5.0,3.0,8.0,1.905677,3.631605,0.498636
hdlngth,92.730693,92.9,93.3,-0.054117,0.925124,90.7,94.8,4.1,20.6,3.501251,12.258761,0.037757
skullw,56.960396,56.4,57.6,1.01662,2.337963,55.0,58.1,3.1,18.6,3.087281,9.531303,0.0542
totlngth,87.269307,88.0,89.0,-0.245144,-0.212877,84.5,90.0,5.5,21.5,4.175974,17.438761,0.047852
taill,37.049505,37.0,38.0,0.121612,0.311398,36.0,38.0,2.0,11.0,1.961896,3.849034,0.052953
footlgth,68.39802,67.9,73.2,0.122903,-1.195542,64.5,72.5,8.0,17.6,4.391598,19.286135,0.064207
earconch,48.133663,46.8,44.9,0.215128,-1.379705,44.8,52.0,7.2,14.9,4.040201,16.323223,0.083937
eye,15.050495,14.9,14.5,0.376387,-0.023105,14.4,15.7,1.3,5.0,1.053389,1.109628,0.06999
chest,27.064356,27.0,28.0,-0.057535,-0.282777,25.5,28.0,2.5,10.0,2.010693,4.042888,0.074293
belly,32.638614,32.5,32.0,0.104267,0.167455,31.0,34.0,3.0,15.0,2.714208,7.366925,0.083159


### Observations
From the dataset above, we note that co-efficient of variation are below 10% which implies our data is stable. On the other hand, the age feature has a fairly high CV which indicates high flunctuations in our age data.

### Outlier Detection

In [20]:
def get_outlier(col, data):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[col] > upper_bound) | (data[col] < lower_bound)][col]

outliers = {col: [get_outlier(col, df_numerical).index, get_outlier(col, df_numerical).values] for col in df_numerical}
outliers



{'age': [Index([], dtype='int64', name='case'), array([], dtype=float64)],
 'hdlngth': [Index([55, 59, 73], dtype='int64', name='case'),
  array([103.1, 102.5,  82.5])],
 'skullw': [Index([17, 48, 54, 55, 58, 59, 79, 98], dtype='int64', name='case'),
  array([67.7, 63.2, 63. , 63.2, 64.2, 62.8, 50. , 68.6])],
 'totlngth': [Index([39], dtype='int64', name='case'), array([75.])],
 'taill': [Index([42, 43, 54, 87], dtype='int64', name='case'),
  array([32. , 32. , 43. , 41.5])],
 'footlgth': [Index([], dtype='int64', name='case'), array([], dtype=float64)],
 'earconch': [Index([], dtype='int64', name='case'), array([], dtype=float64)],
 'eye': [Index([76], dtype='int64', name='case'), array([17.8])],
 'chest': [Index([59], dtype='int64', name='case'), array([32.])],
 'belly': [Index([21, 39, 57], dtype='int64', name='case'),
  array([40., 25., 39.])]}

In [24]:
print('Columns containing outliers and the values')
print('\n')
for i, j in outliers.items():
    if len(j[1]) != 0:
        print(f'{i}: {j[1]}')


Columns containing outliers and the values


hdlngth: [103.1 102.5  82.5]
skullw: [67.7 63.2 63.  63.2 64.2 62.8 50.  68.6]
totlngth: [75.]
taill: [32.  32.  43.  41.5]
eye: [17.8]
chest: [32.]
belly: [40. 25. 39.]


### Observations
Our detect outlier function indicates the presence of outliers in age, hdlngth, skullw, totlngth, taill, eye, chest, belly features