In [104]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, iqr, skew, zscore
import matplotlib.pyplot as plt
import plotly.express as px     # histogram, box-plot
import seaborn as sns
import math

In [102]:
df = pd.read_excel("C:/Users/go27s/OneDrive/Documents/Udemy/Statistics and Probability/Statistics/USArrests.xlsx")

In [103]:
df.head()

Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


In [None]:
# Std - measure of the amount of variation or dispersion of a set of values. Square root of variance
# Let’s say we have the data of population per square kilometer for different states in the USA. 
# We can calculate the standard deviation to find out how the population is evenly distributed. 
# A smaller value means that the distribution is even while a larger value means there are very few people living in some places
# whereas some areas are densely populated.

def x_mean(x):
    x_mean = np.sum(x) / len(str(x))                     # sum of all values / no of values
    return x_mean


def x_range(x):
    x_range = np.max(x) - np.min(x)
    return x_range


def x_variance(x):
    x_mean = np.sum(x) / len(str(x))                        # sum of all observations / no of observations
    x_deviation = [(x - x_mean) ** 2 for i in x]     # for each value minus the mean, then square each result
    x_variance = np.sum(x_deviation) / len(str(x))          # sum all the results and divide by number of values
    return x_variance


def x_stdev(x):
    x_var = x_variance(x)
    x_std = np.sqrt(x_var)                       # square root of variance
    return x_std

In [11]:
# IQR
# If the data doesn’t follow a normal distribution, calculate the outlier data points using the statistical method 
# called interquartile range (IQR) instead of using Z-score.
# Using the IQR, the outlier data points are the ones falling below Q1–1.5 IQR or above Q3 + 1.5 IQR. 
# The Q1 is the 25th percentile and Q3 is the 75th percentile of the dataset, 
# and IQR represents the interquartile range calculated by Q3 minus Q1 (Q3–Q1).

def find_iqr(x):
    return np.subtract(*np.percentile(x, [75, 25]))

# calculate IQR for Assault & Fraud columns

df[['Assault', 'Fraud']].apply(find_iqr)

# calculate IQR for all columns
# df.apply(find_iqr)

# ISSUE
# I ran into the same error message using the standard max() function. 
# Replacing it with with numpy.maximum() for element-wise maxima between two values solved my problem


Assault    140.0
Fraud       11.1
dtype: float64

In [27]:
# z-score tells us how many standard deviations away a given value lies from population mean.
# z = (individual value - mean of population) / std
# positive z-score - the individual value is greater than the mean
# negative z-score - the individual value is less than the mean
# z-score is 0 - the individual value is equal to the mean
# the larger the absolute value of z-score, the further away the individual value lies from the mean
# eg z-score = 1.75 

df['Assault_mean'] = df['Assault'].mean()
df['Assault_std'] = df['Assault'].std()

df['Assault_zscore'] = (df['Assault'] - df['Assault_mean']) / df['Assault_std']
df_outliers_assault = df[abs(df['Assault_zscore']) > 1.5]
df_outliers_assault

Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud,Assault_mean,Assault_std,Assault_zscore
8,Florida,15.4,335,80,31.9,170.76,83.337661,1.970778
19,Maryland,11.3,300,67,27.8,170.76,83.337661,1.550799
32,North Carolina,13.0,337,45,16.1,170.76,83.337661,1.994776
33,North Dakota,0.8,45,44,7.3,170.76,83.337661,-1.509042


In [112]:
# check for outliers in UrbanPop using IQR

def find_outliers_IQR(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    outliers = df[((df < (Q1 - 1.5) * IQR) | (df > (Q3 + 1.5) * IQR))]  # use | instead of or 
    return outliers

    # Option to drop outliers
    
    # outliers_dropped = outliers.dropna().reset_index()
    # return outliers_dropped


df['UrbanPop_iqr'] = find_outliers_IQR(df['UrbanPop'])
#np.where(df['UrbanPop_iqr'] > 2) 


In [54]:
# check for outliers in UrbanPop using z-score

def find_zscore(x):
    x_mean = np.mean(x)
    x_std  = np.std(x)
    x_zscore = (x - x_mean) / x_std
    return x_zscore

df['UrbanPop_zscore'] = find_zscore(df['UrbanPop'])
df_outliers_UrbanPop = df[abs(df['UrbanPop_zscore']) > 2]
df_outliers_UrbanPop

Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud,Assault_mean,Assault_std,Assault_zscore,UrbanPop_zscore,UrbanPop_iqr
20,Massachusetts,4.4,149,30000,16.3,170.76,83.337661,-0.261106,6.999961,30000.0


In [100]:
# Calculate Range, Std and Variance for UrbanPop and Assault

df['UrbanPop_range'] = np.max(df['UrbanPop']) - np.min(df['UrbanPop'])
df.head()

Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud,Assault_mean,Assault_std,Assault_zscore,UrbanPop_zscore,UrbanPop_iqr,UrbanPop_range,UrbanPop_std,UrbanPop_var,Assault_range,Assault_var
0,Alabama,13.2,236,58,21.2,170.76,83.337661,0.782839,-0.144561,,29968,8450.15765,71405160.0,292,145610.550579
1,Alaska,10.0,263,48,44.5,170.76,83.337661,1.106823,-0.146947,,29968,8450.15765,71405160.0,292,145610.550579
2,Arizona,8.1,294,80,31.0,170.76,83.337661,1.478803,-0.139311,,29968,8450.15765,71405160.0,292,145610.550579
3,Arkansas,8.8,190,50,19.5,170.76,83.337661,0.230868,-0.14647,,29968,8450.15765,71405160.0,292,145610.550579
4,California,9.0,276,91,40.6,170.76,83.337661,1.262814,-0.136687,,29968,8450.15765,71405160.0,292,145610.550579


In [93]:
df['UrbanPop_std'] = x_stdev(df['UrbanPop'])
df['UrbanPop_var'] = x_variance(df['UrbanPop'])

In [98]:
df['Assault_range'] = x_range(df['Assault'])
df['Assault_var'] = x_variance(df['Assault'])

In [99]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Murder,Assault,UrbanPop,Fraud,Assault_mean,Assault_std,Assault_zscore,UrbanPop_zscore,UrbanPop_iqr,UrbanPop_range,UrbanPop_std,UrbanPop_var,Assault_range,Assault_var
0,Alabama,13.2,236,58,21.2,170.76,83.337661,0.782839,-0.144561,,29968,8450.15765,71405160.0,292,145610.550579
1,Alaska,10.0,263,48,44.5,170.76,83.337661,1.106823,-0.146947,,29968,8450.15765,71405160.0,292,145610.550579
2,Arizona,8.1,294,80,31.0,170.76,83.337661,1.478803,-0.139311,,29968,8450.15765,71405160.0,292,145610.550579
3,Arkansas,8.8,190,50,19.5,170.76,83.337661,0.230868,-0.14647,,29968,8450.15765,71405160.0,292,145610.550579
4,California,9.0,276,91,40.6,170.76,83.337661,1.262814,-0.136687,,29968,8450.15765,71405160.0,292,145610.550579


In [None]:
# Skewness - a way of estimating and measuring the shape of a distribution
# A positive skew will indicate that the tail is on the right side. It will extend toward the most positive values.
# On the other hand, a negative skew will indicate a tail on the left side and will extend to the more negative side.
# A zero value will indicate that there is no skewness in the distribution, which means that the distribution is perfectly symmetrical.

# Skewness = 0 when the distribution is normal.
# Skewness > 0 or positive when more weight is on the left side of the distribution.
# Skewness < 0 or negative when more weight is on the right side of the distribution.

# X = [54, 73, 59, 98, 68, 45, 88, 92, 75, 96]
# mean = 74.8
# skw = [(54 - 74.8)**3 - (73 - 74.8)**3, ...] / 10

In [111]:
# Kurtosis 
kurtosis(df[['UrbanPop', 'Assault']], axis = 0, fisher = False)  # Pearson's kurtosis

array([48.01932904,  1.93097995])

In [107]:
kurtosis(df[['UrbanPop', 'Assault']], axis = 0, fisher = True) # Fisher's kurtosis

array([45.01932904, -1.06902005])

In [110]:
skew(df[['UrbanPop', 'Assault']], axis = 0)   # positive skewness, which means left side

array([6.85702231, 0.22731787])