# Kolmogorov - Smirnov Test

In [23]:
# Step 0 - Load the Data

# Import libraries
import pandas as pd
import numpy as np
import statistics

# Load the data
file_name = "Example_K_S.xlsx"
path = "C:/Users/usuario/Downloads/"
df = pd.read_excel(path + file_name)
df.head(30)

Unnamed: 0,Demand of a Product
0,67
1,63
2,33
3,69
4,53
5,51
6,49
7,78
8,48
9,42


In [24]:
# Step 1 - Order the data

df_1 = df.sort_values(by = "Demand of a Product") #ascending = True)
df_1.head()

Unnamed: 0,Demand of a Product
19,25
18,28
21,32
2,33
25,35


In [25]:
# Step 2 - Add a column 𝑓% (frequency) indicating how many times each number appears.

df_1["Frecuency"] = df_1.groupby('Demand of a Product', sort=False).cumcount() + 1
#df_2 = df_1.drop_duplicates()
df_2 = df_1
df_2.head(30)

Unnamed: 0,Demand of a Product,Frecuency
19,25,1
18,28,1
21,32,1
2,33,1
25,35,1
20,36,1
24,38,1
9,42,1
15,44,1
16,44,2


In [26]:
# Step 3 - Add Observed relative cumulative frequency column

df_2["Count"] = np.arange(1, len(df_2) + 1)

df_2["Observed Relative Cumulative Frequency"] = df_2["Count"] / len(df)
df_3 = df_2
df_3.head(30)

Unnamed: 0,Demand of a Product,Frecuency,Count,Observed Relative Cumulative Frequency
19,25,1,1,0.033333
18,28,1,2,0.066667
21,32,1,3,0.1
2,33,1,4,0.133333
25,35,1,5,0.166667
20,36,1,6,0.2
24,38,1,7,0.233333
9,42,1,8,0.266667
15,44,1,9,0.3
16,44,2,10,0.333333


In [32]:
# Step 4 - Add expected relative cumulative frequency column

normal = statistics.NormalDist(50, 10)

df_3["Expected Relative Cumulative Frequency"] = df_3["Demand of a Product"].apply(lambda x: normal.cdf(x))
df_4 = df_3
df_4.head()

Unnamed: 0,Demand of a Product,Frecuency,Count,Observed Relative Cumulative Frequency,Expected Relative Cumulative Frequency
19,25,1,1,0.033333,0.00621
18,28,1,2,0.066667,0.013903
21,32,1,3,0.1,0.03593
2,33,1,4,0.133333,0.044565
25,35,1,5,0.166667,0.066807


In [33]:
# Step 5 - Add difference column

df_4["Difference"] = abs(df_4["Observed Relative Cumulative Frequency"]-df_4["Expected Relative Cumulative Frequency"])
df_5 = df_4
df_5.head()

Unnamed: 0,Demand of a Product,Frecuency,Count,Observed Relative Cumulative Frequency,Expected Relative Cumulative Frequency,Difference
19,25,1,1,0.033333,0.00621,0.027124
18,28,1,2,0.066667,0.013903,0.052763
21,32,1,3,0.1,0.03593,0.06407
2,33,1,4,0.133333,0.044565,0.088768
25,35,1,5,0.166667,0.066807,0.099859


In [34]:
# Step 6 - Get the max of the difference

Dn = max(df_5["Difference"])
print(Dn)

0.12574688224992636


In [35]:
# Step 7 - Compare Critical Value of K-S vs Dn Value

cv = 0.24

if (Dn <= cv):
    print("Your data fits with a normal distribution N(50,10)")
else:
    print("Your data DO NOT fits with a normal distribution N(50,10)")

Your data fits with a normal distribution N(50,10)
