In [2]:
import pandas as pd

from sklearn import datasets
iris_data = datasets.load_iris(as_frame=True)
iris_frame = iris_data.frame

# KS Test 

KS Test is performed where current data and reference data are almost same.

In [4]:
from scipy.stats import ks_2samp

def ks_test(data1, data2, alpha=0.05):
    """
    Perform the Kolmogorov-Smirnov (KS) test for two samples.

    Parameters:
        data1 (array-like): First sample data.
        data2 (array-like): Second sample data.
        alpha (float): Significance level (default: 0.05).

    Returns:
        bool: True if null hypothesis is rejected, indicating data drift.
    """
    # Perform KS test
    ks_statistic, p_value = ks_2samp(data1, data2)

    # Compare p-value to significance level
    if p_value < alpha:
        print("Data drift detected! (p-value: {:.4f})".format(p_value))
        return True
    else:
        print("No data drift detected. (p-value: {:.4f})".format(p_value))
        return False

# Perform KS test for data drift detection
current_data=iris_frame.iloc[:75]
reference_data=iris_frame.iloc[:60]
sample1 = current_data['petal width (cm)'].values
sample2 = reference_data['petal width (cm)'].values

ks_test(sample1, sample2)


No data drift detected. (p-value: 0.2819)


False

KS Test is performed where current data and reference data are completely different.

In [5]:
from scipy.stats import ks_2samp

def ks_test(data1, data2, alpha=0.05):
    """
    Perform the Kolmogorov-Smirnov (KS) test for two samples.

    Parameters:
        data1 (array-like): First sample data.
        data2 (array-like): Second sample data.
        alpha (float): Significance level (default: 0.05).

    Returns:
        bool: True if null hypothesis is rejected, indicating data drift.
    """
    # Perform KS test
    ks_statistic, p_value = ks_2samp(data1, data2)

    # Compare p-value to significance level
    if p_value < alpha:
        print("Data drift detected! (p-value: {:.4f})".format(p_value))
        return True
    else:
        print("No data drift detected. (p-value: {:.4f})".format(p_value))
        return False

# Perform KS test for data drift detection
current_data=iris_frame.iloc[:60]
reference_data=iris_frame.iloc[60:]
sample1 = current_data['petal width (cm)'].values
sample2 = reference_data['petal width (cm)'].values

ks_test(sample1, sample2)


Data drift detected! (p-value: 0.0000)


True

# Chi-Square Test

In [7]:
import pandas as pd
from sklearn.datasets import load_wine
from scipy.stats import chi2_contingency

# Load Wine dataset
wine_data = load_wine(as_frame=True)
wine_frame = wine_data.frame

# Simulate current data by shuffling the original data and modifying the target
current_data = wine_frame.sample(frac=1.0, random_state=42).reset_index(drop=True)
current_data['target'] = current_data['target'].apply(lambda x: (x + 1) % 3)  # Modify the target to simulate drift

# Create contingency table
contingency_table = pd.crosstab(current_data['target'], wine_frame['target'])

# Perform Chi-Square Test
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi2 Stat: {chi2_stat}, P-Value: {p_value}, Degrees of Freedom: {dof}")

# Interpret the result
if p_value < 0.05:
    print("Significant drift detected in the target variable.")
else:
    print("No significant drift detected in the target variable.")


Chi2 Stat: 1.4391214977590405, P-Value: 0.8373678077554415, Degrees of Freedom: 4
No significant drift detected in the target variable.


# Z-Test

In [12]:
from statsmodels.stats.weightstats import ztest

# Example data
sample1 = [23, 21, 18, 25, 30]
sample2 = [27, 28, 29, 26, 22]

# Two-sample Z-test
z_stat, p_value = ztest(sample1, sample2)
print(f"Z-Statistic: {z_stat}, P-Value: {p_value}")
alpha = 0.05  # Define significance level
if p_value < alpha:
    print("Data drift detected! (Reject null hypothesis)")
else:
    print("No data drift detected. (Fail to reject null hypothesis)")


Z-Statistic: -1.276884796138123, P-Value: 0.2016429225591495
No data drift detected. (Fail to reject null hypothesis)


# T-Test

In [13]:
from scipy import stats

# Example data
sample1 = [23, 21, 18, 25, 30]
sample2 = [27, 28, 29, 26, 22]

# Independent two-sample t-test
t_stat, p_value = stats.ttest_ind(sample1, sample2)
print(f"T-Statistic: {t_stat}, P-Value: {p_value}")
alpha = 0.05  # Define significance level
if p_value < alpha:
    print("Data drift detected! (Reject null hypothesis)")
else:
    print("No data drift detected. (Fail to reject null hypothesis)")


T-Statistic: -1.276884796138123, P-Value: 0.2374560613820748
No data drift detected. (Fail to reject null hypothesis)
