Certainly! To ensure that the sample data of your trained model has not significantly changed from the new collected test data, you can use statistical tests to compare the distributions of the features in both datasets. A common approach is to use the Kolmogorov-Smirnov (KS) test or the Chi-square test for categorical data. Here, I'll provide an example using the KS test for continuous features.

Explanation
Kolmogorov-Smirnov Test: This non-parametric test compares the distributions of two samples to determine if they differ significantly. It is useful for continuous data and does not assume any specific distribution.

Chi-Square Test: This test compares the distribution of categorical data.

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, chi2_contingency

In [9]:
def compare_distributions(train_data: pd.DataFrame, test_data: pd.DataFrame):
    """
    Compares the distributions of the features in the training data with the test data.

    Args:
        train_data: DataFrame containing the training data.
        test_data: DataFrame containing the test data.

    Returns:
        A dictionary with the test results.
    """
    results = {}

    for column in train_data.columns:
        if train_data[column].dtype in [np.float64, np.float32, np.int64, np.int32]:  # Continuous data
            # Perform KS test
            stat, p_value = ks_2samp(train_data[column], test_data[column])
            results[column] = {'test': 'KS', 'statistic': stat, 'p_value': p_value}
        else:  # Categorical data
            # Perform Chi-square test
            contingency_table = pd.crosstab(train_data[column], test_data[column])
            stat, p_value, dof, expected = chi2_contingency(contingency_table)
            results[column] = {'test': 'Chi-square', 'statistic': stat, 'p_value': p_value}

    return results

In [11]:
# Example usage
# Assuming `train_df` is your training data and `test_df` is your new collected test data
train_df = pd.DataFrame({
    'age': np.random.randint(20, 60, 1000),
    'income': np.random.normal(50000, 15000, 1000),
    'gender': np.random.choice(['M', 'F'], 1000)
})

test_df = pd.DataFrame({
    'age': np.random.randint(20, 60, 200),
    'income': np.random.normal(51000, 15500, 200),
    'gender': np.random.choice(['M', 'F'], 200)
})

results = compare_distributions(train_df, test_df)

# Display the results
for feature, result in results.items():
    print(f"Feature: {feature}")
    print(f"Test: {result['test']}")
    print(f"Statistic: {result['statistic']}")
    print(f"P-value: {result['p_value']}")
    print("\n")

Feature: age
Test: KS
Statistic: 0.063
P-value: 0.509500976763525


Feature: income
Test: KS
Statistic: 0.117
P-value: 0.01969347905750255


Feature: gender
Test: Chi-square
Statistic: 0.08082677910530667
P-value: 0.7761800965260952




#### Interpretation of Results

##### P-value: 
        The p-value indicates the probability that the observed difference between the distributions happened by chance. A low p-value (typically < 0.05) suggests a significant difference between the distributions.
Statistic: The test statistic gives a measure of the difference between the distributions.
This approach ensures that you can statistically compare the distributions of your features between the training and new test data, helping you detect any significant changes.

#### Load the Iris Dataset

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy.stats import ks_2samp, chi2_contingency
from faker import Faker
from datetime import datetime, timedelta

# Load the Titanic dataset
df = sns.load_dataset('titanic').dropna()

# Split the data into training and test sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
print(train_df.head())
print(test_df.head())

     survived  pclass     sex   age  sibsp  parch      fare embarked  class  \
779         1       1  female  43.0      0      1  211.3375        S  First   
741         0       1    male  36.0      1      0   78.8500        S  First   
540         1       1  female  36.0      0      2   71.0000        S  First   
716         1       1  female  38.0      0      0  227.5250        C  First   
151         1       1  female  22.0      1      0   66.6000        S  First   

       who  adult_male deck  embark_town alive  alone  
779  woman       False    B  Southampton   yes  False  
741    man        True    C  Southampton    no  False  
540  woman       False    B  Southampton   yes  False  
716  woman       False    C    Cherbourg   yes   True  
151  woman       False    C  Southampton   yes  False  
     survived  pclass     sex   age  sibsp  parch      fare embarked  class  \
118         0       1    male  24.0      0      1  247.5208        C  First   
251         0       3  female  

#### Define Functions for Data Drift Detection

##### Kolmogorov-Smirnov Test for Numerical Features

In [33]:
def ks_test(train_data, test_data, feature_name):
    ks_stat, p_value = ks_2samp(train_data[feature_name], test_data[feature_name])
    return ks_stat, p_value

##### Chi-Square Test for Categorical Features

In [36]:
def chi_square_test(train_data, test_data, feature_name):
    contingency_table = pd.crosstab(train_data[feature_name], test_data[feature_name])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    return chi2, p_value


##### Apply Population Stability Index (PSI)

In [20]:
def calculate_psi(expected, actual, buckets=10):
    def scale_range(input, min, max):
        input += -(np.min(input))
        input /= np.max(input) / (max - min)
        input += min
        return input
    
    breakpoints = np.arange(0, buckets + 1) / (buckets) * 100
    
    expected_percents = np.histogram(scale_range(expected, 0, 100), breakpoints)[0] / len(expected)
    actual_percents = np.histogram(scale_range(actual, 0, 100), breakpoints)[0] / len(actual)
    
    def sub_psi(e_perc, a_perc):
        if a_perc == 0:
            a_perc = 0.0001
        if e_perc == 0:
            e_perc = 0.0001
        return (e_perc - a_perc) * np.log(e_perc / a_perc)
    
    psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))
    
    return psi_value

# Apply PSI to each feature
for feature in df.columns:
    psi_value = calculate_psi(train_df[feature], test_df[feature])
    print(f'Feature: {feature}')
    print(f'PSI Value: {psi_value}')
    if psi_value >= 0.2:
        print("Significant data drift detected (PSI >= 0.2).")
    elif psi_value >= 0.1:
        print("Moderate data drift detected (0.1 <= PSI < 0.2).")
    else:
        print("No significant data drift detected (PSI < 0.1).")
    print('-' * 30)


Feature: sepal length (cm)
PSI Value: 0.5743322323195329
Significant data drift detected (PSI >= 0.2).
------------------------------
Feature: sepal width (cm)
PSI Value: 0.1458452581879954
Moderate data drift detected (0.1 <= PSI < 0.2).
------------------------------
Feature: petal length (cm)
PSI Value: 0.3418244382092846
Significant data drift detected (PSI >= 0.2).
------------------------------
Feature: petal width (cm)
PSI Value: 0.9391684203412669
Significant data drift detected (PSI >= 0.2).
------------------------------


  psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))


#### Apply Data Drift Detection Methods

In [39]:
# Apply data drift detection to each feature
for feature in df.columns:
    if pd.api.types.is_numeric_dtype(df[feature]):
        # Apply KS test for numerical features
        ks_stat, p_value = ks_test(train_df, test_df, feature)
        psi_value = calculate_psi(train_df[feature], test_df[feature])
        print(f'Numerical Feature: {feature}')
        print(f'KS Statistic: {ks_stat}, P-value: {p_value}')
        print(f'PSI Value: {psi_value}')
        if p_value < 0.05:
            print("Data drift detected (KS test)")
        if psi_value >= 0.2:
            print("Significant data drift detected (PSI >= 0.2)")
        elif psi_value >= 0.1:
            print("Moderate data drift detected (0.1 <= PSI < 0.2)")
        else:
            print("No significant data drift detected (PSI < 0.1)")
    else:
        # Apply Chi-Square test for categorical features
        chi2_stat, chi2_p_value = chi_square_test(train_df, test_df, feature)
        psi_value = calculate_psi(train_df[feature].value_counts(normalize=True).values, 
                                  test_df[feature].value_counts(normalize=True).values)
        print(f'Categorical Feature: {feature}')
        print(f'Chi-Square Statistic: {chi2_stat}, P-value: {chi2_p_value}')
        print(f'PSI Value: {psi_value}')
        if chi2_p_value < 0.05:
            print("Data drift detected (Chi-Square test)")
        if psi_value >= 0.2:
            print("Significant data drift detected (PSI >= 0.2)")
        elif psi_value >= 0.1:
            print("Moderate data drift detected (0.1 <= PSI < 0.2)")
        else:
            print("No significant data drift detected (PSI < 0.1)")
    print('-' * 30)

  psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))
  psi_value = np.sum(sub_psi(expected_percents[i], actual_percents[i]) for i in range(0, len(expected_percents)))


Numerical Feature: survived
KS Statistic: 0.10866141732283464, P-value: 0.7044749489139502
PSI Value: 0.05252982491476137
No significant data drift detected (PSI < 0.1)
------------------------------
Numerical Feature: pclass
KS Statistic: 0.026628489620615606, P-value: 0.9999999999999742
PSI Value: 0.01662110333819009
No significant data drift detected (PSI < 0.1)
------------------------------


ValueError: No data; `observed` has size 0.

In [None]:

# Apply KS test to each feature
for feature in df.columns:
    ks_stat, p_value = ks_test(train_df, test_df, feature)
    print(f'Feature: {feature}')
    print(f'KS Statistic: {ks_stat}')
    print(f'P-value: {p_value}')
    print("Data drift detected" if p_value < 0.05 else "No data drift detected")
    print('-' * 30)