In [101]:
import pickle 

def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# these three files must come from a trusted source.
yk_values = _load_pickle('yk_values_dataframe.pkl')
ty_values = _load_pickle('ty_values_dataframe.pkl')
luna_values = _load_pickle('luna_values_dataframe.pkl')

In [102]:
# Report the shape of each loaded object, one line per source.
for label, frame in (("YK: ", yk_values), ("TY: ", ty_values), ("Luna: ", luna_values)):
    print(label, frame.shape)

YK:  (69579, 1025)
TY:  (79785, 1402)
Luna:  (10746, 3370)


In [103]:
import pandas as pd
import numpy as np

# Put the three sources side by side (column-wise), then transpose so that
# every source column becomes one row of `df`.
source_frames = [yk_values, ty_values, luna_values]
combined = pd.concat(source_frames, axis=1)
df = combined.T
df.shape

(5797, 79785)

In [104]:
# Columns from position 2 onward: each row represents a test with all its values.
# reset_index(drop=True) discards the old row labels directly; the previous
# reset_index().drop(columns='index') only works when the index is unnamed and
# no existing column is already labelled 'index'.
df_values = df.iloc[:, 2:].reset_index(drop=True)

# First two columns: each row is a test's name and distribution.
df_target = df.iloc[:, :2].reset_index(drop=True)

In [129]:
import numpy as np
import scipy.stats as stats
import pandas as pd

def extract_qq_features(data, theoretical_distribution='norm',
                        percentiles=(1, 5, 25, 50, 75, 95, 99)):
    """Summarise, for every row of ``data``, how its Q-Q plot departs from a reference distribution.

    Each row is treated as one sample. Entries that are missing or cannot be
    converted to a number are discarded before anything is computed.

    Parameters
    ----------
    data : pandas.DataFrame
        One sample per row.
    theoretical_distribution : str, default 'norm'
        Distribution name handed to both ``scipy.stats.probplot`` and
        ``scipy.stats.anderson``. ``anderson`` supports fewer names than
        ``probplot``; see the scipy documentation for the accepted values.
    percentiles : sequence of numbers, default (1, 5, 25, 50, 75, 95, 99)
        Percentile levels ``p`` that produce the ``Q{p}_Deviation`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, on a fresh 0..n-1 index (the
        index of ``data`` is not carried over). Columns, in order:

        * ``Mean_Deviation``, ``Max_Deviation``, ``Std_Deviation`` - mean,
          largest (signed) and ``np.std`` of ``empirical - theoretical``
          quantiles.
        * ``Mean_Abs_Deviation``, ``Median_Abs_Deviation`` - mean and median of
          the absolute deviations.
        * ``Q{p}_Deviation`` - absolute gap between the ``p``-th percentile of
          the sample and the ``p``-th percentile of the theoretical quantiles.
        * ``AD_Stat`` - Anderson-Darling statistic.
        * ``Line_Fit_R_Squared`` - squared correlation between theoretical and
          empirical quantiles.

        Rows with fewer than two numeric values are filled with NaN. The
        columns are present even when ``data`` has no rows.
    """
    percentiles = list(percentiles)
    quantile_columns = ['Q{}_Deviation'.format(p) for p in percentiles]
    columns = (
        ['Mean_Deviation', 'Max_Deviation', 'Std_Deviation',
         'Mean_Abs_Deviation', 'Median_Abs_Deviation']
        + quantile_columns
        + ['AD_Stat', 'Line_Fit_R_Squared']
    )

    features = []
    for _, row in data.iterrows():
        # Coerce non-numeric entries to NaN, then drop every NaN.
        sample = pd.to_numeric(row, errors='coerce').dropna().values
        if len(sample) < 2:
            # Too few points for a Q-Q comparison: emit an all-NaN record.
            features.append(dict.fromkeys(columns, np.nan))
            continue

        # probplot returns ((theoretical, ordered sample), fit); only the
        # quantile pair is needed here.
        (quantiles_theoretical, quantiles_empirical), _fit = stats.probplot(
            sample, dist=theoretical_distribution, plot=None)
        deviations = np.asarray(quantiles_empirical) - np.asarray(quantiles_theoretical)
        abs_deviations = np.abs(deviations)

        # All requested percentile gaps in one vectorised call per array.
        percentile_gaps = np.abs(
            np.percentile(sample, percentiles)
            - np.percentile(quantiles_theoretical, percentiles)
        )

        record = {
            'Mean_Deviation': np.mean(deviations),
            'Max_Deviation': np.max(deviations),
            'Std_Deviation': np.std(deviations),
            'Mean_Abs_Deviation': np.mean(abs_deviations),
            'Median_Abs_Deviation': np.median(abs_deviations),
        }
        record.update(zip(quantile_columns, percentile_gaps))
        record['AD_Stat'] = stats.anderson(sample, dist=theoretical_distribution).statistic
        record['Line_Fit_R_Squared'] = (
            np.corrcoef(quantiles_theoretical, quantiles_empirical)[0, 1] ** 2
        )
        features.append(record)

    # Passing `columns` fixes the column order and keeps the schema for empty input.
    return pd.DataFrame(features, columns=columns)


In [130]:
# Build the per-test Q-Q feature table against the normal distribution.
features_df = extract_qq_features(df_values, theoretical_distribution='norm')

In [109]:
# NOTE(review): the table captured below comes from an earlier execution
# (In [109]) than the current extract_qq_features definition (In [129]). It
# shows 10 feature columns, while the function now returns 14 (it also emits
# Mean_Abs_Deviation, Median_Abs_Deviation, AD_Stat and Line_Fit_R_Squared).
# Re-run this cell to refresh the output.
features_df

Unnamed: 0,Mean_Deviation,Max_Deviation,Std_Deviation,Q1_Deviation,Q5_Deviation,Q25_Deviation,Q50_Deviation,Q75_Deviation,Q95_Deviation,Q99_Deviation
0,2.500000e+01,29.265735,0.999939,27.325987,26.644768,25.674474,25.0,24.325526,23.355232,22.674013
1,1.611945e+04,26674.230146,1142.707783,9944.325987,15406.644768,15980.674474,16408.0,16486.325526,16820.355232,16902.674013
2,7.088813e+03,19460.734265,4963.807017,49.325987,259.644768,2865.674474,6448.0,10795.325526,15984.555232,18766.914013
3,-1.000000e+00,2.174731,0.997022,1.300306,0.638456,0.326671,1.0,1.673329,2.638456,3.300306
4,-7.500000e-01,-0.001851,0.478511,0.023060,0.107895,0.532069,1.0,1.217931,1.042105,1.006940
...,...,...,...,...,...,...,...,...,...,...
5792,1.010798e+00,11.171712,0.998744,3.324022,2.644300,1.674390,1.0,0.325610,0.644300,1.324022
5793,-2.539779e-16,3.828288,0.999665,2.324022,1.644300,0.674390,0.0,0.674390,1.644300,2.324022
5794,-2.539779e-16,3.828288,0.999665,2.324022,1.644300,0.674390,0.0,0.674390,1.644300,2.324022
5795,1.000372e+00,4.828288,0.998571,3.324022,2.644300,1.674390,1.0,0.325610,0.644300,1.324022


In [111]:
# Flag every test (row) that has at least one null feature.
mask = features_df.isnull().any(axis=1)

# Remove the flagged tests from the targets and from the features with the
# same mask so the two tables stay row-aligned.
df_target = df_target[~mask]
# Rebinding (instead of dropna(inplace=True)) leaves any earlier reference to
# the unfiltered feature table untouched.
features_df = features_df[~mask]

assert len(df_target) == len(features_df), "features and targets are out of sync"

In [123]:
from sklearn.model_selection import train_test_split

# NOTE(review): columns_to_drop is not referenced by any cell visible here.
columns_to_drop = ['Target', 'Name', 'Distribution Type', 'Target_Encoded']

# Features are the Q-Q table; the target is 1 for 'outlier' / 'longtail'
# distribution types and 0 for everything else.
positive_types = ('outlier', 'longtail')
X = features_df
y = df_target['Distribution Type'].apply(lambda kind: 1 if kind in positive_types else 0)

# Hold out 20% of the tests for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [117]:
# Class balance of the binary target (1 = 'outlier' or 'longtail', 0 = any other distribution type).
y.value_counts()

0    4148
1    1646
Name: Distribution Type, dtype: int64

In [124]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the original (non-resampled) training split.
# Seeded with the same value as train_test_split so the reported metrics can
# be reproduced on a fresh run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred = model.predict(X_test)

In [125]:
from sklearn.metrics import classification_report, accuracy_score

# Score the predictions in y_pred against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9473684210526315
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96       827
           1       0.92      0.89      0.91       332

    accuracy                           0.95      1159
   macro avg       0.94      0.93      0.93      1159
weighted avg       0.95      0.95      0.95      1159



In [126]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# Resample the training split only with SMOTE; the test split is left as is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Random forest on the resampled training data. Seeded like SMOTE and the
# train/test split so the comparison below can be reproduced.
model_smote = RandomForestClassifier(random_state=42)
model_smote.fit(X_train_smote, y_train_smote)

# Evaluate on the same untouched held-out split.
y_pred_smote = model_smote.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_smote))
print("Classification Report:\n", classification_report(y_test, y_pred_smote))

Accuracy: 0.9499568593615185
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       827
           1       0.91      0.91      0.91       332

    accuracy                           0.95      1159
   macro avg       0.94      0.94      0.94      1159
weighted avg       0.95      0.95      0.95      1159



In [None]:
import pandas as pd
# pyplot is the plotting interface used below. The previous
# `import matplotlib as plt` bound the top-level package instead and only
# worked because pyplot was re-imported later in the cell.
import matplotlib.pyplot as plt

# Importance of each feature according to the forest trained on the SMOTE-resampled data.
importances = model_smote.feature_importances_

# Sort ascending so the most important feature ends up at the top of the horizontal bar chart.
importance_df = pd.DataFrame({
    'Feature': X_train_smote.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=True)

plt.figure(figsize=(8, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('Importance')
plt.title('Feature Importance from Random Forest')
plt.show()