In [167]:
import pickle

with open('dataframes.pkl', 'rb') as file:
    data = pickle.load(file)

yk_full = data['yk_full']
yk_dropped = data['yk_dropped']

In [168]:
yk_full.shape

(1025, 21)

In [169]:
yk_dropped.shape

(902, 21)

In [170]:
import pandas as pd 

yk_dropped['Distribution Type'].value_counts()

longtail      264
normal        255
outlier       239
bimodal        78
functional     37
discrete       29
Name: Distribution Type, dtype: int64

In [171]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

columns_to_drop = ['Target', 'Name', 'Distribution Type']
X = yk_dropped.drop(columns = columns_to_drop)
y = yk_dropped['Distribution Type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [172]:
# converting target label to classify outliers only 
y_train_outlier = y_train.apply(lambda x: 1 if x == 'outlier' else 0)
y_test_outlier = y_test.apply(lambda x: 1 if x == 'outlier' else 0)

In [173]:
model_outlier = RandomForestClassifier()

model_outlier.fit(X_train, y_train_outlier)

y_pred_outlier = model_outlier.predict(X_test)
y_proba_outlier = model_outlier.predict_proba(X_test)

In [174]:
# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test_outlier, y_pred_outlier))
print("Classification Report:\n", classification_report(y_test_outlier, y_pred_outlier))

Accuracy: 0.9558011049723757
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97       136
           1       0.95      0.87      0.91        45

    accuracy                           0.96       181
   macro avg       0.95      0.93      0.94       181
weighted avg       0.96      0.96      0.96       181



In [175]:
# converting target label to classify longtail only 
y_train_longtail = y_train.apply(lambda x: 1 if x == 'longtail' else 0)
y_test_longtail = y_test.apply(lambda x: 1 if x == 'longtail' else 0)

In [176]:
model_longtail = RandomForestClassifier()

model_longtail.fit(X_train, y_train_longtail)

y_pred_longtail = model_longtail.predict(X_test)
y_proba_longtail = model_longtail.predict_proba(X_test)

In [177]:
# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test_longtail, y_pred_longtail))
print("Classification Report:\n", classification_report(y_test_longtail, y_pred_longtail))

Accuracy: 0.8784530386740331
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.95      0.91       121
           1       0.88      0.73      0.80        60

    accuracy                           0.88       181
   macro avg       0.88      0.84      0.86       181
weighted avg       0.88      0.88      0.88       181



In [178]:
final_predictions = []

outlier_preds = []
longtail_preds = []
either_preds = []

outlier = model_outlier.predict(X_test)
longtail = model_longtail.predict(X_test)

for i in range(len(outlier)):
    outlier_pred = outlier[i]
    outlier_preds.append(outlier_pred)

    longtail_pred = longtail[i]
    longtail_preds.append(longtail_pred)

    if outlier_pred + longtail_pred >= 1:
        either_preds.append(1)
    else:
        either_preds.append(0)
    
predictions = pd.DataFrame({
    'Outlier': outlier_preds,
    'Longtail': longtail_preds,
    'Either': either_preds
})

In [179]:
predictions

Unnamed: 0,Outlier,Longtail,Either
0,0,1,1
1,1,0,1
2,0,0,0
3,1,0,1
4,0,0,0
...,...,...,...
176,1,0,1
177,0,0,0
178,1,0,1
179,0,0,0


In [181]:
# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score

y_test_either = y_test.apply(lambda x: 1 if x in ['outlier', 'longtail'] else 0)
y_pred_either = predictions['Either']

print("Accuracy:", accuracy_score(y_test_either, y_pred_either))
print("Classification Report:\n", classification_report(y_test_either, y_pred_either))

Accuracy: 0.8784530386740331
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.95      0.87        76
           1       0.96      0.83      0.89       105

    accuracy                           0.88       181
   macro avg       0.88      0.89      0.88       181
weighted avg       0.89      0.88      0.88       181



In [182]:
from xgboost import XGBClassifier

# Example of setting class weights
model = XGBClassifier(scale_pos_weight=10)  # Increase weight for the minority class
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5], got ['bimodal' 'discrete' 'functional' 'longtail' 'normal' 'outlier']

In [None]:
# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.861878453038674
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.90       121
           1       0.79      0.80      0.79        60

    accuracy                           0.86       181
   macro avg       0.84      0.85      0.84       181
weighted avg       0.86      0.86      0.86       181

