In [None]:
import sys
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Directory holding the preprocess_*_data modules imported further down.
# The path is relative, so it resolves against the kernel's working directory.
preprocessor_dir = 'modules/preprocessors'
# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries.
if preprocessor_dir not in sys.path:
    sys.path.append(preprocessor_dir)

In [None]:
# Single random seed shared by every preprocessing call and every
# RandomForestClassifier in this notebook.
seed = 42

### German Credit Risk

In [None]:
from preprocess_german_credit_data import preprocess_german_credit_data

In [None]:
# Path to the German credit CSV, passed to preprocess_german_credit_data.
# Despite the `_dir` suffix this is a file path, not a directory.
german_credit_dir = 'data/german_credit_data.csv'

In [None]:
# Run the German credit preprocessor; it returns the train and test frames
# plus label_transformers and metric_transformers.
# NOTE(review): the positional 0.3 and False are interpreted by
# preprocess_german_credit_data, which is not visible here -- presumably the
# test fraction and an on/off preprocessing switch; confirm against that module.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_german_credit_data(german_credit_dir, 0.3, False, seed)
df_train.head()

In [None]:
# Train a random forest with balanced class weights. Every column except
# 'risk' is a feature; both inputs are converted to plain ndarrays first.
train_features = df_train.drop('risk', axis=1)
train_labels = df_train['risk']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['risk'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['risk']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'risk' class among the training rows.
df_train['risk'].value_counts().div(len(df_train))

In [None]:
# Re-run the German credit preprocessor with the third argument set to True
# (the first run in this section passed False). This overwrites df_train,
# df_test and both transformer variables.
# NOTE(review): what the boolean switches on is defined inside
# preprocess_german_credit_data, which is not visible here -- confirm there.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_german_credit_data(german_credit_dir, 0.3, True, seed)
df_train.head()

In [None]:
# Refit a fresh balanced-class-weight forest on the current df_train.
# Every column except 'risk' is a feature; inputs go in as plain ndarrays.
train_features = df_train.drop('risk', axis=1)
train_labels = df_train['risk']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['risk'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['risk']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'risk' class among the rows of the current training frame.
df_train['risk'].value_counts().div(len(df_train))

### Adult

In [None]:
from preprocess_adult_data import preprocess_adult_data

In [None]:
# Path to the Adult CSV, passed to preprocess_adult_data.
# Despite the `_dir` suffix this is a file path, not a directory.
adult_dir = 'data/adult.csv'

In [None]:
# Run the Adult preprocessor; it returns the train and test frames plus
# label_transformers and metric_transformers.
# NOTE(review): the positional 0.3 and False are interpreted by
# preprocess_adult_data, which is not visible here -- presumably the test
# fraction and an on/off preprocessing switch; confirm against that module.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_adult_data(adult_dir, 0.3, False, seed)
df_train.head()

In [None]:
# Train a random forest with balanced class weights. Every column except
# 'income' is a feature; both inputs are converted to plain ndarrays first.
train_features = df_train.drop('income', axis=1)
train_labels = df_train['income']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['income'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['income']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'income' class among the training rows.
df_train['income'].value_counts().div(len(df_train))

In [None]:
# Re-run the Adult preprocessor with the third argument set to True (the
# first run in this section passed False). This overwrites df_train, df_test
# and both transformer variables.
# NOTE(review): what the boolean switches on is defined inside
# preprocess_adult_data, which is not visible here -- confirm there.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_adult_data(adult_dir, 0.3, True, seed)
df_train.head()

In [None]:
# Refit a fresh balanced-class-weight forest on the current df_train.
# Every column except 'income' is a feature; inputs go in as plain ndarrays.
train_features = df_train.drop('income', axis=1)
train_labels = df_train['income']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['income'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['income']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'income' class among the rows of the current training frame.
df_train['income'].value_counts().div(len(df_train))

### Diabetes

In [None]:
from preprocess_diabetes_data import preprocess_diabetes_data

In [None]:
# Path to the diabetes CSV, passed to preprocess_diabetes_data.
# Despite the `_dir` suffix this is a file path, not a directory.
diabetes_dir = 'data/diabetes.csv'

In [None]:
# Run the diabetes preprocessor; it returns the train and test frames plus
# label_transformers and metric_transformers.
# NOTE(review): the positional 0.3 and False are interpreted by
# preprocess_diabetes_data, which is not visible here -- presumably the test
# fraction and an on/off preprocessing switch; confirm against that module.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_diabetes_data(diabetes_dir, 0.3, False, seed)
df_train.head()

In [None]:
# Train a random forest with balanced class weights. Every column except
# 'diabetes' is a feature; both inputs are converted to plain ndarrays first.
train_features = df_train.drop('diabetes', axis=1)
train_labels = df_train['diabetes']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['diabetes'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['diabetes']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'diabetes' class among the training rows.
df_train['diabetes'].value_counts().div(len(df_train))

In [None]:
# Re-run the diabetes preprocessor with the third argument set to True (the
# first run in this section passed False). This overwrites df_train, df_test
# and both transformer variables.
# NOTE(review): what the boolean switches on is defined inside
# preprocess_diabetes_data, which is not visible here -- confirm there.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_diabetes_data(diabetes_dir, 0.3, True, seed)
df_train.head()

In [None]:
# Refit a fresh balanced-class-weight forest on the current df_train.
# Every column except 'diabetes' is a feature; inputs go in as plain ndarrays.
train_features = df_train.drop('diabetes', axis=1)
train_labels = df_train['diabetes']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['diabetes'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['diabetes']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'diabetes' class among the rows of the current training frame.
df_train['diabetes'].value_counts().div(len(df_train))

### Heart

In [None]:
from preprocess_heart_data import preprocess_heart_data

In [None]:
# Path to the heart CSV, passed to preprocess_heart_data.
# Despite the `_dir` suffix this is a file path, not a directory.
heart_dir = 'data/heart.csv'

In [None]:
# Run the heart preprocessor; it returns the train and test frames plus
# label_transformers and metric_transformers.
# NOTE(review): the positional 0.3 and False are interpreted by
# preprocess_heart_data, which is not visible here -- presumably the test
# fraction and an on/off preprocessing switch; confirm against that module.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_heart_data(heart_dir, 0.3, False, seed)
df_train.head()

In [None]:
# Train a random forest with balanced class weights. Every column except
# 'disease' is a feature; both inputs are converted to plain ndarrays first.
train_features = df_train.drop('disease', axis=1)
train_labels = df_train['disease']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['disease'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['disease']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'disease' class among the training rows.
df_train['disease'].value_counts().div(len(df_train))

In [None]:
# Re-run the heart preprocessor with the third argument set to True (the
# first run in this section passed False). This overwrites df_train, df_test
# and both transformer variables.
# NOTE(review): what the boolean switches on is defined inside
# preprocess_heart_data, which is not visible here -- confirm there.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_heart_data(heart_dir, 0.3, True, seed)
df_train.head()

In [None]:
# Refit a fresh balanced-class-weight forest on the current df_train.
# Every column except 'disease' is a feature; inputs go in as plain ndarrays.
train_features = df_train.drop('disease', axis=1)
train_labels = df_train['disease']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['disease'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['disease']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'disease' class among the rows of the current training frame.
df_train['disease'].value_counts().div(len(df_train))

### Mushrooms

In [None]:
from preprocess_mushrooms_data import preprocess_mushrooms_data

In [None]:
# Path to the mushrooms CSV, passed to preprocess_mushrooms_data.
# Despite the `_dir` suffix this is a file path, not a directory.
mushrooms_dir = 'data/mushrooms.csv'

In [None]:
# Run the mushrooms preprocessor; it returns the train and test frames plus
# label_transformers and metric_transformers.
# NOTE(review): the positional 0.3 and False are interpreted by
# preprocess_mushrooms_data, which is not visible here -- presumably the test
# fraction and an on/off preprocessing switch; confirm against that module.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_mushrooms_data(mushrooms_dir, 0.3, False, seed)
df_train.head()

In [None]:
# Train a random forest with balanced class weights. Every column except
# 'target' is a feature; both inputs are converted to plain ndarrays first.
train_features = df_train.drop('target', axis=1)
train_labels = df_train['target']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['target'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['target']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'target' class among the training rows.
df_train['target'].value_counts().div(len(df_train))

In [None]:
# Re-run the mushrooms preprocessor with the third argument set to True (the
# first run in this section passed False). This overwrites df_train, df_test
# and both transformer variables.
# NOTE(review): what the boolean switches on is defined inside
# preprocess_mushrooms_data, which is not visible here -- confirm there.
(
    df_train,
    df_test,
    label_transformers,
    metric_transformers,
) = preprocess_mushrooms_data(mushrooms_dir, 0.3, True, seed)
df_train.head()

In [None]:
# Refit a fresh balanced-class-weight forest on the current df_train.
# Every column except 'target' is a feature; inputs go in as plain ndarrays.
train_features = df_train.drop('target', axis=1)
train_labels = df_train['target']
rf = RandomForestClassifier(random_state=seed, class_weight='balanced')
rf.fit(np.asarray(train_features), np.asarray(train_labels))

In [None]:
# Evaluate on the held-out frame. The forest was fitted on a plain ndarray
# (np.asarray of the training features), so predict on an ndarray as well:
# handing predict() the DataFrame makes scikit-learn warn that X has feature
# names the estimator was fitted without.
y_pred = rf.predict(np.asarray(df_test.drop(columns=['target'])))

# Binary confusion matrix flattened row by row: tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(np.asarray(df_test['target']), y_pred).ravel()
fp_rate = fp / (fp + tn)            # false positives over actual negatives
fn_rate = fn / (fn + tp)            # false negatives over actual positives
precision = tp / (tp + fp)          # true positives over predicted positives
recall = tp / (tp + fn)             # true positives over actual positives
accuracy = (tp + tn) / len(y_pred)  # correct predictions over all test rows

print('fp rate:', fp_rate)
print('fn rate:', fn_rate)
print('precision:', precision)
print('recall:', recall)
print('accuracy:', accuracy)

In [None]:
# Share of each 'target' class among the rows of the current training frame.
df_train['target'].value_counts().div(len(df_train))