In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option("display.max_columns", 119)

from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def fetch_data(drop_some=True, path="abnormal_writeout.data.csv"):
    """Load the data set from CSV, optionally discarding unused columns.

    Parameters
    ----------
    drop_some : bool, default True
        When true, remove every column positioned from "ACC" through "UVM"
        (both ends included), plus "oldest_phylostratum_factor" and
        "Unnamed: 0".
    path : str or path-like, default "abnormal_writeout.data.csv"
        CSV file to read. The default is the file name the notebook has
        always used, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The loaded frame; a new object with the columns removed when
        ``drop_some`` is true.

    Raises
    ------
    KeyError
        If ``drop_some`` is true and one of the named columns is absent.
    """
    df = pd.read_csv(path)

    if not drop_some:
        return df

    # Skip the contiguous run of columns from ACC to UVM.
    start_drop = df.columns.get_loc("ACC")
    end_drop = df.columns.get_loc("UVM")
    unwanted = list(df.columns[start_drop : end_drop + 1])

    # Also skip oldest_phylostratum_factor and the first column.
    unwanted += ["oldest_phylostratum_factor", "Unnamed: 0"]

    # A single non-mutating drop instead of several in-place ones.
    return df.drop(columns=unwanted)

In [3]:
# Load the frame before inspecting it: no earlier cell defines `df`, so on a
# fresh kernel this cell previously failed with a NameError.
df = fetch_data()
df.info()

In [None]:
# Heatmap of the pairwise column correlations of the loaded frame.
sns.heatmap(data=df.corr())

In [None]:
# Load the data with the unused columns dropped, then display the frame.
df = fetch_data(drop_some=True)
df  # 85 columns remain.

In [None]:
# Preprocessing for the feature columns: missing values are filled with the
# column median. No scaling step is active; to add one, append
# ("std_scaler", StandardScaler()) after the imputer.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

In [None]:
# TODO: change to dropna (presumably instead of the median imputation — confirm)

In [None]:
def clean_data(df):
    """Run the feature columns through ``num_pipeline`` and re-attach the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a "response" column; every other column is treated
        as a feature and passed to ``num_pipeline.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        The transformed features, with the original column names and row
        index, plus the untouched "response" column appended last.

    Raises
    ------
    KeyError
        If ``df`` has no "response" column.
    """
    labels = df["response"]
    features = df.drop(columns="response")

    # Carry the input index over to the rebuilt frame. With a fresh default
    # index, the label assignment below aligns by index and yields NaN or
    # mismatched labels whenever `df` is not indexed 0..n-1.
    df_clean = pd.DataFrame(
        num_pipeline.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    df_clean["response"] = labels

    return df_clean


def separate_data(df):
    """Split a frame into its two singled-out features, the rest, and the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "response", "occ_total_sum" and
        "oldest_phylostratum".

    Returns
    -------
    tuple
        ``(occ, age, conf, resp)``: the "occ_total_sum" column, the
        "oldest_phylostratum" column, a frame of all remaining columns, and
        the "response" column.
    """
    singled_out = ["response", "occ_total_sum", "oldest_phylostratum"]
    target, occurrences, phylo_age = (df[name] for name in singled_out)

    # Everything that is neither the label nor one of the two named features.
    remaining = df.drop(columns=singled_out)

    return occurrences, phylo_age, remaining, target

In [None]:
# Impute the features, then pull apart the two named features, the remaining
# columns and the response labels.
df_clean = clean_data(df=df)
X_occ, X_age, X_conf, Y = separate_data(df=df_clean)

In [None]:
# Fit a PCA with all components on the confounding features and accumulate
# the explained-variance ratios.
pca = PCA()
pca.fit(X_conf)
cumsum = pca.explained_variance_ratio_.cumsum()

# Number of leading components needed for the cumulative ratio to reach 0.95
# (argmax returns the first position where the condition holds).
d = 1 + np.argmax(cumsum >= 0.95)
print("Using {} principal components.".format(d))

# Refit with exactly d components and project the confounders onto them.
pca_apply = PCA(n_components=d)
X_conf_PCA = pca_apply.fit_transform(X_conf)

# Correlation heatmap of the projected components.
sns.heatmap(data=pd.DataFrame(X_conf_PCA).corr())

In [None]:
# Logistic regression with balanced class weights, fitted on progressively
# larger feature sets. Each entry pairs the printed title with the column
# blocks that are stacked side by side to form X.
feature_sets = [
    ("Single feature: occ_total_sum", [X_occ]),
    ("Single feature: oldest_phylostratum", [X_age]),
    ("Multiple feature: occ_total_sum + oldest_phylostratum", [X_occ, X_age]),
    ("Multiple feature: All features", [X_occ, X_age, X_conf]),
    ("Multiple feature: All features with PCA", [X_occ, X_age, X_conf_PCA]),
]

# Kept as a top-level loop (not a function) so the names the original cell
# left behind (X, x_train, ..., log_reg, log_acc, log_auc) still end up
# holding the values of the last feature set.
for title, parts in feature_sets:
    # column_stack turns each 1-D block into a single column and keeps 2-D
    # blocks as they are.
    X = np.column_stack(parts)

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    log_reg = LogisticRegression(class_weight="balanced", max_iter=500)
    log_reg.fit(x_train, y_train)
    predictions = log_reg.predict(x_test)

    print(title)
    print("Predictions:\t", predictions[0:30])
    print("Labels:\t\t", y_test.values[0:30])

    log_acc = accuracy_score(y_test, predictions)
    # Score the AUC from class probabilities rather than log-probabilities:
    # the ranking (and so the AUC) is the same, but a log-probability becomes
    # -inf when a probability underflows to 0, and roc_auc_score rejects
    # non-finite scores.
    predictions_cont = log_reg.predict_proba(x_test)
    log_auc = roc_auc_score(y_test, predictions_cont[:, 1])

    print("Accuracy of", log_acc)
    print("AUC:", log_auc)
    print("------------------------")

In [None]:
# Logistic regression with default (unweighted) classes, fitted on
# progressively larger feature sets. Each entry pairs the printed title with
# the column blocks that are stacked side by side to form X.
feature_sets = [
    ("Single feature: occ_total_sum", [X_occ]),
    ("Single feature: oldest_phylostratum", [X_age]),
    ("Multiple feature: occ_total_sum + oldest_phylostratum", [X_occ, X_age]),
    ("Multiple feature: All features", [X_occ, X_age, X_conf]),
    ("Multiple feature: All features with PCA", [X_occ, X_age, X_conf_PCA]),
]

# Kept as a top-level loop (not a function) so the names the original cell
# left behind (X, x_train, ..., log_reg, log_acc, log_auc) still end up
# holding the values of the last feature set.
for title, parts in feature_sets:
    # column_stack turns each 1-D block into a single column and keeps 2-D
    # blocks as they are.
    X = np.column_stack(parts)

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    log_reg = LogisticRegression(max_iter=500)
    log_reg.fit(x_train, y_train)
    predictions = log_reg.predict(x_test)

    print(title)
    print("Predictions:\t", predictions[0:30])
    print("Labels:\t\t", y_test.values[0:30])

    log_acc = accuracy_score(y_test, predictions)
    # Score the AUC from class probabilities rather than log-probabilities:
    # the ranking (and so the AUC) is the same, but a log-probability becomes
    # -inf when a probability underflows to 0, and roc_auc_score rejects
    # non-finite scores.
    predictions_cont = log_reg.predict_proba(x_test)
    log_auc = roc_auc_score(y_test, predictions_cont[:, 1])

    print("Accuracy of", log_acc)
    print("AUC:", log_auc)
    print("------------------------")

In [None]:
log_reg.