In [1]:
from path import Path
import pandas as pd

In [2]:
# Location of the cleaned dataset, relative to the notebook's working directory.
data = "clean_data.csv"

# Read the CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=10)

Unnamed: 0,id_registro,uci,neumonia,diabetes,epoc,asma,cardiovascular,inmusupr,hipertension,renal_cronica,tabaquismo
0,z2eace,1,1,1,1,1,1,1,1,1,1
1,z3c500,1,0,1,1,1,1,1,0,1,1
2,z39e04,1,0,1,1,1,1,1,1,1,0
3,z50698,1,0,0,1,1,1,1,1,1,1
4,1c4536,1,1,1,1,1,1,1,1,1,1
5,8-Feb-00,1,1,1,1,1,1,1,1,1,1
6,060aa5,0,0,1,1,1,1,1,1,1,1
7,1c250e,1,0,0,1,1,1,1,0,1,1
8,01094d,1,0,1,1,1,1,1,1,1,1
9,46071,1,0,1,1,1,1,1,1,1,1


In [3]:
# Count the non-null entries in each column (a quick check for missing values).
df.count()

id_registro       96118
uci               96118
neumonia          96118
diabetes          96118
epoc              96118
asma              96118
cardiovascular    96118
inmusupr          96118
hipertension      96118
renal_cronica     96118
tabaquismo        96118
dtype: int64

 ## Separate the Features (X) from the Target (y)

In [4]:
# Target: the "uci" column, cast to integer class labels.
y = df["uci"].astype("int")

# Features: everything except the target and "id_registro"
# (which looks like a per-row record id in the preview above, not a predictor).
X = df.drop(["uci", "id_registro"], axis="columns")

# Show the dtype of every feature column.
X.dtypes

neumonia          int64
diabetes          int64
epoc              int64
asma              int64
cardiovascular    int64
inmusupr          int64
hipertension      int64
renal_cronica     int64
tabaquismo        int64
dtype: object

 ## Split our data into training and testing

In [5]:
from collections import Counter
from sklearn.model_selection import train_test_split

# Stratified split: both partitions keep the class ratio of y.
# test_size is left at scikit-learn's default; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Class counts in the training partition.
Counter(y_train)

Counter({1: 66432, 0: 5656})

 ## Create a Logistic Regression Model

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the L-BFGS solver, at most 200 iterations, fixed seed.
classifier = LogisticRegression(random_state=1, max_iter=200, solver="lbfgs")

 ## Fit (train) our model using the training data

In [7]:
# Train on the original (not resampled) training split.
# fit() returns the estimator itself, so its repr is what the cell displays.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

 ## Make predictions

In [8]:
# Predict a class label for every row of the test split.
y_pred = classifier.predict(X_test)

# Put predictions next to the true labels and renumber the rows from 0.
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,0
8,1,1
9,1,1


In [9]:
from sklearn.metrics import accuracy_score
# Fraction of test rows predicted correctly.
# NOTE(review): the training split is heavily skewed toward class 1
# (66432 vs 5656 in the Counter output above), so a high accuracy can be
# reached without ever predicting class 0 - read it together with the
# confusion matrix and the per-class metrics.
print(accuracy_score(y_test, y_pred))

0.9215147732001665


In [10]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred)

# Wrap the raw array in a labelled DataFrame for display.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,1886
Actual 1,0,22144


In [11]:
# Text report with per-class precision, recall, F1 and support.
# NOTE(review): the `_warn_prf` lines in the saved output are presumably
# scikit-learn's undefined-metric warning, raised because class 0 is never
# predicted by this model - confirm.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1886
           1       0.92      1.00      0.96     22144

    accuracy                           0.92     24030
   macro avg       0.46      0.50      0.48     24030
weighted avg       0.85      0.92      0.88     24030



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
from sklearn.metrics import balanced_accuracy_score
# Balanced accuracy is the mean of the per-class recalls, so the majority
# class cannot inflate it the way it inflates plain accuracy.
balanced_accuracy_score(y_test, y_pred)

0.5

In [13]:
from imblearn.metrics import classification_report_imbalanced
# Imbalance-aware report: besides precision/recall/F1 it prints the
# spe, geo and iba columns seen in the output below.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.00      0.00      1.00      0.00      0.00      0.00      1886
          1       0.92      1.00      0.00      0.96      0.00      0.00     22144

avg / total       0.85      0.92      0.08      0.88      0.00      0.00     24030



  _warn_prf(average, modifier, msg_start, len(result))


## Over sampling

In [14]:
from imblearn.over_sampling import RandomOverSampler

# Resample the training split only; the test split stays untouched so that
# evaluation still reflects the original class distribution.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X=X_train, y=y_train)

In [15]:
# Refit the same `classifier` object, now on the randomly oversampled training data.
# Later cells evaluate whichever fit ran most recently, so run the cells in order.
classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=1)

In [16]:
# Score the untouched test split with the model trained on oversampled data.
y_pred = classifier.predict(X_test)

# Side-by-side view of predicted vs. true labels, re-indexed from 0.
results = pd.DataFrame(data={"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,1
1,0,1
2,0,1
3,1,1
4,1,1
5,1,1
6,0,1
7,0,0
8,1,1
9,1,1


In [17]:
# The import repeats an earlier cell; it is harmless and lets this cell run on its own.
from sklearn.metrics import accuracy_score
# Plain accuracy of the model trained on the oversampled data, on the original test split.
print(accuracy_score(y_test, y_pred))

0.5445276737411568


In [18]:
from sklearn.metrics import confusion_matrix, classification_report

# Actual-vs-predicted counts for the model trained on oversampled data.
cm = confusion_matrix(y_test, y_pred)

# Label rows (actual class) and columns (predicted class) for display.
row_labels = ["Actual 0", "Actual 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=["Predicted 0", "Predicted 1"])
del row_labels  # keep the notebook namespace unchanged

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1504,382
Actual 1,10563,11581


In [19]:
# Per-class precision / recall / F1 for the model trained on oversampled data.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.12      0.80      0.22      1886
           1       0.97      0.52      0.68     22144

    accuracy                           0.54     24030
   macro avg       0.55      0.66      0.45     24030
weighted avg       0.90      0.54      0.64     24030



In [20]:
# The import repeats an earlier cell; it is harmless and lets this cell run on its own.
from sklearn.metrics import balanced_accuracy_score
# Mean of the per-class recalls for the model trained on oversampled data.
balanced_accuracy_score(y_test, y_pred)

0.660220420737837

In [21]:
# The import repeats an earlier cell; it is harmless and lets this cell run on its own.
from imblearn.metrics import classification_report_imbalanced
# Imbalance-aware report for the model trained on oversampled data.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.12      0.80      0.52      0.22      0.65      0.43      1886
          1       0.97      0.52      0.80      0.68      0.65      0.41     22144

avg / total       0.90      0.54      0.78      0.64      0.65      0.41     24030



## SMOTE

In [22]:
from imblearn.over_sampling import SMOTE

# Synthesize minority-class rows for the training split only; seed fixed.
# NOTE(review): SMOTE builds new rows by interpolating between neighbours, while
# the features here look like 0/1 indicators (all int64 in the dtype listing).
# Consider whether a categorical variant such as SMOTEN fits better - confirm.
X_resampled, y_resampled = SMOTE(
    sampling_strategy="auto",
    random_state=1,
).fit_resample(X_train, y_train)

In [23]:
# Refit on the SMOTE-resampled training data, then predict the untouched test split
# (fit() returns the estimator, so the two calls can be chained).
y_pred = classifier.fit(X_resampled, y_resampled).predict(X_test)

# Predictions next to the true labels, re-indexed from 0.
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)

# Plain accuracy on the test split.
print(accuracy_score(y_test, y_pred))

0.5439450686641698


In [25]:
# Actual-vs-predicted counts for the model trained on SMOTE-resampled data.
cm = confusion_matrix(y_test, y_pred)

# Labelled view: rows are actual classes, columns are predicted classes.
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted 0", "Predicted 1"],
    index=["Actual 0", "Actual 1"],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1507,379
Actual 1,10580,11564


In [26]:
# Per-class precision / recall / F1 for the model trained on SMOTE-resampled data.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.12      0.80      0.22      1886
           1       0.97      0.52      0.68     22144

    accuracy                           0.54     24030
   macro avg       0.55      0.66      0.45     24030
weighted avg       0.90      0.54      0.64     24030



In [27]:
# Mean of the per-class recalls for the model trained on SMOTE-resampled data.
balanced_accuracy_score(y_test, y_pred)

0.6606319036220646

In [28]:
# Imbalance-aware report for the model trained on SMOTE-resampled data.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.12      0.80      0.52      0.22      0.65      0.43      1886
          1       0.97      0.52      0.80      0.68      0.65      0.41     22144

avg / total       0.90      0.54      0.78      0.64      0.65      0.41     24030



## Under sampling

In [29]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows from the training split only; seed fixed.
# NOTE(review): `ros` is a holdover name from the oversampling cell. It now holds
# an *under*-sampler; the name is kept so the notebook namespace stays unchanged.
ros = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X=X_train, y=y_train)

# Class counts after undersampling.
Counter(y_resampled)

Counter({0: 5656, 1: 5656})

In [32]:
# Refit on the undersampled training data and predict the untouched test split
# (fit() returns the estimator, so the two calls can be chained).
y_pred = classifier.fit(X_resampled, y_resampled).predict(X_test)

# Predictions next to the true labels, re-indexed from 0.
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)

# Plain accuracy on the test split.
print(accuracy_score(y_test, y_pred))

# Confusion matrix as a labelled DataFrame (rows: actual, columns: predicted).
# It is the cell's last expression, so it is displayed.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
cm_df


0.5446941323345817


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1504,382
Actual 1,10559,11585


In [33]:

# Per-class precision / recall / F1 for the model trained on undersampled data.
report = classification_report(y_test, y_pred)
print(report)

# A notebook cell only displays a bare expression when it is the LAST statement.
# This call sits in the middle of the cell, so without print() the score was
# computed and silently discarded (it is missing from the saved output).
print(balanced_accuracy_score(y_test, y_pred))

# Imbalance-aware report for the same predictions.
print(classification_report_imbalanced(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.12      0.80      0.22      1886
           1       0.97      0.52      0.68     22144

    accuracy                           0.54     24030
   macro avg       0.55      0.66      0.45     24030
weighted avg       0.90      0.54      0.64     24030

                   pre       rec       spe        f1       geo       iba       sup

          0       0.12      0.80      0.52      0.22      0.65      0.43      1886
          1       0.97      0.52      0.80      0.68      0.65      0.41     22144

avg / total       0.90      0.54      0.78      0.64      0.65      0.41     24030

