In [1]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
import joblib

### Retrieve AIDS Group Study Dataset

In [2]:
# fetch dataset 
aids_clinical_trials_group_study_175 = fetch_ucirepo(id=890) 
  
# data (as pandas dataframes) 
X = aids_clinical_trials_group_study_175.data.features 
y = aids_clinical_trials_group_study_175.data.targets 

In [3]:
# metadata 
print(aids_clinical_trials_group_study_175.metadata) 
  
# variable information 
print(aids_clinical_trials_group_study_175.variables) 

{'uci_id': 890, 'name': 'AIDS Clinical Trials Group Study 175', 'repository_url': 'https://archive.ics.uci.edu/dataset/890/aids+clinical+trials+group+study+175', 'data_url': 'https://archive.ics.uci.edu/static/public/890/data.csv', 'abstract': 'The AIDS Clinical Trials Group Study 175 Dataset contains healthcare statistics and categorical information about patients who have been diagnosed with AIDS. This dataset was initially published in 1996. The prediction task is to predict whether or not each patient died within a certain window of time or not. ', 'area': 'Life Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 2139, 'num_features': 23, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Sexual Orientation', 'Race', 'Gender'], 'target_col': ['cid'], 'index_col': ['pidnum'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1996, 'last_updated': 'Wed Sep 27 2023

In [4]:
print(X)

      time  trt  age      wtkg  hemo  homo  drugs  karnof  oprior  z30  ...  \
0      948    2   48   89.8128     0     0      0     100       0    0  ...   
1     1002    3   61   49.4424     0     0      0      90       0    1  ...   
2      961    3   45   88.4520     0     1      1      90       0    1  ...   
3     1166    3   47   85.2768     0     1      0     100       0    1  ...   
4     1090    0   43   66.6792     0     1      0     100       0    1  ...   
...    ...  ...  ...       ...   ...   ...    ...     ...     ...  ...  ...   
2134  1091    3   21   53.2980     1     0      0     100       0    1  ...   
2135   395    0   17  102.9672     1     0      0     100       0    1  ...   
2136  1104    2   53   69.8544     1     1      0      90       0    1  ...   
2137   465    0   14   60.0000     1     0      0     100       0    0  ...   
2138  1045    3   45   77.3000     1     0      0     100       0    0  ...   

      gender  str2  strat  symptom  treat  offtrt  

In [5]:
print(y)

      cid
0       0
1       1
2       0
3       0
4       0
...   ...
2134    0
2135    0
2136    0
2137    1
2138    0

[2139 rows x 1 columns]


### Preprocess and Split Data

In [6]:
X_matrix = X.to_numpy()
scaler = preprocessing.StandardScaler().fit(X_matrix)
X_scaled = scaler.transform(X_matrix)
X_scaled

array([[ 0.2357988 ,  0.42496   ,  1.46454203, ...,  0.73092701,
        -0.87615103, -1.37425994],
       [ 0.42059994,  1.31177897,  2.95759528, ..., -1.06020741,
        -1.23858648, -0.83477895],
       [ 0.28028796,  1.31177897,  1.11999128, ..., -0.6729351 ,
         2.24204361,  2.15259704],
       ...,
       [ 0.76966875,  0.42496   ,  2.03879328, ..., -0.05053318,
         0.84229291,  0.23743952],
       [-1.4171447 , -1.34867794, -2.44036645, ..., -1.39907067,
         0.02577167,  2.02896598],
       [ 0.5677564 ,  1.31177897,  1.11999128, ...,  3.86368335,
        -0.21168604, -0.92019677]])

In [7]:
y_vec = y.to_numpy().reshape((len(y), ))
y_vec

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [8]:
n_examples = len(X)
p = 0.8 # percentage of data used in training phase
X_train, X_test = X_scaled[:int(p * n_examples)], X_scaled[int(p * n_examples):]
y_train, y_test = y_vec[:int(p * n_examples)], y_vec[int(p * n_examples):]

### Train Logistic Regression Model

In [None]:
# Train logistic regression model 
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

In [None]:
def get_accuracy(model, X, y):
    n_correct = (model.predict(X) == y).sum()
    return 1.0 * n_correct / len(X)

accuracy = get_accuracy(clf, X_test, y_test)
print(f"Accuracy on full dataset: {100 * accuracy:0.2f}%")

Accuracy on full dataset: 83.18%


In [None]:
joblib.dump(clf, "clf.joblib")

['clf.joblib']