In [1]:
# ***************************************************************************************************
# This is plain Python code without any Qiskit functions or quantum circuits.
# Here you will find a simple version of a classical machine learning algorithm for binary classification.
# ***************************************************************************************************

# ***************************************************************************************************
# Code from:
# Zickert, F. (2021). Chapter 2: Binary Classification. 
# In F. Zickert, Hands-On Quantum Machine Learning With Python: Volume 1: Get Started, PyQML, 2021. 
# Available on https://www.pyqml.com/
# ***************************************************************************************************

# ***************************************************************************************************
# Modifications have been made by Gustavo Patino.
# Engineering School
# University of Antioquia.
# Medellin, Colombia
# September, 2022.
# ***************************************************************************************************

In [2]:
# ***************************************************
## 1. Look at the data
# ***************************************************

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
# Put the parent directory first on the module search path (for imports from the Qiskit Lab drive):
import sys
sys.path.insert(0, '..')

In [5]:
# Listing 2.1: Load the data from the csv‐files
import pandas as pd

# Both paths are relative to the notebook's working directory.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [6]:
# Listing 2.2: The shapes of the Titanic datasets
print('train has {} rows and {} columns'.format(*train.shape))
print('test has {} rows and {} columns'.format(*test.shape))

train has 891 rows and 12 columns
test has 418 rows and 11 columns


In [7]:
# Listing 2.3: The structure of the train dataset
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Listing 2.4: The structure of the test dataset
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [9]:
# Listing 2.5: Look at the data
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# ***************************************************
## 2. Data Preparation and Cleaning
# ***************************************************

In [10]:
# Listing 2.6: Cope with missing values

# Option 1 - drop the affected rows.
# Only two passengers have no port of embarkation, so losing those rows is acceptable.
train = train.dropna(subset=["Embarked"])

# Option 2 - drop the whole column.
# The cabin is known for only a small share of the passengers, so the column is removed.
train = train.drop(columns="Cabin")

# Option 3 - fill the gaps.
# The age is missing for many passengers, but intuition says it may matter for
# someone's chance to survive, so missing ages are replaced by the mean age.
mean = train["Age"].mean()
train = train.assign(Age=train["Age"].fillna(mean))

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [11]:
## 2.1 Deleting useless information

In [12]:
# Listing 2.7: Unique values in columns
# nunique() counts the distinct values of a column. A column that is distinct
# for (almost) every row identifies a passenger instead of describing one.
print('There are {} different (unique) PassengerIds in the data'
    .format(train["PassengerId"].nunique()))
print('There are {} different (unique) names in the data'
    .format(train["Name"].nunique()))
print('There are {} different (unique) ticket numbers in the data'
    .format(train["Ticket"].nunique()))

There are 889 different (unique) PassengerIds in the data
There are 889 different (unique) names in the data
There are 680 different (unique) ticket numbers in the data


In [13]:
# Listing 2.8: Remove identifying data
# The id, the name and the ticket number identify a passenger rather than
# describe one, so all three columns are removed in a single call.
train = train.drop(columns=["PassengerId", "Name", "Ticket"])

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


In [14]:
# Listing 2.9: Transforming textual data into numbers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Replace each textual column by integer codes; the encoder is re-fitted for every column.
for col in ('Sex', 'Embarked'):
    train[col] = le.fit_transform(train[col])

train.head()



Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [15]:
# Listing 2.10: The maximum values
print('The maximum age is {}'.format(train["Age"].max()))
print('The maximum fare is {}'.format(train["Fare"].max()))


The maximum age is 80.0
The maximum fare is 512.3292


In [16]:
# Listing 2.11: Normalization of the data.
from sklearn.preprocessing import MinMaxScaler

# Rescale every column to the range [0, 1]. From here on `train` is a NumPy
# array and no longer a DataFrame.
scaler = MinMaxScaler()
train = scaler.fit_transform(train)

print('The minimum value is {} and the maximum value is {}'
    .format(train.min(), train.max()))

The minimum value is 0.0 and the maximum value is 1.0


In [17]:
print(train)

[[0.         1.         1.         ... 0.         0.01415106 1.        ]
 [1.         0.         0.         ... 0.         0.13913574 0.        ]
 [1.         1.         0.         ... 0.         0.01546857 1.        ]
 ...
 [0.         1.         0.         ... 0.33333333 0.04577135 1.        ]
 [1.         0.         1.         ... 0.         0.0585561  0.        ]
 [0.         1.         1.         ... 0.         0.01512699 0.5       ]]


In [18]:
################################################
## Separating train and test sets
################################################

In [19]:
# Listing 2.12: Separating input from labels and training from testing sets
from sklearn.model_selection import train_test_split

# Column 0 holds the label (Survived); columns 1 to 7 hold the seven input features.
input_data = train[:, 1:8]
labels = train[:, 0]

# A fixed random_state makes the 80/20 split reproducible: the same rows end up
# in the training and testing sets (and in the files saved from them) on every run.
train_input, test_input, train_labels, test_labels = train_test_split(
    input_data, labels, test_size=0.2, random_state=42)

print('We have {} training and {} testing rows'.format(train_input.shape[0], test_input.shape[0]))
print('There are {} input columns'.format(train_input.shape[1]))

We have 711 training and 178 testing rows
There are 7 input columns


In [20]:
# Listing 2.13: Save the data to the filesystem
import numpy as np

# Each file receives two arrays written back to back: first the inputs, then the labels.
with open('data/train.npy', 'wb') as f:
    np.save(f, train_input)
    np.save(f, train_labels)

with open('data/test.npy', 'wb') as f:
    np.save(f, test_input)
    np.save(f, test_labels)
# NOTE(review): reading a file back presumably needs two consecutive np.load calls on one open file handle - confirm in the consuming notebook.

In [21]:
print(train_input)

[[0.5        1.         0.33400352 ... 0.         0.02537431 1.        ]
 [0.         1.         0.22090978 ... 0.         0.21255864 0.        ]
 [1.         0.         0.36720398 ... 0.         0.03025399 0.5       ]
 ...
 [0.5        1.         0.58532295 ... 0.         0.02927805 1.        ]
 [1.         0.         0.17064589 ... 0.         0.02194234 0.        ]
 [1.         0.         0.36720398 ... 0.33333333 0.0436405  0.        ]]


In [None]:
################################################
# 3. First Classifiers
## 3.1 A Random Classifier
################################################

In [22]:
# Listing 2.14: A random classifier
import random
random.seed(a=None, version=2)  # a=None: no fixed seed, so every run draws a different sequence

def ramd_classify(passenger):
    """Guess at random whether a passenger survived.

    The passenger's data is ignored; the result is the integer 0 or 1, each
    with equal probability.
    """
    return random.randrange(0, 2)

In [23]:
# Listing 2.15: The classification runner
def run(f_classify, x):
    """Apply ``f_classify`` to every item of ``x`` and return the results as a list."""
    return [f_classify(item) for item in x]

In [24]:
# Listing 2.16: Run the classifier
result = run(ramd_classify, train_input)
print(result)

[1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 

In [25]:
# Listing 2.17: Evaluate the classifier
def evaluate(predictions, actual):
    """Summarise how many predictions match the actual labels.

    Parameters
    ----------
    predictions : sequence
        Predicted labels, in the same order as ``actual``.
    actual : sequence
        True labels.

    Returns
    -------
    str
        A sentence of the form
        ``'<hits> correct predictions out of <total>. Accuracy <pct> %'``.
        For an empty ``actual`` the accuracy is reported as 0 % instead of
        raising ``ZeroDivisionError``.
    """
    total = len(actual)
    # Pairs are compared position by position; zip stops at the shorter sequence.
    hits = sum(1 for predicted, truth in zip(predictions, actual) if predicted == truth)
    # Guard the division: without labels there is nothing to be accurate about.
    accuracy = 100 * hits / total if total else 0
    return '{} correct predictions out of {}. Accuracy {:.0f} %' \
        .format(hits, total, accuracy)

print(evaluate(run(ramd_classify, train_input), train_labels))

351 correct predictions out of 711. Accuracy 49 %


In [26]:
### 3.2 Predicting all dead - I'm too pessimistic!

In [27]:
# Listing 2.18: Always predict a passenger died
def predict_death(item):
    """Classify every passenger as having died (label 0); ``item`` is ignored."""
    died = 0
    return died

print(evaluate(run(predict_death, train_input), train_labels))

440 correct predictions out of 711. Accuracy 62 %


In [28]:
################################################
## 4. Classifier Evaluation and Measures
################################################

In [29]:
# Listing 2.19: Confusion matrix of the predict death classifier
from sklearn.metrics import confusion_matrix

predictions = run(predict_death, train_input)
confusion_matrix(train_labels, predictions)

array([[440,   0],
       [271,   0]])

In [30]:
# Listing 2.20: The precision score
from sklearn.metrics import precision_score
print('The precision score of the predict_death classifier is {}'
    .format(precision_score(train_labels, predictions)))

The precision score of the predict_death classifier is 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Listing 2.21: The recall score
from sklearn.metrics import recall_score
print('The recall score of the predict_death classifier is {}'
    .format(recall_score(train_labels, predictions)))

The recall score of the predict_death classifier is 0.0


In [32]:
# Listing 2.22: The specificity and the npv
def specificity(matrix):
    """Return ``matrix[0][0]`` divided by the sum of row 0 of a confusion matrix.

    Row 0 is read as the actual negatives, so this is TN / (TN + FP).
    The result is 0 when that row sums to nothing.
    """
    true_negatives = matrix[0][0]
    actual_negatives = true_negatives + matrix[0][1]
    if actual_negatives > 0:
        return true_negatives / actual_negatives
    return 0

def npv(matrix):
    """Return ``matrix[0][0]`` divided by the sum of column 0 of a confusion matrix.

    Column 0 is read as the predicted negatives, so this is the negative
    predictive value TN / (TN + FN). The result is 0 when that column sums to
    nothing.
    """
    true_negatives = matrix[0][0]
    predicted_negatives = true_negatives + matrix[1][0]
    if predicted_negatives > 0:
        return true_negatives / predicted_negatives
    return 0

cm = confusion_matrix(train_labels, predictions)

print('The specificity score of the predict_death classifier is {:.2f}'.format(specificity(cm)))
print('The npv score of the predict_death classifier is {:.2f}'.format(npv(cm)))

The specificity score of the predict_death classifier is 1.00
The npv score of the predict_death classifier is 0.62


In [33]:
# Listing 2.23: The scores of the random classifier
# A fresh set of random predictions is drawn here, so the matrix and the
# scores printed below differ from run to run.
random_predictions = run(ramd_classify, train_input)
random_cm = confusion_matrix(train_labels, random_predictions)
print('The Confusion Matrix for the random classification is:')
print(random_cm)

print('The precision score of the random classifier is {:.2f}'
    .format(precision_score(train_labels, random_predictions)))
print('The recall score of the random classifier is {:.2f}'
    .format(recall_score(train_labels, random_predictions)))
print('The specificity score of the random classifier is {:.2f}'
    .format(specificity(random_cm)))
print('The npv score of the random classifier is {:.2f}'
    .format(npv(random_cm)))

The Confusion Matrix for the random classification is:
[[231 209]
 [126 145]]
The precision score of the random classifier is 0.41
The recall score of the random classifier is 0.54
The specificity score of the random classifier is 0.53
The npv score of the random classifier is 0.65


In [None]:
################################################
## 5. Giving the Classification Report
################################################

In [34]:
# Listing 2.31: A reusable function to unmask the classifier
def classifier_report(name, run, classify, input, labels):
    """Print precision, recall, specificity, npv and their mean for one classifier.

    Parameters
    ----------
    name : str
        Label inserted into every printed score line.
    run : callable
        Runner called as ``run(classify, input)``; it must return the predictions.
    classify : callable
        The classifier handed to ``run``.
    input : array-like
        Items to classify.
    labels : array-like
        True labels that the predictions are scored against.

    Nothing is returned; five lines are written to standard output.
    """
    predicted = run(classify, input)
    matrix = confusion_matrix(labels, predicted)

    precision = precision_score(labels, predicted)
    recall = recall_score(labels, predicted)
    spec = specificity(matrix)
    neg_pred_value = npv(matrix)
    # The "information level" is the plain average of the four scores.
    level = 0.25*(precision + recall + spec + neg_pred_value)

    score_lines = (
        ('The precision score of the {} classifier is {:.2f}', precision),
        ('The recall score of the {} classifier is {:.2f}', recall),
        ('The specificity score of the {} classifier is {:.2f}', spec),
        ('The npv score of the {} classifier is {:.2f}', neg_pred_value),
    )
    for template, score in score_lines:
        print(template.format(name, score))
    print('The information level is: {:.2f}'.format(level))

In [35]:
# Listing 2.32: The report of the random classifier
classifier_report(
    "Random Classifier", 
    run,
    ramd_classify,
    train_input,
    train_labels)

The precision score of the Random Classifier classifier is 0.35
The recall score of the Random Classifier classifier is 0.47
The specificity score of the Random Classifier classifier is 0.47
The npv score of the Random Classifier classifier is 0.59
The information level is: 0.47


In [36]:
# Same report as in Listing 2.32, now for the predict_death ("all dead") classifier
classifier_report(
    "all dead", 
    run,
    predict_death,
    train_input,
    train_labels)

The precision score of the all dead classifier is 0.00
The recall score of the all dead classifier is 0.00
The specificity score of the all dead classifier is 1.00
The npv score of the all dead classifier is 0.62
The information level is: 0.40


  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
import platform,socket,re,uuid,json,psutil,logging

def getSystemInfo():
    """Collect basic facts about the host and return them as a JSON string.

    Any exception raised while the values are gathered is logged through
    ``logging.exception``; in that case the function returns ``None``.

    NOTE(review): the result contains the host name, IP address and MAC
    address - consider clearing this cell's output before sharing the notebook.
    """
    try:
        return json.dumps({
            'platform': platform.system(),
            'platform-release': platform.release(),
            'platform-version': platform.version(),
            'Python-version': platform.python_version(),
            'architecture': platform.machine(),
            'hostname': socket.gethostname(),
            'ip-address': socket.gethostbyname(socket.gethostname()),
            # Node id rendered as 12 hex digits and split into colon-separated pairs.
            'mac-address': ':'.join(re.findall('..', '%012x' % uuid.getnode())),
            'processor': platform.processor(),
            # Total memory in bytes divided by 1024**3 and rounded to a whole number.
            'ram': str(round(psutil.virtual_memory().total / (1024.0 **3)))+" GB",
        })
    except Exception as error:
        logging.exception(error)

json.loads(getSystemInfo())

{'platform': 'Linux',
 'platform-release': '4.15.0-191-generic',
 'platform-version': '#202-Ubuntu SMP Thu Aug 4 01:49:29 UTC 2022',
 'Python-version': '3.8.13',
 'architecture': 'x86_64',
 'hostname': 'jupyter-5ae8705e0f020500393111cc',
 'ip-address': '172.30.243.100',
 'mac-address': '92:51:5d:49:53:72',
 'processor': 'x86_64',
 'ram': '31 GB'}