In [5]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

import sklearn
from sklearn import preprocessing, svm 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 

In [6]:
# Load the data
train_data = '..\data\semen_analysis_data.csv' 
train_df = pd.read_csv(train_data, sep=";")

# drop rows with NaN values
train_df = train_df.dropna()
# print(train_df.head())

participant_data = '..\data\participant_related_data.csv' 
participant_df = pd.read_csv(participant_data, sep=";")

# drop rows with NaN values
participant_df = participant_df.dropna()
# print(participant_df.head())

data_df = pd.merge(train_df, participant_df, on='ID', how='inner')
print(data_df.head)

<bound method NDFrame.head of     ID  Sperm concentration (x10⁶/mL)  Total sperm count (x10⁶)  \
0    1                          105.3                     363.1   
1    2                           45.4                     141.6   
2    3                           50.0                     124.5   
3    4                           42.9                     103.8   
4    5                           32.8                     158.7   
..  ..                            ...                       ...   
80  81                           62.0                     280.9   
81  82                           22.0                     157.3   
82  83                          148.0                     478.0   
83  84                           43.0                     180.6   
84  85                          184.0                     496.8   

    Ejaculate volume (mL)  Sperm vitality (%)  Normal spermatozoa (%)  \
0                     3.5                  81                     2.0   
1                  

In [7]:
for index, col in enumerate(data_df.columns):
    print(f"IND {index}: {col} ")

IND 0: ID 
IND 1: Sperm concentration (x10⁶/mL) 
IND 2: Total sperm count (x10⁶) 
IND 3: Ejaculate volume (mL) 
IND 4: Sperm vitality (%) 
IND 5: Normal spermatozoa (%) 
IND 6: Head defects (%) 
IND 7: Midpiece and neck defects (%) 
IND 8: Tail defects (%) 
IND 9: Cytoplasmic droplet (%) 
IND 10: Teratozoospermia index 
IND 11: Progressive motility (%) 
IND 12: Non progressive sperm motility (%) 
IND 13: Immotile sperm (%) 
IND 14: High DNA stainability. HDS (%) 
IND 15: DNA fragmentation index. DFI (%) 
IND 16: Abstinence time(days) 
IND 17: Body mass index (kg/m²) 
IND 18: Age (years) 


In [8]:
# Train the model

# sperm vitality, progressive motility, non-progressive sperm motiility, and immorility 
    # are all related to motility (which we want to predict)
    # These correspond to indices 4, 11, 12, and 13

columns = data_df.columns
exclude = [4, 11, 12, 13]
motility_related_columns = columns[exclude]
lab_measurements = [columns[i] for i in np.arange(len(columns)) if i not in exclude ]

# print(motility_related_columns)
# print(lab_measurements)

measurements = data_df[lab_measurements]
values_to_predict = data_df[motility_related_columns]
progressive_motility = data_df[columns[11]]


In [9]:
import sklearn.linear_model

np.random.seed(13)


# Do the train-test split
X_train, X_test, y_train, y_test = train_test_split(measurements, progressive_motility, test_size = 0.2) 

max_iters = [100, 150, 200, 500, 750, 1000, 1500, 2000, 2500]

for max_iter in max_iters:

    regr_Hubert = sklearn.linear_model.HuberRegressor(max_iter=max_iter)
    regr_ARD = sklearn.linear_model.ARDRegression(max_iter=max_iter)
    regr_Bayes = sklearn.linear_model.BayesianRidge(max_iter=max_iter)
    regr_Elastic = sklearn.linear_model.ElasticNet(max_iter=max_iter)

    regr_Hubert.fit(X_train, y_train)
    regr_ARD.fit(X_train, y_train)
    regr_Bayes.fit(X_train, y_train)
    regr_Elastic.fit(X_train, y_train)

    print(f"Max iter {max_iter} for HubertRegressor: Score: {regr_Hubert.score(X_test, y_test)}")
    print(f"Max iter {max_iter} for ARDRegressor:  Score: {regr_ARD.score(X_test, y_test)}")
    print(f"Max iter {max_iter} for BayesRegressor:  Score: {regr_Bayes.score(X_test, y_test)}")
    print(f"Max iter {max_iter} for ElasticRegressor:  Score: {regr_Elastic.score(X_test, y_test)}")


    print()

Max iter 100 for HubertRegressor: Score: 0.5991545900083523
Max iter 100 for ARDRegressor:  Score: 0.4510455988191896
Max iter 100 for BayesRegressor:  Score: 0.36717108864165127
Max iter 100 for ElasticRegressor:  Score: 0.3271446151858969

Max iter 150 for HubertRegressor: Score: 0.43113888994312455
Max iter 150 for ARDRegressor:  Score: 0.4510455988191896
Max iter 150 for BayesRegressor:  Score: 0.36717108864165127
Max iter 150 for ElasticRegressor:  Score: 0.3271446151858969

Max iter 200 for HubertRegressor: Score: 0.3751246526248294
Max iter 200 for ARDRegressor:  Score: 0.4510455988191896
Max iter 200 for BayesRegressor:  Score: 0.36717108864165127
Max iter 200 for ElasticRegressor:  Score: 0.3271446151858969

Max iter 500 for HubertRegressor: Score: 0.3562496527324194
Max iter 500 for ARDRegressor:  Score: 0.4510455988191896
Max iter 500 for BayesRegressor:  Score: 0.36717108864165127
Max iter 500 for ElasticRegressor:  Score: 0.3271446151858969



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Max iter 750 for HubertRegressor: Score: 0.364470142162739
Max iter 750 for ARDRegressor:  Score: 0.4510455988191896
Max iter 750 for BayesRegressor:  Score: 0.36717108864165127
Max iter 750 for ElasticRegressor:  Score: 0.3271446151858969

Max iter 1000 for HubertRegressor: Score: 0.364470142162739
Max iter 1000 for ARDRegressor:  Score: 0.4510455988191896
Max iter 1000 for BayesRegressor:  Score: 0.36717108864165127
Max iter 1000 for ElasticRegressor:  Score: 0.3271446151858969

Max iter 1500 for HubertRegressor: Score: 0.364470142162739
Max iter 1500 for ARDRegressor:  Score: 0.4510455988191896
Max iter 1500 for BayesRegressor:  Score: 0.36717108864165127
Max iter 1500 for ElasticRegressor:  Score: 0.3271446151858969

Max iter 2000 for HubertRegressor: Score: 0.364470142162739
Max iter 2000 for ARDRegressor:  Score: 0.4510455988191896
Max iter 2000 for BayesRegressor:  Score: 0.36717108864165127
Max iter 2000 for ElasticRegressor:  Score: 0.3271446151858969

Max iter 2500 for Hubert

HubertRgressor converges for approx 750 max iters. It obtains a score of 0.36.
The ARDRegressor performs better with a score of 0.45

It is still off from the target of score >0.85 

We try to do some normalization

In [10]:
# Normalize the data using a MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(data_df)

normalized_data = scaler.transform(data_df)

print(normalized_data.shape)

(85, 19)


In [11]:

# sperm vitality, progressive motility, non-progressive sperm motiility, and immorility 
    # are all related to motility (which we want to predict)
    # These correspond to indices 4, 11, 12, and 13

columns = data_df.columns
exclude = [4, 11, 12, 13]
motility_related_columns = exclude
lab_measurements = [i for i in np.arange(len(columns)) if i not in exclude ]

# print(motility_related_columns)
# print(lab_measurements)

measurements = normalized_data[:, lab_measurements]
values_to_predict = normalized_data[:, motility_related_columns]
progressive_motility = normalized_data[:, 11]


In [12]:
np.random.seed(13)


# Do the train-test split
X_train, X_test, y_train, y_test = train_test_split(measurements, progressive_motility, test_size = 0.2) 

max_iters = [100, 150, 200, 500, 750, 1000, 1500, 2000, 2500]

for max_iter in max_iters:

    regr_Hubert = sklearn.linear_model.HuberRegressor(max_iter=max_iter)
    regr_ARD = sklearn.linear_model.ARDRegression(max_iter=max_iter)
    regr_Bayes = sklearn.linear_model.BayesianRidge(max_iter=max_iter)
    regr_Elastic = sklearn.linear_model.ElasticNet(max_iter=max_iter)

    regr_Hubert.fit(X_train, y_train)
    regr_ARD.fit(X_train, y_train)
    regr_Bayes.fit(X_train, y_train)
    regr_Elastic.fit(X_train, y_train)

    print(f"Max iter {max_iter} for HubertRegressor: Score: {regr_Hubert.score(X_test, y_test)}")
    print(f"Max iter {max_iter} for ARDRegressor:  Score: {regr_ARD.score(X_test, y_test)}")
    print(f"Max iter {max_iter} for BayesRegressor:  Score: {regr_Bayes.score(X_test, y_test)}")
    print(f"Max iter {max_iter} for ElasticRegressor:  Score: {regr_Elastic.score(X_test, y_test)}")


    print()

Max iter 100 for HubertRegressor: Score: 0.11069918877164542
Max iter 100 for ARDRegressor:  Score: 0.45109971418330086
Max iter 100 for BayesRegressor:  Score: 0.35566330697011816
Max iter 100 for ElasticRegressor:  Score: -0.0012086104836011025

Max iter 150 for HubertRegressor: Score: 0.11085500307760454
Max iter 150 for ARDRegressor:  Score: 0.45109971418330086
Max iter 150 for BayesRegressor:  Score: 0.35566330697011816
Max iter 150 for ElasticRegressor:  Score: -0.0012086104836011025

Max iter 200 for HubertRegressor: Score: 0.11085500307760454
Max iter 200 for ARDRegressor:  Score: 0.45109971418330086
Max iter 200 for BayesRegressor:  Score: 0.35566330697011816
Max iter 200 for ElasticRegressor:  Score: -0.0012086104836011025

Max iter 500 for HubertRegressor: Score: 0.11085500307760454
Max iter 500 for ARDRegressor:  Score: 0.45109971418330086
Max iter 500 for BayesRegressor:  Score: 0.35566330697011816
Max iter 500 for ElasticRegressor:  Score: -0.0012086104836011025

Max iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
