Preprocessing for Screening Stage Cancer vs healthy

Import operating system, numpy, pandas, matplotlib, statsmodels, random libraries

In [16]:
import numpy as np 
import pandas as pd 
import matplotlib as mp 
import statsmodels.api as sm 
import random

Set seed to ensure reproducability

In [17]:
random.seed(123)


Read train and test data and visualise data

In [18]:
trainp_data = pd.read_csv('Train_Set.csv')
testp_data = pd.read_csv('Test_Set.csv')
print(trainp_data.head(5)) # information on first five data points
print(testp_data.head(5)) # information on first five data points
print(trainp_data.shape) # dimensions of train data set
print(testp_data.shape) # dimensions of test data set
print(trainp_data.describe()) # statistical information on train data set
print(testp_data.describe()) # statistical information on test data set


   length_51  length_52  length_53  length_54  length_55  length_56  \
0   0.000152   0.000081   0.000087   0.000092   0.000099   0.000125   
1   0.006256   0.006413   0.006512   0.006469   0.006810   0.007070   
2   0.003783   0.003886   0.004063   0.004237   0.004481   0.004832   
3   0.004635   0.004471   0.004383   0.004829   0.004920   0.005056   
4   0.011315   0.010045   0.009795   0.009906   0.010630   0.011467   

   length_57  length_58  length_59  length_60  ...  length_392  length_393  \
0   0.000122   0.000115   0.000151   0.000167  ...    0.007396    0.007193   
1   0.007748   0.008088   0.008671   0.008835  ...    0.017830    0.017033   
2   0.004960   0.005605   0.005919   0.006480  ...    0.010957    0.010481   
3   0.005475   0.006158   0.007174   0.007697  ...    0.010032    0.008933   
4   0.013024   0.014853   0.016874   0.017501  ...    0.009118    0.008535   

   length_394  length_395  length_396  length_397  length_398  length_399  \
0    0.006973    0.006481  

Check for missing data points in train and test data

In [19]:
print(trainp_data.isna().sum()) # check how many data points missing
print(trainp_data.iloc[:, -1]) # view response variable (status of cancer)
print(trainp_data['class_label'].value_counts()) # distribution of response variable (status of cancer)
print(testp_data.isna().sum()) # check how many data points missing
print(testp_data.iloc[:, -1]) # view response variable (status of cancer)
print(testp_data['class_label'].value_counts()) # distribution of response variable (status of cancer)

length_51      0
length_52      0
length_53      0
length_54      0
length_55      0
              ..
length_397     0
length_398     0
length_399     0
length_400     0
class_label    0
Length: 351, dtype: int64
0                  healthy
1                  healthy
2                  healthy
3                  healthy
4                  healthy
               ...        
2188    early stage cancer
2189    early stage cancer
2190    early stage cancer
2191    early stage cancer
2192    early stage cancer
Name: class_label, Length: 2193, dtype: object
class_label
early stage cancer        781
screening stage cancer    490
mid stage cancer          453
late stage cancer         409
healthy                    60
Name: count, dtype: int64
length_51      0
length_52      0
length_53      0
length_54      0
length_55      0
              ..
length_397     0
length_398     0
length_399     0
length_400     0
class_label    0
Length: 351, dtype: int64
0                 healthy
1               

Remove irrelevant classes from train and test data that are not either healthy or screening stage cancer

In [20]:
trainp_data = trainp_data[(trainp_data['class_label'] == 'healthy') | (trainp_data['class_label'] == 'screening stage cancer')] # remove non he;lathy and non screening stage cancer classes
trainp_data.reset_index(drop=True, inplace=True)
print(trainp_data['class_label'].value_counts()) 
testp_data = trainp_data[(testp_data['class_label'] == 'healthy') | 
(testp_data['class_label'] == 'screening stage cancer')] # remove non he;lathy and non screening stage cancer classes
testp_data.reset_index(drop=True, inplace=True)
print(testp_data['class_label'].value_counts()) 

class_label
screening stage cancer    490
healthy                    60
Name: count, dtype: int64
class_label
screening stage cancer    100
healthy                    46
Name: count, dtype: int64


  testp_data = trainp_data[(testp_data['class_label'] == 'healthy') |


Initialise x and y variables 

In [21]:
xvals_train = trainp_data.iloc[:, 0:350] # access train x values
yvals_train = trainp_data.iloc[:,350] # access train y values
xvals_test = testp_data.iloc[:, 0:350] # access test x values
yvals_test = testp_data.iloc[:, 350] # access test x values

Standardising x values in train and test dataset

In [22]:
from sklearn.preprocessing import StandardScaler 
std_x = StandardScaler() # initialise StandardScaler
x_std = std_x.fit_transform(xvals_train) # standardise x values
x_test_std = std_x.fit_transform(xvals_test) # standardise

Technique to remove regressors:

Perform logistic regression to identify insignifcant regressors based on p value

In [None]:
yvals_binary = yvals_train.map({'screening stage cancer': 1, 'healthy': 0}) # Convert "screening stage cancer" to 1 and "healthy" to 0
print(yvals_binary.unique()) 
logit_model = sm.Logit(yvals_binary, x_std) # carry out logistic regression
result = logit_model.fit()
p_values = result.pvalues # obtain p values of regressor features in model
significant_features_1 = list(p_values[p_values < 0.05].index.tolist()) # Filter out significant features with p-value < 0.05
insignificant_features_1 = list(p_values[p_values > 0.05].index.tolist()) # Filter out insignificant features with p-value > 0.05
print(len(insignificant_features_1) + len(significant_features_1))
print(insignificant_features_1)

In [None]:

### Perform logistic regression to identify insignifcant regressors based on p value
yvals_binary = yvals.map({'screening stage cancer': 1, 'healthy': 0}) # Convert "screening stage cancer" to 1 and "healthy" to 0
print(yvals_binary.unique()) # double confirm correct conversion of the y values 
logit_model = sm.Logit(yvals_binary, x_std) # carry out logistic regression
result = logit_model.fit()
print(result.summary()) # summary of model
p_values = result.pvalues # obtain p values of regressor features in model
significant_features_1 = list(p_values[p_values < 0.05].index.tolist()) # Filter out significant features with p-value < 0.05
insignificant_features_1 = list(p_values[p_values > 0.05].index.tolist()) # Filter out insignificant features with p-value > 0.05

### Perform L1 (lasso) regularisation to logistic regression model to idenitify insignifcant regressors with coefficients zero
from sklearn.linear_model import LogisticRegression
log_reg_l1 = LogisticRegression(penalty='l1', solver='liblinear')  # specify solver for L1 regularization
log_reg_l1.fit(x_std, yvals_binary)  # train model
significant_features_2 = list(xvals.columns[log_reg_l1.coef_[0] != 0]) # check if there are any signifcant features identified
insignificant_features_2 = list(xvals.columns[log_reg_l1.coef_[0] == 0]) # identify insignifcant features

### Perform principal component analysis to identify insignifcant regressors that contrivbute little to variance
from sklearn.decomposition import PCA 
pca = PCA() # initialise PCA with default parameters
x_pca = pca.fit_transform(x_std) # compute principal components and transforms data into new feature space
explained_variance_ratio = pca.explained_variance_ratio_  # Get explained variance ratio for each component
cumulative_explained_variance_ratio = np.cumsum(explained_variance_ratio) # sum computed variance for certain number of components
import matplotlib.pyplot as plt
plt.plot(cumulative_explained_variance_ratio) # plot cumulative explained variance ratio
plt.grid(True) # add grid to the plot
plt.show() # show graph

chosen_num_components = 25  # plateu cannot be read accurately so limit x axis to [0,25] and plot again
plt.plot(range(1, chosen_num_components + 1), cumulative_explained_variance_ratio[:chosen_num_components])
plt.grid(True) # add grid to the plot
plt.show() # plateu can be read 

chosen_num_components = 3 # there is little to no change in cumulative variance when more than 3 principal components are involved in calculations
pca = PCA(n_components = chosen_num_components) # initialise new pca with no of components = 3
x_pca = pca.fit_transform(x_std) # compute principal components and transforms data into new feature space

original_features = xvals.columns # all features
significant_feature_indices = np.where(cumulative_explained_variance_ratio <= 0.95)[0] # identify festures that are responsible for 95 oercent of variance
significant_features_3 = list(original_features[significant_feature_indices])
insignificant_feature_indices = np.where(cumulative_explained_variance_ratio > 0.95)[0]
insignificant_features_3 = list(original_features[insignificant_feature_indices])

### Perform correlation analysis to identify insignifcant regressors that are are highly correlated
correlation_matrix = xvals.corr() # find correlation coefficients between each pair of variables
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool)) # mask to consider only the upper triangular matrix and ignore correlation between the same variable. Output is true and false
correlation_matrix = correlation_matrix.mask(mask) # apply mask to correlation matrix
significant_features_4 = list(xvals.columns)
insignificant_features_4 = []
 
for i in range(len(correlation_matrix.columns)): # Loop through the columnns
    for j in range(i): # Loop through the rows
        if abs(correlation_matrix.iloc[i, j]) > 0.8:  # Adjust the threshold as needed. Set 0.8 here
            colname_i = correlation_matrix.columns[i] # first compared feature
            colname_j = correlation_matrix.columns[j] # second compared feature
            if colname_i in significant_features_4:
                significant_features_4.remove(colname_i) # update the significant features
            if colname_i not in insignificant_features_4:
                insignificant_features_4.append(colname_i) # update the siugnifcant features

### Decide which features to remove

### check lengths of features to be removed
removables = {}
for feature in insignificant_features_1:
    if feature not in removables:
        removables[feature] = 0
    removables[feature] += 1

for feature in insignificant_features_2:
    if feature not in removables:
        removables[feature] = 0
    removables[feature] += 1

for feature in insignificant_features_3:
    if feature not in removables:
        removables[feature] = 0
    removables[feature] += 1

for feature in insignificant_features_4:
    if feature not in removables:
        removables[feature] = 0
    removables[feature] += 1

len(xvals.columns) # no of features in\ dataset 

drops_2 = []
for feature in removables:
    if removables[feature] >= 2:
        drops_2.append(feature)
len(drops_2) # 349 variables removed # 1 variables preserved

drops_3 = []
for feature in removables:
    if removables[feature] >= 3:
        drops_3.append(feature)
len(drops_3) # 333 variables removed # 17 variables preserved

drops_4 = []
for feature in removables:
    if removables[feature] >= 4:
        drops_4.append(feature)
len(drops_4) # 0 variables remnoved # 350 variables preserved 

### drops_3 selected to prevent underfitting or overfitting of the model
xvals_filtered = xvals.drop(columns=drops_3)
xvals_filtered.shape

# xvals_filtered to be used as dependent variables used to build the model
# yvals to be used as the response variable used to build the model




(2193, 351)
0                  healthy
1                  healthy
2                  healthy
3                  healthy
4                  healthy
               ...        
2188    early stage cancer
2189    early stage cancer
2190    early stage cancer
2191    early stage cancer
2192    early stage cancer
Name: class_label, Length: 2193, dtype: object
class_label
early stage cancer        781
screening stage cancer    490
mid stage cancer          453
late stage cancer         409
healthy                    60
Name: count, dtype: int64
Shape of filtered data: (550, 351)
[0 1]


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix

Technique to remove regressors:

Perform logistic regression to identify insignifcant regressors based on p value