<a href="https://colab.research.google.com/github/drskprabhakar/Axial-Length-Prediction-ML-Regression-mode/blob/main/Axial_Length_Prediction_ML_Regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

### Loading dataset from google drive

In [2]:
df=pd.read_excel('/content/drive/MyDrive/Axial length prediction.xlsx')

In [None]:
df.shape

### Exploring the dataset

In [None]:
# find first 5 rows and columns
df.head()

In [None]:
# find last 5 rows and columns
df.tail()

In [None]:
# find descriptive statistics
df.describe()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
# plotting scatter plot for visualising data distribution of corneal curvatures before outliers removal
plt.scatter(df.KH, df.KV, df.AL)
plt.xlabel('Corneal curvatures(Diopter)')
plt.ylabel('Axial length(millimeter)')
plt.title('Dataset scatter plot')

In [None]:
# creating Box and Whiskers plots for corneal curvatures
df.boxplot(['KH','KV'])

In [None]:
# creating Box and Whiskers plots for axial length
df.boxplot(['AL'])

### Outliers detection and removal

In [None]:
Q1 = df.AL.quantile(0.25)
Q2 = df.AL.quantile(0.50)
Q3 = df.AL.quantile(0.75)
Q1, Q2,Q3

In [None]:
IQR = Q3-Q1
IQR

In [None]:
lower_limit = Q1-1.5*IQR
upper_limit = Q3+1.5*IQR
lower_limit, upper_limit

In [None]:
df[(df.AL<lower_limit)|(df.AL>upper_limit)]

In [None]:
df_no_outliers = df[(df.AL>lower_limit)&(df.AL<upper_limit)]
df_no_outliers

In [None]:
df_no_outliers.describe()

In [None]:
# plotting scatter plot for visualising data distribution of corneal curvatures after outliers removal
plt.scatter(df_no_outliers.KH, df_no_outliers.KV, df_no_outliers.AL)
plt.xlabel('Corneal curvatures(Diopter)')
plt.ylabel('Axial length(millimeter)')
plt.title('Dataset scatter plot')

In [None]:
df_no_outliers.boxplot('AL')

In [None]:
df_no_outliers.describe()

### Heat map construction for correlation matrix

In [None]:
# constucting heat map for correlation matrix
sns.heatmap(df_no_outliers.corr())

In [None]:
df_no_outliers.corr()

In [None]:
plt.figure(figsize = (6, 4))
x = df.KH
plt.hist(x, bins = 10, color = "blue")
plt.title("Horizontal corneal curvature data distribution")
plt.xlabel("KH in Diopter")
plt.ylabel("Frequency")
plt.show()

In [None]:
plt.figure(figsize = (6, 4))
x = df.KV
plt.hist(x, bins = 10, color = "blue")
plt.title("Vertical corneal curvature data distribution")
plt.xlabel("KV in Diopter")
plt.ylabel("Frequency")
plt.show()


In [None]:
plt.figure(figsize = (6, 4))
x = df.AL
plt.hist(x, bins = 10, color = "red")
plt.title("Axial length data distribution")
plt.xlabel("AL in Millimeter")
plt.ylabel("Frequency")
plt.show()

### Defining X and y as independant and dependant variable

In [11]:
X = df_no_outliers.drop(['AL'], axis=1).values
y = df_no_outliers['AL'].values

In [12]:
X=df.drop(['AL'], axis=1).values
y=df['AL'].values

### Spliting the dataset into training and testing set

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_true = train_test_split(X,y,test_size=0.3, random_state=42)

### Fitting the models for training by Linear reegression, Support vector machine algorithm, Decision tree algorithm and Random forest algorithm

In [14]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train,y_train)

In [25]:
from os import supports_bytes_environ
from sklearn.svm import SVR
supportv_model = SVR()
supportv_model.fit(X_train, y_train)

In [36]:
from sklearn.tree import DecisionTreeRegressor
decisiont_model = DecisionTreeRegressor()
decisiont_model.fit(X_train, y_train)

In [27]:
from sklearn.ensemble import RandomForestRegressor
randomf_model = RandomForestRegressor()
randomf_model.fit(X_train, y_train)

### Models prediction on Testing set

In [None]:
# linear regression model predictions on testing set
actual = y_true
predicted = linear_model.predict(X_test)
predicted

In [None]:
# Support vector machine regression model predictions on testing set
actual = y_true
predicted = supportv_model.predict(X_test)
predicted

In [None]:
# Decision tree regression model predictions on testing set
actual = y_true
predicted = decisiont_model.predict(X_test)
predicted

In [None]:
# Random forest regression model predictions on testing set
actual = y_true
predicted = randomf_model.predict(X_test)
predicted

## Evaluate the model for the Accuracy

In [62]:
# accuracy of the linear regression model
from sklearn.metrics import r2_score
r2_score(actual, predicted)

0.9774846109126626

In [63]:
# Accuracy of support vector regression model
actual = y_true
predicted = supportv_model.predict(X_test)
r2_score(actual, predicted)

0.9774846109126626

In [64]:
# Accuracy of Decision tree regression model
actual = y_true
predicted = decisiont_model.predict(X_test)
r2_score(actual, predicted)

0.9885010609156307

In [65]:
# Accuracy of random forest regression model
actual = y_true
predicted = randomf_model.predict(X_test)
r2_score(actual, predicted)

0.9914351054817864

### Deducing coefficient and intercept for multiple linear regression equation

In [61]:
print(linear_model.coef_), print(linear_model.intercept_)

[0.37822037 0.10207555]
1.2778288750639


(None, None)

In [58]:
from sklearn import metrics
metrics.mean_absolute_error(actual, predicted)

0.07502324681717798

In [59]:
metrics.mean_squared_error(actual, predicted)

0.014728579262149736

In [60]:
np.sqrt(metrics.mean_squared_error(actual, predicted))

0.12136135819176438

### Cross validation of each model on one example from the original dataset

In [32]:
# Cross validation of linear regression model
linear_model.predict([[47.25,	47.0]])

array([23.94629229])

In [33]:
# Cross validation of support vector machine regression model
supportv_model.predict([[47.25,	47.0]])

array([23.83443495])

In [39]:
# Cross validation of Decision tree regression model
decisiont_model.predict([[47.25,	47.0]])

array([23.9])

In [34]:
# Cross validation of random forest regression model
randomf_model.predict([[47.25,	47.0]])

array([23.87356])

In [None]:
df_no_outliers.tail()

## Plot the results

In [None]:
plt.figure(figsize=(6,5))
plt.scatter(actual,predicted)
plt.xlabel('Actual axial length')
plt.ylabel('Predicted axial length')
plt.title('Actual vs Predicted axial length')

In [None]:
plt.figure(figsize = (6, 4))
plt.hist(actual-predicted, bins=10, color = "green")
plt.title("Residuals (actual-predicted axial length values")
plt.xlabel("Difference values")
plt.ylabel("Frequency")
plt.show()


### Predictive model equation
y=m1X1+m2X3+b

Predicted AL= 0.38*KH+0.10KV+1.28

