In [27]:
import numpy as np
import os
import pandas as pd
import os
import glob
import numpy as np
import importlib
from matplotlib import colors
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

def fit_line(x, y):
    """Fit a straight line ``y = m*x + c`` to the data by ordinary least squares.

    Inspired by the example in the NumPy manual:
    https://numpy.org/doc/stable/reference/generated/numpy.linalg.lstsq.html

    Args:
        x: One-dimensional array-like of independent values (pandas Series,
            NumPy array, list, tuple, ...).
        y: One-dimensional array-like of dependent values, same length as ``x``.

    Returns:
        A ``(m, c)`` tuple: the slope and the intercept of the fitted line.
    """
    # np.asarray accepts pandas Series as well as plain sequences, so callers
    # are not forced to wrap their data in a Series first.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Design matrix: first column is x, second column is all ones so that the
    # second fitted coefficient is the intercept.
    A = np.vstack([x, np.ones(len(x))]).T
    m, c = np.linalg.lstsq(A, y, rcond=None)[0]

    return m, c

# Factor used to turn the 'z' column into a velocity in km/s (v = c * z).
# This is the speed of light truncated to an integer, kept as in the original
# analysis so the velocity cut below selects the same rows.
SPEED_OF_LIGHT_KM_S = 299792
MAX_DISTANCE_MPC = 100      # keep only galaxies closer than this distance
MAX_VELOCITY_KM_S = 15000   # keep only galaxies slower than this velocity

data = pd.read_csv("ned-d-complete.csv")

# Drop rows that are missing either quantity used below. The result must be
# assigned back: dropna() returns a new frame and leaves the original alone.
# Restricting to these two columns keeps the selection identical to the
# comparisons below, which already reject NaN values.
data = data.dropna(subset=['z', 'D (Mpc)'])

# A bit of filtering: positive, nearby distances and positive z only.
data = data[
    (data['D (Mpc)'] > 0)
    & (data['D (Mpc)'] < MAX_DISTANCE_MPC)
    & (data['z'] > 0)
]
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original data.
data = data.filter(['Galaxy ID', 'z', 'D (Mpc)'], axis=1).copy()

data.columns = ['Name', 'Velocity', 'MPC']

# The 'Velocity' column still holds the raw z values at this point; scale
# them into km/s.
data['Velocity'] = data['Velocity'].abs() * SPEED_OF_LIGHT_KM_S
data = data[data['Velocity'] < MAX_VELOCITY_KM_S]
print(f"Rows: {len(data)}")

# plot the data as a scatter plot
fig = px.scatter(
    x=data['MPC'],
    y=data['Velocity'],
    opacity=.5,
    labels={"x" : "Distance (MPC)", "y" : "Velocity (Km/s)"},
)

# fit a linear model
m, c = fit_line(x=data['MPC'], y=data['Velocity'])

# add the linear fit on top; a straight line only needs its two end points,
# so there is no reason to trace it through every (unsorted) data point
line_x = np.array([data['MPC'].min(), data['MPC'].max()])
fig.add_trace(
    go.Scatter(
        x=line_x,
        y=m * line_x + c,
        mode="lines",
        line=go.scatter.Line(color="red"),
        showlegend=False,
    )
)

# optionally you can show the slope and the intercept
mid_point = data['MPC'].mean()
fig.update_layout(
    showlegend=False,
    annotations=[
        go.layout.Annotation(
            x=mid_point,
            y=m * mid_point + c,
            xref="x",
            yref="y",
            # "+" in the format spec prints the intercept's own sign, so a
            # negative intercept reads "x-12.30" instead of "x+-12.3"
            text=f"{m:.4f}x{c:+.2f}",
        )
    ]
)
fig.show()

Rows: 2993
