## Linear Regression and RANSAC

In [None]:
# Load the modules we need
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model, datasets

In [None]:
# Create some initial data: a noisy regression problem with a single feature
n_samples = 1000

# random_state=0 pins the generated data set so reruns produce the same points
X, y = datasets.make_regression(n_samples=n_samples, n_features=1,
                                      n_informative=1, noise=10,
                                      random_state=0)

In [None]:
# Add some outlier data by overwriting the first n_outliers samples in place
# Seed NumPy's global generator so the outliers are the same on every run
np.random.seed(0)
n_outliers = 50
# Outlier inputs: standard-normal draws scaled by 0.5 and shifted by 3
X[:n_outliers] = 3 + 0.5 * np.random.normal(size=(n_outliers, 1))
# Outlier responses: standard-normal draws scaled by 10 and shifted by -3
y[:n_outliers] = -3 + 10 * np.random.normal(size=n_outliers)

In [None]:
# Create some test positions for the lines we will draw:
# unit-spaced x values starting at the smallest input and stopping before the
# largest one, arranged as a single column
line_X = np.arange(X.min(), X.max()).reshape(-1, 1)

In [None]:
class LinearRegression(object):
    def __init__(self):
        self.m = None
        self.c = None
    def fit(self, X, Y):
        X = X.flatten()
        #calculate all elements of the system of linear equations that has to be solved
        #the solution minimizes the least squares objective function
        alpha = np.dot(X, Y)
        beta = np.dot(X, X)
        gamma = np.sum(X)
        delta = np.sum(Y)
        N = len(X)
        #coefficient matrix M
        M = np.array([[beta, gamma],
                      [gamma, N]])
        #right hand side of system of linear equation
        y = np.array([alpha, delta])
        #solution consisting of slope m and offset c 
        sol = np.linalg.solve(M, y)
        self.m, self.c = sol
    def predict(self, X):
        X = X.flatten()
        y = self.m*X + self.c
        return y
        

In [None]:
# This is the cell you need to replace to complete Task 2.
# Fit line using all data.
lr = LinearRegression()
lr.fit(X, y)
# Evaluate the fitted line at the test positions; line_y is drawn by the plotting cell
line_y = lr.predict(line_X)
print(line_y)

In [None]:
class RANSACRegressor:
    """Robust straight-line fit ``y = m * x + c`` using RANSAC.

    Each trial draws two samples, builds the line through them and counts how
    many samples lie within ``thresh`` of that line. The line with the most
    inliers is kept.

    Args:
        thresh: Largest perpendicular distance to the line for a sample to
            count as an inlier.
        max_trials: Number of random two-point trials to run.

    Attributes:
        m: Slope of the best line, ``None`` until ``fit`` succeeds.
        c: Offset of the best line, ``None`` until ``fit`` succeeds.
        inlier_mask_: Boolean array marking the inliers of the best line.
        outlier_mask_: Boolean array, the complement of ``inlier_mask_``.
    """

    def __init__(self, thresh, max_trials=100):
        self.max_trials = max_trials
        self.thresh = thresh
        self.m = None
        self.c = None
        self.inlier_mask_ = None
        self.outlier_mask_ = None

    def dist_from_line(self, m, c, X, Y):
        """Return the shortest distance from each point ``(X, Y)`` to the line
        with slope ``m`` and offset ``c``."""
        return np.sqrt((Y - (m * X + c)) ** 2 / (1 + m ** 2))

    def fit(self, X, Y):
        """Search for the line supported by the largest number of samples.

        Sampling uses NumPy's global random state, so seed it with
        ``np.random.seed`` beforehand for reproducible results.

        Args:
            X: Sample positions. Any array-like; it is flattened to 1-D.
            Y: Target values. Any array-like with the same number of elements
                as ``X``; it is flattened to 1-D.

        Returns:
            This instance, so calls can be chained.

        Raises:
            ValueError: If ``X`` and ``Y`` differ in size, if fewer than two
                samples are given, or if no trial produced a line with at
                least one inlier.
        """
        x = np.asarray(X, dtype=float).ravel()
        y = np.asarray(Y, dtype=float).ravel()
        if x.size != y.size:
            raise ValueError(
                "X and Y must contain the same number of samples "
                "(got %d and %d)" % (x.size, y.size)
            )
        if x.size < 2:
            raise ValueError("RANSAC needs at least two samples to draw a line")

        best_count = 0
        best_m = best_c = best_mask = None
        for _ in range(self.max_trials):
            # choose two points at random
            first, second = np.random.randint(0, x.size, size=2)
            dx = x[second] - x[first]
            if dx == 0:
                # Same sample drawn twice, or a vertical pair: the slope is
                # undefined, so this trial cannot produce a usable line.
                continue
            # slope and offset of the line through both points
            m = (y[second] - y[first]) / dx
            c = y[first] - m * x[first]
            # samples no further than the threshold from the line are inliers
            mask = self.dist_from_line(m, c, x, y) <= self.thresh
            count = int(np.sum(mask))
            if count > best_count:
                best_count, best_m, best_c, best_mask = count, m, c, mask

        if best_mask is None:
            raise ValueError(
                "RANSAC could not find a valid line in %d trials"
                % self.max_trials
            )

        self.m, self.c = best_m, best_c
        self.inlier_mask_ = best_mask
        # Exact complement, so every sample is either an inlier or an outlier.
        self.outlier_mask_ = np.logical_not(best_mask)
        return self

    def predict(self, X):
        """Evaluate the best line at the given positions.

        Args:
            X: Positions to evaluate. Any array-like; it is flattened to 1-D.

        Returns:
            1-D array with ``m * x + c`` for every element of ``X``.
        """
        x = np.asarray(X, dtype=float).ravel()
        return self.m * x + self.c

In [None]:
# This is the cell you need to replace to complete Task 3.
# Robustly fit linear model with RANSAC algorithm
ransac = RANSACRegressor(thresh=0.5)
ransac.fit(X, y)

# Predict data of estimated models
line_y_ransac = ransac.predict(line_X)
print(line_y_ransac)

# Label whether each source data point is an inlier or an outlier
inlier_mask = ransac.inlier_mask_
outlier_mask = ~inlier_mask

In [None]:
# Line width shared by both fitted lines
lw = 2

# Switches selecting which fits are drawn
doing_linear = True
doing_ransac = True

if doing_linear:
    plt.plot(line_X, line_y, color='navy', linewidth=lw, label='Linear regressor')

if not doing_ransac:
    # Without RANSAC there is no inlier/outlier split, so draw all data alike
    plt.scatter(X, y, color='yellowgreen', marker='.', label='Data')
else:
    plt.scatter(X[inlier_mask], y[inlier_mask], color='yellowgreen', marker='.', label='Inliers')
    plt.scatter(X[outlier_mask], y[outlier_mask], color='gold', marker='.', label='Outliers')
    plt.plot(line_X, line_y_ransac, color='cornflowerblue', linewidth=lw, label='RANSAC regressor')

plt.legend(loc='lower right')
plt.xlabel("Input")
plt.ylabel("Response")
plt.show()