In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

## Data gathering and Train-Test-split

In [2]:
## Read the beam data set from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="beam_data_1.csv")

In [3]:
## Preview the first rows of the data set
df.head()

Unnamed: 0,Elastic_Modulus (GPa),Load (kN),Length_of_Beam (m),X-sectional Width b (mm),X-sectional Height h (mm),Max Deflection (mm)
0,151,2,11,58,95,88.62827
1,158,3,7,61,88,39.16701
2,158,4,5,36,91,29.1626
3,141,1,15,31,84,325.683
4,177,4,13,62,106,168.0922


In [4]:
## Dimensions of the data set as (rows, columns)
df.shape

(10000, 6)

In [5]:
## The shape attribute is a plain tuple
type(df.shape)

tuple

In [6]:
## Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Elastic_Modulus (GPa)      10000 non-null  int64  
 1   Load (kN)                  10000 non-null  int64  
 2   Length_of_Beam (m)         10000 non-null  int64  
 3   X-sectional Width b (mm)   10000 non-null  int64  
 4   X-sectional Height h (mm)  10000 non-null  int64  
 5   Max Deflection (mm)        10000 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 468.9 KB


In [7]:
## Number of missing values in each column
df.isnull().sum()

Elastic_Modulus (GPa)        0
Load (kN)                    0
Length_of_Beam (m)           0
X-sectional Width b (mm)     0
X-sectional Height h (mm)    0
Max Deflection (mm)          0
dtype: int64

In [8]:
## Split the frame into the input features (X) and the target (y).
## y is selected with a list of one column so it stays a one-column DataFrame.

X = df.drop(columns=["Max Deflection (mm)"])
y = df.loc[:, ["Max Deflection (mm)"]]

In [9]:
## First rows of the input features
X.head()

Unnamed: 0,Elastic_Modulus (GPa),Load (kN),Length_of_Beam (m),X-sectional Width b (mm),X-sectional Height h (mm)
0,151,2,11,58,95
1,158,3,7,61,88
2,158,4,5,36,91
3,141,1,15,31,84
4,177,4,13,62,106


In [10]:
## First rows of the target column
y.head()

Unnamed: 0,Max Deflection (mm)
0,88.62827
1,39.16701
2,29.1626
3,325.683
4,168.0922


In [11]:
## Gathering data for training and testing

## Sequential 75:25 split: the first 75% of the rows (in file order, no
## shuffling) are used for training and the remaining rows for testing.
## With the 10000-row data set this gives 7500 training and 2500 testing rows.

## The boundary is derived from the number of rows instead of being hard-coded,
## so the cell stays correct if the data set size changes.
train_percent = 75
split_index = len(X) * train_percent // 100

train_X = X.iloc[:split_index, :]
test_X = X.iloc[split_index:, :]

train_y = y.iloc[:split_index, :]
test_y = y.iloc[split_index:, :]

In [12]:
## Shape of the training features
train_X.shape

(7500, 5)

In [13]:
## Shape of the training target; the row count should match train_X
train_y.shape

(7500, 1)

In [14]:
## Shape of the testing features
test_X.shape

(2500, 5)

In [15]:
## Shape of the testing target; the row count should match test_X
test_y.shape

(2500, 1)

In [16]:
## Convert each split from a DataFrame to a NumPy array

train_X_arr, train_y_arr, test_X_arr, test_y_arr = (
    split.to_numpy() for split in (train_X, train_y, test_X, test_y)
)

In [17]:
## X and y themselves are still DataFrames; only the *_arr variables are NumPy arrays
print(type(X),type(y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [18]:
## Shape of the training feature array
train_X_arr.shape

(7500, 5)

In [19]:
## Shape of the training target array (2-D, one column)
train_y_arr.shape

(7500, 1)

In [20]:
## Shape of the testing feature array
test_X_arr.shape

(2500, 5)

In [21]:
## Shape of the testing target array (2-D, one column)
test_y_arr.shape

(2500, 1)

In [22]:
## Load the Advertising data set and preview its first rows.
## NOTE(review): `df2` is a non-descriptive name; consider renaming once all of its uses are known.
df2 = pd.read_csv("Advertising.csv")
df2.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [23]:
## Function for loading the data and train test split

def data_preprocessing(csv_file,train_size,test_size,The_target_variable):
    """Load a CSV file and split it, in file order, into train/test NumPy arrays.

    The first ``train_size`` percent of the rows become the training split and
    every remaining row becomes the testing split. Rows are not shuffled.

    Parameters
    ----------
    csv_file : str or path-like
        Location of the CSV file; passed to ``pd.read_csv``.
    train_size : int or float
        Percentage (0-100) of the rows used for training.
    test_size : int or float
        Percentage of the rows intended for testing. It does not influence the
        split: the testing set is always the rows left after the training
        rows, so it only matches this value when
        ``train_size + test_size == 100``.
    The_target_variable : str
        Name of the column to predict.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_X_arr, train_y_arr, test_X_arr, test_y_arr)``. The target
        arrays are 2-D with a single column, and each target array has the
        same number of rows as its feature array.
    """

    # load the data using pandas
    df = pd.read_csv(csv_file)

    print(f"The shape of your dataset is : {df.shape}")

    total_no_of_rows = df.shape[0]

    # Input features (independent variables) and the target (dependent variable).
    # The double brackets keep y as a one-column DataFrame.
    X = df.drop(The_target_variable,axis = 1)
    y = df[[The_target_variable]]

    # Train/test split. A single boundary is used for both features and
    # targets so that row i of X always lines up with row i of y.
    split_index = int(total_no_of_rows*(train_size/100))

    train_X = X.iloc[:split_index,:]
    test_X = X.iloc[split_index:,:]

    train_y = y.iloc[:split_index,:]
    test_y = y.iloc[split_index:,:]

    print(f"The shape of your traing data set is: {train_X.shape}")
    print(f"The shape of your testing data set is : {test_X.shape}")

    # Convert the pandas DataFrames to NumPy arrays for model building
    train_X_arr = train_X.to_numpy()
    train_y_arr = train_y.to_numpy()
    test_X_arr = test_X.to_numpy()
    test_y_arr = test_y.to_numpy()

    print("your dataframe has been splitted to a traing and testing dataset as well as it has been converted to numpy array for furthe preprocessing")

    return train_X_arr, train_y_arr, test_X_arr, test_y_arr

In [24]:
## Load and split the Advertising data set (75% train / 25% test, target column "sales")
train_X_arr_1, train_y_arr_1, test_X_arr_1, test_y_arr_1 = data_preprocessing(
    csv_file="Advertising.csv",
    train_size=75,
    test_size=25,
    The_target_variable="sales",
)

The shape of your dataset is : (200, 4)
The shape of your traing data set is: (150, 3)
The shape of your testing data set is : (50, 3)
your dataframe has been splitted to a traing and testing dataset as well as it has been converted to numpy array for furthe preprocessing


## Model Building

In [25]:
## Using the basis function --- phi(x) = x (Linear basis function)

In [26]:
# def linear_regression_1(X_given,y,learning_rate,eps):
    
#     X = np.insert(X_given, 0, 1, axis=1)
    
#     # calculate shape of your feature matrix(X)
    
#     feature_matrix_shape = X.shape
#     no_of_rows_in_feature_matrix = feature_matrix_shape[0]
#     no_of_columns_in_feature_matrix = feature_matrix_shape[1]
    
#     ## Intialise a weight matrix with a shape (no_of_rows_in_feature_matrix by 1)
    
#     curr_weights = np.ones((no_of_columns_in_feature_matrix,1),dtype = float)
    
#     error = 100.0
    
#     while np.any(error>eps):
        
#         for i in range (1):
            
#             operation_matrix_scalar = np.dot((np.transpose(curr_weights)),X[i,:]) - y[i,:]
#             X[i,:] = X[i,:] + operation_matrix_scalar*X[i,:]
        
#         next_weights = curr_weights - learning_rate*X[i,:]
        
#         er = next_weights - curr_weights
        
#         error = np.abs(er[0,:])
        
        
        
#     print(error)
#     return curr_weights
        
    
    
    


In [27]:
def computing_cost_fn(feature_matrix, target_var, weight_matrix):
    """Return the halved mean squared error of a linear model with a bias term.

    A column of ones is prepended to ``feature_matrix`` so that the first
    weight acts as the intercept. The cost is
    ``sum((X @ w - y) ** 2) / (2 * n_samples)``.

    Parameters
    ----------
    feature_matrix : array-like, shape (n_samples, n_features)
        Input features without the bias column.
    target_var : array-like, shape (n_samples, 1) or (n_samples,)
        Target values. If several columns are given only the first is used.
    weight_matrix : array-like, shape (n_features + 1, 1) or (n_features + 1,)
        Model weights, bias weight first. If several columns are given only
        the first is used.

    Returns
    -------
    numpy.float64
        The scalar cost.

    Raises
    ------
    ZeroDivisionError
        If ``feature_matrix`` has no rows.
    """
    features = np.asarray(feature_matrix, dtype=float)
    no_of_samples = features.shape[0]

    # Prepend the bias column so weight 0 is the intercept.
    X = np.hstack((np.ones((no_of_samples, 1)), features))

    # Normalise targets and weights to column vectors so that 1-D inputs
    # work and the subtraction below cannot broadcast to an (n, n) matrix.
    targets = np.asarray(target_var, dtype=float)
    if targets.ndim == 1:
        targets = targets[:, np.newaxis]

    weights = np.asarray(weight_matrix, dtype=float)
    if weights.ndim == 1:
        weights = weights[:, np.newaxis]

    residuals = X @ weights[:, :1] - targets[:, :1]
    total_squared_error = float(np.sum(residuals ** 2))

    # Divide as plain Python floats so an empty data set raises
    # ZeroDivisionError instead of silently producing NaN.
    return np.float64(total_squared_error / (2.0 * no_of_samples))

In [28]:
## Initial weights: one per feature plus one for the bias, as a column vector filled with 0.6.
## Alternative start: np.zeros((train_X_arr.shape[1] + 1, train_y_arr.shape[1]))
weights = np.full((train_X_arr.shape[1] + 1, 1), 0.6)

In [29]:
## Inspect the initial weight vector
weights

array([[0.6],
       [0.6],
       [0.6],
       [0.6],
       [0.6],
       [0.6]])

In [30]:
## (number of features + 1 bias weight, 1)
weights.shape

(6, 1)

In [31]:
## First training target value (row 0, column 0)
train_y_arr[0][0]

88.62827

In [32]:
## Cost of the training data under the current weights
computing_cost_fn(train_X_arr,train_y_arr,weights)

23137.87005505771

In [33]:
## NOTE(review): repeats the previous cell's call with identical arguments - confirm it can be removed
computing_cost_fn(train_X_arr,train_y_arr,weights)

23137.87005505771

In [34]:
## NOTE(review): bare numeric literal with no effect on notebook state; its origin is not shown here - confirm it can be removed
38285.52409818143

38285.52409818143

In [35]:
def gradient_descent(feature_matrix, target_var, weight_matrix,learning_rate,max_iter):
    """Fit linear-regression weights with batch gradient descent.

    A column of ones is prepended to ``feature_matrix`` so that the first
    weight acts as the intercept. Each iteration records the halved mean
    squared error of the current weights and then takes one gradient step.
    The loop stops early, printing ``cost convergence``, once two consecutive
    recorded costs differ by less than ``1e-4``.

    The caller's ``weight_matrix`` is never modified; training runs on a copy.

    Parameters
    ----------
    feature_matrix : array-like, shape (n_samples, n_features)
        Input features without the bias column.
    target_var : array-like, shape (n_samples, 1) or (n_samples,)
        Target values. If several columns are given only the first is used.
    weight_matrix : array-like, shape (n_features + 1, 1) or (n_features + 1,)
        Starting weights, bias weight first.
    learning_rate : float
        Step size applied to the averaged gradient.
    max_iter : int
        Maximum number of iterations.

    Returns
    -------
    cost : numpy.ndarray, shape (max_iter, 1)
        Cost recorded before each update. Entries for iterations that were
        skipped because of early stopping remain zero.
    weights : numpy.ndarray, shape (n_features + 1, 1)
        The fitted weights.

    Raises
    ------
    ZeroDivisionError
        If ``feature_matrix`` has no rows and ``max_iter`` is positive.
    """
    features = np.asarray(feature_matrix, dtype=float)
    no_of_samples = features.shape[0]

    # The bias column is built once; it does not change between iterations.
    X = np.hstack((np.ones((no_of_samples, 1)), features))

    targets = np.asarray(target_var, dtype=float)
    if targets.ndim == 1:
        targets = targets[:, np.newaxis]
    targets = targets[:, :1]

    # np.array copies, so the updates below never touch the caller's array.
    weights = np.array(weight_matrix, dtype=float)
    if weights.ndim == 1:
        weights = weights[:, np.newaxis]

    cost = np.zeros((max_iter, 1))

    for iteration in range(max_iter):

        # The residuals feed both the cost and the gradient, so one pass over
        # the data serves both.
        residuals = X @ weights - targets

        cost[iteration, 0] = float(np.sum(residuals ** 2)) / (2.0 * no_of_samples)

        gradient = X.T @ residuals

        weights = weights - learning_rate * gradient / no_of_samples

        if iteration > 0 and abs(cost[iteration - 1, 0] - cost[iteration, 0]) < 1e-4:

            print("cost convergence")

            break

    return cost , weights
    

In [36]:
# def gradient_descent_1(feature_matrix, target_var, weight_matrix,learning_rate,max_iter):
    
#     iteration = 0
    
#     cost = np.zeros(((max_iter),1))
    
#     ones_column = np.ones((feature_matrix.shape[0], 1))
    
#     X = np.hstack((ones_column, feature_matrix))
    
#     no_of_samples = X.shape[0]
    
    
#     while iteration < max_iter:
        
#         gradient = np.zeros((X.shape[1],))
        
#         cost[iteration] = computing_cost_fn(feature_matrix, target_var, weight_matrix)
        
#         for x,y in zip(X, target_var):
            
#             y_hat = np.dot(x,weight_matrix)
            
#             gradient = gradient + ((y_hat - y) * x)  # gradient calculation
               
#             weight_matrix = weight_matrix - (learning_rate * (gradient/no_of_samples)) # updation of weight matrix
        
#         iteration = iteration + 1
        
        
#     return cost , weight_matrix 

In [37]:
# gradient_descent(train_X_arr,train_y_arr,weights,0.00001,100)

In [38]:
# gradient_descent(train_X_arr,train_y_arr,weights,0.00001,50000)

In [39]:
## NOTE(review): scratch difference of two hard-coded values; their origin is not shown here - confirm it can be removed
8283.90587649 - 8283.97519692

-0.06932043000051635

In [40]:
def linear_Regression(testing_features,testing_target_var,feature_matrix, target_var, weight_matrix,learning_rate,max_iter):
    """Train a linear model with ``gradient_descent`` and predict on test features.

    Parameters
    ----------
    testing_features : numpy.ndarray, shape (n_test_samples, n_features)
        Features to predict on, without the bias column.
    testing_target_var : array-like
        Test targets. Kept in the signature for callers; not used when
        computing the predictions.
    feature_matrix : array-like, shape (n_train_samples, n_features)
        Training features, without the bias column.
    target_var : array-like
        Training targets.
    weight_matrix : array-like, shape (n_features + 1, 1)
        Starting weights, bias weight first. A copy is handed to the
        optimiser, so the caller's array is left unchanged.
    learning_rate : float
        Step size passed to ``gradient_descent``.
    max_iter : int
        Iteration limit passed to ``gradient_descent``.

    Returns
    -------
    numpy.ndarray
        Predictions, one row per row of ``testing_features``.
    """

    # Train on a private copy so repeated calls always start from the
    # weights the caller actually passed in.
    initial_weights = np.array(weight_matrix, dtype=float)

    # The cost history returned by the optimiser is not needed for prediction.
    _, updated_weight_matrix = gradient_descent(feature_matrix, target_var, initial_weights,learning_rate,max_iter)

    # Prepend the bias column so the test matrix lines up with the weights.
    ones_column = np.ones((testing_features.shape[0], 1))
    X_test = np.hstack((ones_column, testing_features))

    return np.dot(X_test,updated_weight_matrix)
    
    

In [41]:
## Train on the beam training split and predict the test split
y_pred = linear_Regression(
    testing_features=test_X_arr,
    testing_target_var=test_y_arr,
    feature_matrix=train_X_arr,
    target_var=train_y_arr,
    weight_matrix=weights,
    learning_rate=0.00001,
    max_iter=50000,
)

In [42]:
## Predicted values for the test split
y_pred

array([[246.06297645],
       [230.44701615],
       [268.18958792],
       ...,
       [238.0806344 ],
       [257.05300785],
       [259.09440939]])

In [43]:
## Actual test targets, for comparison with y_pred
test_y_arr

array([[ 73.27115],
       [176.3991 ],
       [180.02786],
       ...,
       [127.56236],
       [169.53879],
       [218.56023]])

In [44]:
# y_pred