# 2A

### Importing the Libraries and the Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [None]:
# Read the Excel workbook './audi.xlsx' (path relative to the notebook) into a DataFrame.
df = pd.read_excel('./audi.xlsx')

### Data Understanding and Visualisation

In [None]:
# Preview the first rows as a quick sanity check of the load.
df.head()

In [None]:
# Histogram grid of the frame's columns: 50 bins each, arranged on a 4x4 layout.
df.hist(figsize=(20,10), grid = False, layout=(4,4), bins = 50)

In [None]:
# Pairwise scatter plots of the frame's columns (each panel 2 inches high).
sns.pairplot(df, height = 2)
plt.show()

#### X becomes our feature space for the dataset and Y becomes our target feature

In [None]:
# Columns used as inputs to PCA.
feature_columns = ['mileage', 'tax', 'mpg', 'engineSize']
X = df[feature_columns]
# Double brackets keep the target as a one-column DataFrame rather than a Series.
Y = df[['price']]

In [None]:
# (number of samples, number of selected features)
X.shape

In [None]:
# Preview the selected feature columns.
X.head()

### Data Standardization

In [None]:
def mean(x): # np.mean(x, axis = 0)
    """Return the column-wise mean of ``x`` (reduction along axis 0).

    Uses the argument ``x``; the previous version silently read the
    notebook-global ``X`` and ignored whatever was passed in.
    """
    return np.mean(x, axis = 0)

def std(x): # np.std(x, axis = 0)
    """Return the column-wise standard deviation of ``x`` (reduction along axis 0).

    Like ``mean``, this now operates on its own argument rather than the global ``X``.
    """
    return np.std(x, axis = 0)

def Standardize_data(X):
    """Z-score ``X`` column by column: subtract each column's mean, divide by its std.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)

    Returns
    -------
    Same container type as ``X - mean(X)``, with every column centred on 0.
    """
    return (X - mean(X))/std(X)

# Standardised copy of the feature matrix; the covariance/PCA cells below work on this.
X_std = Standardize_data(X)

### Finding the Covariance Matrix

In [None]:
def covariance(x):
    """Covariance matrix between the columns of ``x``.

    ``x`` holds one observation per row; it is transposed before the call
    because ``np.cov`` treats each row of its input as a variable.
    """
    variables_as_rows = np.transpose(x)
    return np.cov(variables_as_rows)

# Covariance matrix of the standardised features; the bare name on the last line displays it.
cov_mat = covariance(X_std)
cov_mat

### The Eigen Values and the Eigen Vectors of the Covariance Matrix

In [None]:
from numpy.linalg import eig

# A covariance matrix is symmetric, so use eigh: it is the solver meant for
# symmetric/Hermitian input and always returns real eigenvalues, whereas the
# general-purpose eig can return complex values from round-off.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)
# Reorder so the largest eigenvalue comes first. Eigenvectors are the COLUMNS
# of eig_vecs, so the columns are permuted to stay paired with eig_vals.
sorted_indices = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sorted_indices]
eig_vecs = eig_vecs[:, sorted_indices]
print('Eigenvalues \n', eig_vals)
print('Eigenvectors \n', eig_vecs)

### Choose the Principal Components
- By changing the value of `k` we can get the top `k` principal components

In [None]:
# Select top k eigenvectors
k = 2
# NumPy returns eigenvectors as the COLUMNS of eig_vecs (column i pairs with
# eig_vals[i]), so the top-k components are the first k columns. Transposing
# gives one component per row, shape (k, n_features), which is what the
# projection `X_std.dot(W.T)` expects.
W = eig_vecs[:, :k].T # Projection matrix

In [None]:
# Share of the total variance carried by each principal component.
eig_vals_total = np.sum(eig_vals)
explained_variance_exact = eig_vals / eig_vals_total
# Accumulate the exact ratios and round only for display; rounding first and
# summing afterwards lets the rounding errors pile up (totals of 0.99 / 1.01).
cum_explained_variance = np.round(np.cumsum(explained_variance_exact), 2)
explained_variance = np.round(explained_variance_exact, 2)

print('Explained variance: {}'.format(explained_variance))
print('Cumulative explained variance: {}'.format(cum_explained_variance))

# Components are numbered from 1 on the x-axis.
component_numbers = np.arange(1, X.shape[1] + 1)
plt.plot(component_numbers, cum_explained_variance, '-o')
plt.xticks(component_numbers)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance');
plt.show()



### Project the Data

In [None]:
# Project the standardised samples onto the selected components:
# (n_samples, n_features) x (n_features, k) -> (n_samples, k).
projection_basis = W.T
X_proj = X_std.dot(projection_basis)

print(X_proj.shape)

In [None]:
# Summary of the hand-rolled PCA: projection matrix plus per-component and cumulative variance ratios.
print('Components:\n', W)
print('Explained variance scratch:\n', explained_variance)
print('Cumulative explained variance from scratch:\n', cum_explained_variance)

In [None]:
plt.figure(figsize=(6, 4))
# Derive the x positions from the data instead of hard-coding 4, and number
# the components from 1 so the axis matches the cumulative line plot above.
component_numbers = np.arange(1, len(explained_variance) + 1)
plt.step(component_numbers, cum_explained_variance, where='mid',label='cumulative explained variance')
plt.bar(component_numbers, explained_variance, alpha=0.5, align='center',label='individual explained variance')
plt.xticks(component_numbers)
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()

In [None]:
# Select the two projected columns positionally. `X_proj[0]` only means
# "first column" while X_proj is a DataFrame with integer column labels; on a
# plain ndarray it would silently pick the first ROW instead.
projected = np.asarray(X_proj)
plt.scatter(projected[:, 0], projected[:, 1])
plt.xlabel('PC1'); plt.xticks([])
plt.ylabel('PC2'); plt.yticks([])
# cum_explained_variance holds fractions in [0, 1]; scale to a percentage so
# the number agrees with the '%' sign in the title.
captured_pct = np.round(cum_explained_variance[1] * 100, 2)
plt.title('2 components, captures {}% of total variation'.format(captured_pct))
plt.show()

In [None]:
# Pair plot with the top-k principal axes overlaid as arrows.
# The standardised data is plotted because the eigenvectors were computed in
# standardised space: there the data is centred on the origin and one unit is
# one standard deviation, so arrows and points share the same coordinates.
g = sns.pairplot(X_std)
axes = g.axes
n_features = X_std.shape[1]
for i in range(n_features):          # grid row    -> feature on the y-axis
    for j in range(n_features):      # grid column -> feature on the x-axis
        if i != j:
            for c in range(k):
                # Eigenvectors are the COLUMNS of eig_vecs; scale by the
                # standard deviation along that component.
                pc_scaled = np.sqrt(eig_vals[c]) * eig_vecs[:, c]
                # Each panel shows the component's shadow on the (j, i) feature plane.
                axes[i, j].quiver(0, 0, pc_scaled[j], pc_scaled[i], angles='xy', scale_units='xy', scale=1, color='r')
plt.show()

# 2B

## Importing the Libraries and the Dataset

In [None]:
# Read './Hitters.xlsx' into a DataFrame. This rebinds `df`, replacing the frame loaded in part 2A.
df = pd.read_excel("./Hitters.xlsx")

In [None]:
# Display the loaded frame.
df

## Dataset Understanding and Visualization

In [None]:
# Column names, dtypes and non-null counts.
df.info()

### Finding the Categorical Columns

In [None]:
# Sub-frame containing only the object-dtype columns ('O'); displayed for inspection.
categorical_cols = df.select_dtypes(include = 'O')
categorical_cols

In [None]:
# Histogram grid of the frame's columns, 50 bins each.
df.hist(figsize=(20,10), grid = False, bins = 50)

In [None]:
# Pairwise scatter plots of the frame's columns.
sns.pairplot(df)

In [None]:
# Columns originally earmarked for label encoding; they are dropped instead so
# that only the remaining columns reach the PCA / regression cells.
columns_to_encode=['League','Division','NewLeague']

# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
df = df.drop(columns=columns_to_encode, errors='ignore')

### Taking Care of NaN values

In [None]:
# Per-column flag: True where the column contains at least one missing value.
df.isna().any()

In [None]:
# Impute missing salaries with the mean of the observed salaries.
# `fillna` is used instead of `replace(np.NaN, ...)`: the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
# Alternative kept for reference: df = df.dropna() would discard those rows instead.
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

In [None]:
# Summary statistics of the frame after imputation.
df.describe()

In [None]:
target_column = 'Salary'
# Predictors: every remaining column except the target.
X = df.drop(columns=[target_column])
# Target as a Series.
Y = df[target_column]
X

### Standardising the Data

In [None]:
# Z-score all feature columns in one vectorised step: subtract each column's
# mean and divide by its standard deviation (pandas' default `std`).
X = (X - X.mean()) / X.std()
X

### Data Visualization

In [None]:
# Standardised features side by side with the target; note that Y itself is appended as-is, not rescaled.
df_scaled = pd.concat([X,Y] , axis = 1)

In [None]:
# Heatmap of the pairwise correlations between the features and the target.
sns.heatmap(df_scaled.corr())

In [None]:
# Display the combined frame.
df_scaled

### PCA

In [None]:
def covariance(x):
    """Return the covariance matrix between the columns of ``x`` (one observation per row)."""
    return np.cov(x.T)

cov_mat = covariance(X)

from numpy.linalg import eig

# cov_mat is symmetric, so eigh is the appropriate solver: it always returns
# real eigenvalues, whereas the general-purpose eig may return complex values
# from round-off when features are strongly correlated.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)
sorted_indices = np.argsort(eig_vals)[::-1]
eig_vals = eig_vals[sorted_indices]
# NumPy returns eigenvectors as COLUMNS. Transpose after sorting so that row i
# of eig_vecs is the i-th principal axis. The row slicing `eig_vecs[:k, :]`
# used just below, and again in the PCA regression loop further down, then
# really selects the top-k components instead of the first k coordinates of
# every eigenvector.
eig_vecs = eig_vecs[:, sorted_indices].T

# Select top k eigenvectors
k =4
W = eig_vecs[:k, :] # Projection matrix, shape (k, n_features)

# Share of the total variance carried by each component. The cumulative sum is
# taken on the exact ratios and rounded afterwards so rounding errors do not
# accumulate.
eig_vals_total = np.sum(eig_vals)
explained_variance_exact = eig_vals / eig_vals_total
cum_explained_variance = np.round(np.cumsum(explained_variance_exact), 2)
explained_variance = np.round(explained_variance_exact, 2)

X_proj = X.dot(W.T) ### Projected Data
X_proj


### Linear Regression (without PCA)

In [None]:
def custom_train_test_split(X, y, test_size, random_state):
    """Shuffle the samples and split them into train and test partitions.

    Parameters
    ----------
    X : array-like of shape (n_samples, ...)
        Feature rows; converted with ``np.asarray``.
    y : array-like of shape (n_samples, ...)
        Targets aligned with ``X``.
    test_size : float
        Fraction of samples placed in the test partition
        (``int(test_size * n_samples)`` rows).
    random_state : int or None
        When not None, seeds Python's global ``random`` module before
        shuffling, which makes the split reproducible.

    Returns
    -------
    X_train, X_test, Y_train, Y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    num_samples = len(X)
    if len(y) != num_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {num_samples} and {len(y)}"
        )

    if random_state is not None:
        random.seed(random_state)

    num_test = int(test_size * num_samples)
    indices = list(range(num_samples))
    random.shuffle(indices)

    # Explicit integer dtype so an empty partition is still a valid index array.
    test_indices = np.asarray(indices[:num_test], dtype=int)
    train_indices = np.asarray(indices[num_test:], dtype=int)

    # Index arrays select all rows at once and keep the trailing dimensions and
    # dtype even when a partition is empty.
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

In [None]:
# Convert to NumPy arrays: the split helper addresses samples by positional row index.
X1 = X.to_numpy()
Y1 = Y.to_numpy()

In [None]:
# Hold out 20% of the samples for testing; random_state=2 fixes the shuffle.
X_train , X_test , Y_train , Y_test = custom_train_test_split(X1,Y1,test_size=0.2,random_state=2)

In [None]:
def predict_Y( bias ,weights , features):
  """Linear model output: the dot product of ``features`` and ``weights``, shifted by ``bias``."""
  weighted_sum = np.dot(features, weights)
  return bias + weighted_sum

def get_cost(Y,Y_hat):
  """Root-mean-squared error between the targets ``Y`` and the predictions ``Y_hat``."""
  squared_residuals = (Y - Y_hat) ** 2
  return np.sqrt(squared_residuals.mean())

def update_theta(x , y , y_hat , b_0 , theta_o , learning_rate):
  """Take one gradient-descent step and return the updated ``(bias, weights)``.

  The gradients are the residuals ``y_hat - y`` averaged over the samples:
  summed directly for the bias, and dotted with ``x`` for the weights.
  """
  n_samples = len(y)
  residual = y_hat - y
  bias_gradient = np.sum(residual) / n_samples
  weight_gradient = np.dot(residual, x) / n_samples
  return b_0 - learning_rate*bias_gradient , theta_o - learning_rate*weight_gradient

In [None]:
# Sanity check of the partition shapes.
X_train.shape,Y_train.shape,X_test.shape,Y_test.shape

In [None]:
def run_gradient_descent(X,Y,alpha,num_iterations):
  """Fit a linear model by batch gradient descent.

  The bias starts from ``random.random()`` and the weights from
  ``np.random.rand``; ``alpha`` is the learning rate.

  Returns ``(b, theta, J)`` where ``J`` lists the cost measured at the start
  of each iteration, i.e. before that iteration's parameter update.
  """
  b = random.random()
  theta = np.random.rand(X.shape[1])
  J = []
  for _ in range(num_iterations):
    Y_hat = predict_Y(b, theta, X)
    # Cost of the parameters that produced Y_hat, recorded before stepping.
    J.append(get_cost(Y, Y_hat))
    b, theta = update_theta(X, Y, Y_hat, b, theta, alpha)

  print("Final Estimate of b and theta : ",b,theta)
  return b,theta,J

In [None]:
plt.figure(figsize=(10,6))

# Baseline: fit on all standardised features, then score on the held-out split.
b,theta,cost_curve=run_gradient_descent(X_train , Y_train , alpha=0.01 , num_iterations=500)
y_test_pred = predict_Y(b , theta , X_test)
y_test_error= get_cost(Y_test,y_test_pred)
# The curve is the per-iteration training cost (RMSE), so label it as such;
# "Epochs" is what the x-axis measures, not what the line shows.
plt.plot(cost_curve , label = "Training RMSE")

plt.title("Regression without PCA")
plt.xlabel('epochs')
plt.ylabel('Training RMSE')
plt.legend()
print('\n')
print(f'Minimum Cost Function Value: {min(cost_curve)}')
print(f'Test errors: {y_test_error}')

## With PCA

In [None]:
test_errors = []
bs= []
thetas = []
# The target does not depend on the number of components; convert it once.
Y1 = Y.to_numpy()
# One model per component count, from 1 up to the number of features.
for k in range(1, X.shape[1] + 1):
  # NOTE(review): this takes the first k ROWS of eig_vecs, which is the top-k
  # principal axes only if eig_vecs stores one eigenvector per row. NumPy's
  # eigen-solvers return them as columns, so confirm against the PCA cell.
  W = eig_vecs[:k, :]
  X_pca = X.dot(W.T) ### Projected Data
  X1 = X_pca.to_numpy()
  X_train , X_test , Y_train , Y_test = custom_train_test_split(X1,Y1,test_size=0.2,random_state=2)

  b,theta,cost_curve=run_gradient_descent(X_train , Y_train , alpha=0.01 , num_iterations=500)

  y_test_pred = predict_Y(b , theta , X_test)
  # Keep the PCA error under its own name: `y_test_error` holds the no-PCA
  # baseline that the comparison cell below reads, and must not be overwritten.
  pca_test_error = get_cost(Y_test , y_test_pred)
  test_errors.append(pca_test_error)
  bs.append(b)
  thetas.append(theta)
  plt.plot(cost_curve , label ="PCA with " + str(k)+ " features")

plt.title("Regression with PCA")
plt.xlabel('epochs')
# The curves are per-iteration training costs, not test errors.
plt.ylabel('Training RMSE')
plt.legend()
print('\n')
# cost_curve still refers to the last model fitted in the loop.
print(f'Minimum Cost Function Value: {min(cost_curve)}')
print(f'Test errors: {test_errors}')

In [None]:
# Repeat the stored test error 16 times so it can be drawn as a flat reference line.
# NOTE(review): `y_test_error` is a notebook global; confirm that no cell run in
# between has reassigned it, so that it still holds the no-PCA value here.
test_error_nopca = [y_test_error] * 16

In [None]:
# Plot against the real component counts (1..n). Without explicit x values
# matplotlib uses the 0-based list index, so every point would sit one
# component to the left of what the axis label says.
plt.plot(np.arange(1, len(test_error_nopca) + 1), test_error_nopca , label = "Without PCA")
plt.plot(np.arange(1, len(test_errors) + 1), test_errors , label = "PCA")
plt.xlabel("No. of principal components")
plt.ylabel("Test Errors")
plt.legend()

### Looking at the graph, we can observe that the model that gives the best results is the one with 4 principal components
Therefore, the best estimate of the weights and bias is:

In [None]:
# Index 3 is the 4-component model: the loop above starts at k=1 and appends one entry per k.
print(f'The best estimate of Bias b : {bs[3]}')
print(f'The best estimate of Weights Thetas : {thetas[3]}')

In [None]:
# Inspect the stored bias next to the shape of the k=4 projection computed in the PCA cell.
bs[3] , X_proj.shape

In [None]:
# Shapes of the selected bias and weight vector.
bs[3].shape , thetas[3].shape

### Visualization of the Model with the optimal parameters

In [None]:
# Predict for every sample with the 4-component model (index 3).
predictions = predict_Y(bs[3] , thetas[3] , X_proj)
# Derive the x positions from the data instead of hard-coding the sample count.
sample_ids = np.arange(1 , len(predictions) + 1)
plt.scatter(sample_ids , predictions , c = 'green' , label = 'Predictions')
plt.scatter(sample_ids , Y , c = 'red' , label = 'Actual')
plt.title("Projections of Predicted data vs Actual Data")
plt.legend()

In [None]:
# Two-column array: one row per sample, holding (prediction, actual value).
print("         Predictions    Actual Data")
np.stack((predictions , Y), axis=1)