In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()
sns.set_style("darkgrid")

In [2]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [3]:
ls

[0m[01;34mgdrive[0m/  [01;34msample_data[0m/


In [4]:
cd gdrive/MyDrive/FoDS_Assignment2/

/content/gdrive/MyDrive/FoDS_Assignment2


In [5]:
df = pd.read_csv("FoDS-Assignment-2.csv")
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,price
0,4,1.75,2120.0,7420,1.0,0,0,4,7,1060.0,1060,1540,7420,453000.0
1,4,2.5,2180.0,9861,2.0,0,2,3,8,2180.0,0,2390,9761,480000.0
2,3,1.5,1540.0,9800,1.0,0,0,3,7,1010.0,530,1600,8250,180500.0
3,3,3.5,2380.0,6250,2.0,0,3,3,8,1670.0,710,2540,4010,495000.0
4,4,2.5,2230.0,8500,2.0,0,0,3,8,2230.0,0,2270,8770,325000.0


In [6]:
df.isna().sum()

bedrooms          0
bathrooms         0
sqft_living      14
sqft_lot          0
floors           13
waterfront        0
view              0
condition         0
grade             0
sqft_above       14
sqft_basement     0
sqft_living15     0
sqft_lot15        0
price             0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [None]:
sns.heatmap(data = df.corr())

In [None]:
fig = plt.figure(figsize =(40,20))
i=1
for s in df.columns:
    plt.subplot(3,5, i)
    plt.title(s,fontsize=25)
    sns.boxplot(data=df[[s]])
    i=i+1
plt.grid()

In [None]:
fig = plt.figure(figsize=(40,40))
plt.rc('axes', labelsize=25)
for i, col in enumerate(df.columns[:-1]):
    plt.subplot(4,4,i+1)
    plt.scatter(df[col], df["price"])
    plt.xlabel(col)
    plt.ylabel("price")
plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors


def preprocess(data, f, scalingMethod="normalize", outlierMethod="knn", missingValues="drop"):
    EPS = 1e-12

    if missingValues == "drop":
        data = data.dropna()
    
    if outlierMethod == "knn":
        scaled_data = data.iloc[:,:-1]
        scaled_data = (scaled_data - scaled_data.mean())/scaled_data.std()
        nbrs = NearestNeighbors()
        nbrs.fit(scaled_data)
        distances, indexes = nbrs.kneighbors(scaled_data)
        outlier_index = np.where(distances.mean(axis = 1) > 1)
        #print(outlier_index.shape)
        # print(outlier_index)
        # print(data.index[outlier_index[0]])
        data = data.drop(data.index[outlier_index[0]])

    train = data.sample(frac=f, random_state=0)
    test = data.drop(train.index)    
    X_train = train.iloc[:, :-1]
    y_train = train.iloc[:, -1]
    X_test = test.iloc[:, :-1]
    y_test = test.iloc[:, -1]

    if scalingMethod == "normalize":
        mean = X_train.mean()
        sd = X_train.std()
        X_train = (X_train - mean) / (sd + EPS)
        X_test = (X_test - mean) / (sd + EPS)
        # mean_y = y_train.mean()
        # sd_y = y_train.std()
        # y_train = (y_train-mean_y)/(sd_y + EPS)
        # y_test = (y_test-mean_y)/(sd_y + EPS)
        return np.array(X_train), np.array(y_train).reshape(-1,1), np.array(X_test), np.array(y_test).reshape(-1,1)
    elif scalingMethod == "scale":
        minval = X_train.min()
        diff = X_train.max() - X_train.min()

        X_train = (X_train - minval) / (diff + EPS)
        X_test = (X_test - minval) / (diff + EPS)
        return np.array(X_train), np.array(y_train).reshape(-1,1), np.array(X_test), np.array(y_test).reshape(-1,1)

In [16]:
class KNN:

  def __init__(self,X):
    self.X = X

  def euclidean_distance(self,point1, point2): 
    return np.sqrt(np.sum(np.power(point1-point2,2)))

  def outlier_indices(self,k,threshold):   
    dist = []
    for i in range(0,self.X.shape[0]):
      distance = np.zeros(self.X.shape[0])
      for j in range(0,self.X.shape[0]):
        distance[j] = (self.euclidean_distance(self.X[i],self.X[j]))  
      dist.append(np.mean(distance[np.argsort(-1*distance)[:k]]))  

    return(np.argwhere(dist>=threshold)) 

IndentationError: ignored

In [None]:
class LinearRegression:

    def __init__(self, X, y):
            self.X = X #(n,d)
            self.y = y #(n,1)
            self.W = np.zeros((1, X.shape[1])) #(1,d)
            self.b = 1

    def fitGD(self, n_iter=1000, lr=0.01, graph=False):
        mse = []
        for i in tqdm(range(n_iter)):
            yp = self.predict(self.X)
            mse.append(self.MSE(self.y, yp))
            # print(mse[-1])
            grad = (1/self.X.shape[0])*((self.X.T)@(yp-self.y)).T #(d,n)*(n,1) = (d,1)
            self.W = self.W - lr * grad
            self.b = self.b - lr * np.mean(yp-self.y)
        if(graph == True):
            plt.plot(mse)
            plt.show()
        return self.W, self.b
    
    def predict(self, X):
        return (X@self.W.T + self.b)

    def MSE(self, y, yp):
        return 0.5*np.mean((y-yp)**2)
    
    def RMSE(self, y, yp):
        return np.sqrt(self.MSE(y, yp))


In [None]:
X_train, y_train, X_test, y_test = preprocess(df, 0.75)
model = LinearRegression(X_train, y_train)
model.fitGD(lr=0.1, graph=True)

In [None]:
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_absolute_percentage_error

def GFFS(X_train, y_train, X_test, y_test):
    d = X_train.shape[1]
    remaining = list(range(d))
    selected = []
    best_n_feature_MSE = []
    for i in range(d):
        min_mse = 1e100
        min_j = -1
        max_r2 = 0
        for j in remaining:
            cur_features = selected + [j]
            X_train_cur = X_train[:,cur_features]
            X_test_cur = X_test[:, cur_features]
            model = LR()
            model.fit(X_train_cur, y_train)
            mse = mean_absolute_percentage_error(y_test, model.predict(X_test_cur))
            #model = LinearRegression(X_train_cur, y_train)
            #model.fitGD(lr=0.1,n_iter=10000)
            #mse = model.MSE(y_test,model.predict(X_test_cur))
            if mse < min_mse:
                min_mse = mse
                min_j = j
                max_r2 = r2_score(y_test, model.predict(X_test_cur))
        best_n_feature_MSE.append(min_mse)
        selected.append(min_j)
        remaining.remove(min_j)
        print((min_mse, min_j, max_r2))
    return selected, best_n_feature_MSE



In [None]:
X_train, y_train, X_test, y_test = preprocess(df, 0.75)
selected, best_n_feature_MSE = GFFS(X_train, y_train, X_test, y_test)
print(selected)
np.array(df.columns)[selected]

In [None]:
#from sklearn.linear_model import LinearRegression as LR
#from sklearn.metrics import mean_squared_error

def GBFS(X_train, y_train, X_test, y_test):
    d = X_train.shape[1]
    removed = [-1]
    selected = list(range(d))
    model = LR()
    model.fit(X_train, y_train)
    best_n_feature_MSE = [mean_squared_error(y_test, model.predict(X_test))]
    for i in range(d-1):
        min_mse = 1e100
        min_j = -1
        for j in selected:
            cur_features = [x for x in selected if x != j]
            X_train_cur = X_train[:, cur_features]
            X_test_cur = X_test[:, cur_features]
            model = LR()
            model.fit(X_train_cur, y_train)
            mse = mean_squared_error(y_test, model.predict(X_test_cur))
            if mse < min_mse:
                min_mse = mse
                min_j = j
        best_n_feature_MSE.append(min_mse)
        selected.remove(min_j)
        removed.append(min_j)
        print((min_mse, min_j))
    removed.append(selected[0])
    return removed, best_n_feature_MSE

In [None]:
X_train, y_train, X_test, y_test = preprocess(df, 0.75)
selected, best_n_feature_MSE = GFFS(X_train, y_train, X_test, y_test)
print(selected)
np.array(df.columns)[selected]


In [None]:
X_train, y_train, X_test, y_test = train_test_split(df, 0.75)
removed, best_n_feature_MSE = GBFS(X_train, y_train, X_test, y_test)
print(removed)
np.array(df.columns)[removed[1:]]

In [None]:
f_list = np.arange(0,13,1)
f = []
for i in range(1,14):
    f.append(f_list)
    f_list = f_list[f_list != removed[i]]
print("Feature Subsets (GBFS)")
for i in range(len(f)):
    print("Features",np.array(df.columns)[f[i]])
    print("MSE",best_n_feature_MSE[i])

In [None]:
fig = plt.figure(figsize=(10,10))
sns.lineplot(x=(df.columns)[removed[1:]],y=best_n_feature_MSE,marker='o')
plt.xticks(rotation=90)
plt.show()