In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, confusion_matrix, balanced_accuracy_score
from sklearn.preprocessing import normalize, StandardScaler

# import plotly.graph_objects as go
# import plotly

In [2]:
df = pd.read_csv('house_data_class.csv')
df_orig= df.copy()

In [3]:
df

Unnamed: 0,price,bedrooms,sqft_living,floors,waterfront,view,condition,yr_built,grade
0,221900,3,1180,1.0,0,0,3,1955,7
1,538000,3,2570,2.0,0,0,3,1951,7
2,180000,2,770,1.0,0,0,3,1933,6
3,604000,4,1960,1.0,0,0,5,1965,7
4,510000,3,1680,1.0,0,0,3,1987,8
5,1225000,4,5420,1.0,0,0,3,2001,11
6,257500,3,1715,2.0,0,0,3,1995,7
7,291850,3,1060,1.0,0,0,3,1963,7
8,229500,3,1780,1.0,0,0,3,1960,7
9,323000,3,1890,2.0,0,0,3,2003,7


In [4]:
# df_corr = df.corr(method='spearman')
# fig = go.Figure(data=go.Heatmap(
#     z=df_corr,
#     x=df_corr.columns.tolist(),
#     y=df_corr.columns.tolist(),
#     colorscale='RdYlGn',
#     zmin=-1, zauto=False,
    
# ))
# fig.update_layout(
#     title='Correlogram',
#     xaxis_nticks=36)
# fig.show()

In [5]:
df['grade_binary'] = np.where(df['grade'] > 7, 1, 0)

In [6]:
df.grade_binary.value_counts()

0    11294
1    10319
Name: grade_binary, dtype: int64

In [7]:
df_copy= df.copy()
df = df.iloc[:100, :]

In [8]:
class LogisticRegression():
    def __init__(self, max_iter=10000, alpha=0.001, to_scale=True):
        self.max_iter = max_iter
        self.alpha = alpha
        self.to_scale = to_scale
    
    def calculate_cost(self):
        assert(np.isin(self.y, [0,1]).all())
        
        self.J = (-1/self.m)*(np.sum((self.y*np.log(self.h)) + ((1-self.y)*(np.log(1-self.h)))))
        
    def calculate_theta(self):
#         for j, _ in enumerate(self.theta):
#             self.theta[j] = self.theta[j] \
#             - (self.alpha / self.m) * np.sum(np.dot((self.h - self.y), self.X[:,j].T))
        self.theta = self.theta-(self.alpha/self.m)*(self.X.T.dot(self.h-self.y))
 
    def fit(self, X, y):
        self.X = np.array(X)
        self.y = np.array(y)
        
        # insert intercept
        self.X = np.insert(self.X, obj = 0, values = 1, axis=1)
        
        # scale
        if self.to_scale:
            self.st_scaler=StandardScaler()
            self.X = self.st_scaler.fit_transform(self.X)
        
        self.m = self.X.shape[0]
        self.n = self.X.shape[1]
        
        np.random.seed(42)
        self.theta=np.random.rand(self.n)
        self.J = 1e10
        t = np.matmul(self.X, self.theta)
        self.h = np.divide(1.,(1. + np.exp(-t)))
        for _ in range(self.max_iter):
#             print(self.h)
            self.calculate_cost()
            self.calculate_theta()
            t = np.matmul(self.X, self.theta)
            self.h = 1 / (1 + np.exp(-t))
                       
        
    def predict(self, X_test):
        self.X_test = np.array(X_test)
        self.X_test = np.insert(self.X_test, obj = 0, values = 1, axis=1)
        
        if self.to_scale:
            self.X_test = self.st_scaler.transform(self.X_test)
        t = np.matmul(self.X_test, self.theta)
        return 1 / (1 + np.exp(-t))

In [9]:
lr = LogisticRegression(to_scale=True)
lr.fit(df.drop(['grade_binary'], axis=1), df['grade_binary'])

In [10]:
lr.predict(df.drop(['grade_binary'], axis=1))

array([0.0236116 , 0.28488515, 0.00136438, 0.41902066, 0.50433196,
       0.99998266, 0.14194901, 0.0334114 , 0.03958394, 0.22092123,
       0.81613189, 0.04704864, 0.04554251, 0.11086396, 0.12036136,
       0.9916677 , 0.24243674, 0.11562309, 0.01075904, 0.04699593,
       0.10353172, 0.99991541, 0.7469575 , 0.02970552, 0.68459522,
       0.0155572 , 0.82455914, 0.50599576, 0.06035888, 0.94563176,
       0.84468236, 0.16616247, 0.38568534, 0.33085891, 0.13856785,
       0.60512576, 0.00236549, 0.69405492, 0.03468336, 0.90488221,
       0.98183612, 0.95893053, 0.99818339, 0.17445537, 0.01129666,
       0.86821943, 0.00496998, 0.98847926, 0.08169491, 0.99967575,
       0.02435208, 0.7468945 , 0.14713403, 0.16908382, 0.91328204,
       0.99411845, 0.3419542 , 0.65064938, 0.99557992, 0.71545224,
       0.82718816, 0.03187316, 0.8419462 , 0.86947693, 0.10822271,
       0.14448274, 0.99901346, 0.32271848, 0.03543351, 0.99123183,
       0.99995393, 0.0718893 , 0.36785068, 0.77971736, 0.13334

In [11]:
grades = df_orig['grade'].value_counts().index.tolist()
print(grades)
print(grades[:-1])
df_orig['grade'].value_counts()

[7, 8, 9, 6, 10, 11, 5, 12, 4, 13, 3, 1]
[7, 8, 9, 6, 10, 11, 5, 12, 4, 13, 3]


7     8981
8     6068
9     2615
6     2038
10    1134
11     399
5      242
12      90
4       29
13      13
3        3
1        1
Name: grade, dtype: int64

In [12]:
df_orig['pred_y']= np.nan

predict_idx = df_orig.index.tolist()
grades = df_orig['grade'].value_counts().index.tolist()
# grades = np.unique(df_orig['grade'])
max_iter=10000
for grade in grades[:-1]:
    max_iter -= 500
    lr = LogisticRegression(max_iter, alpha=0.0001)
    df_pred = df_orig.drop('pred_y' ,axis=1).loc[predict_idx].copy()
    df_pred['grade_binary'] = np.where(df_pred['grade'] == grade, 1, 0)
    lr.fit(df_pred.drop(['grade', 'grade_binary'], axis=1), df_pred['grade_binary'])
    df_orig.loc[predict_idx, 'pred_y'] = np.where(lr.predict(df_pred.drop(['grade', 'grade_binary'], axis=1))>0.5, grade, np.nan) 
    predict_idx = df_orig[df_orig['pred_y'].isna()].index.tolist()

df_orig['pred_y'] = df_orig['pred_y'].fillna(grades[-1])

In [13]:
df_orig['pred_y'].value_counts(dropna=False)

7.0     9952
8.0     6279
9.0     3222
6.0     1325
10.0     498
11.0     196
5.0       90
12.0      31
4.0       13
13.0       4
1.0        2
3.0        1
Name: pred_y, dtype: int64

In [14]:
pd.DataFrame(confusion_matrix(df_orig['grade'], df_orig['pred_y']), 
             columns = np.unique(df_orig['grade']),
             index = np.unique(df_orig['grade'])).rename_axis("↓ relevant", axis=0).rename_axis("→ selected", axis=1)

→ selected,1,3,4,5,6,7,8,9,10,11,12,13
↓ relevant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,0,0,1,0
4,1,0,3,7,6,0,0,1,2,5,4,0
5,0,1,7,19,50,31,19,38,37,25,11,4
6,0,0,2,55,461,352,238,490,290,135,15,0
7,0,0,1,8,765,3051,2780,2184,161,31,0,0
8,0,0,0,0,39,2917,2614,490,8,0,0,0
9,0,0,0,0,3,2042,552,18,0,0,0,0
10,0,0,0,0,0,1059,74,1,0,0,0,0
11,0,0,0,0,0,398,1,0,0,0,0,0


In [15]:
balanced_accuracy_score(df_orig['grade'], df_orig['pred_y'])

0.18212898503590394

In [16]:
(df_orig['grade']==df_orig['pred_y']).sum()/df.shape[0]

61.67