In [34]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import math
from sklearn.linear_model import LinearRegression
from sklearn import metrics


# Paths of the salary train / test CSV files, relative to the notebook.
train_fname = "../csv/salary_full_train.csv"
test_fname = "../csv/salary_full_test.csv"

# Load the training split; report its shape and preview the first 5 rows.
train_dataset = pd.read_csv(train_fname, header=0)
print('Shape of the train data {}'.format(train_dataset.shape))
print(train_dataset.head(5))
print('\n\n')

# Load the test split and report its shape.
test_dataset = pd.read_csv(test_fname, header=0)
print('Shape of the test data {}'.format(test_dataset.shape))

# Columns used to build the feature matrix X.
# Alternative feature sets (kept commented out):
# feature_cols = ["ry", "yd", "sex", "asoc", "full", "phd"]
# feature_cols = ["sex"]
feature_cols = ["asoc", "full", "ry"]
# Column holding the regression target (vector y).
target = "salary"

X, y = train_dataset[feature_cols], train_dataset[target]

# print(X.head(5))
# print(y.head(5))

# Fit a linear regression of `target` on `feature_cols` using the training split.
regressor = LinearRegression()
regressor.fit(X, y)

print('\tintercept = {}'.format(regressor.intercept_))
print('\tcoefficient = {}'.format(regressor.coef_))

# Predict the target for every row of the test split.
tX, ty = test_dataset[feature_cols], test_dataset[target]
y_pred = regressor.predict(tX)

print("\n")
# Plain numpy copies; only the commented-out diagnostics below read them.
y_test, X_test = np.array(ty), np.array(tX)

# Store the predictions next to the test rows, then average them per value of
# the `sex` column (presumably 0 = female, 1 = male, judging by the names
# f_mean / m_mean -- confirm against the dataset description).
test_dataset["y_pred"] = y_pred
f_mean = test_dataset.loc[test_dataset["sex"] == 0, "y_pred"].mean()
m_mean = test_dataset.loc[test_dataset["sex"] == 1, "y_pred"].mean()
print("f_mean :", f_mean)
print("m_mean :", m_mean)

print(regressor.coef_)

# Disabled diagnostics: per-row true vs. predicted values and error metrics.
# i = 0
# for a, b in zip(y_test, y_pred):
#     sex = test_dataset['sex'].iloc[i]
#     print(f'  true value: {a} \t predicted value: {b} \t {sex}')
#     i += 1

# print('\n\n')
# print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# print('\n')


Shape of the train data (36, 7)
   salary  ry  yd  sex  asoc  full  phd
0   28200  10  23    1     0     1    1
1   18075   3   4    1     0     0    1
2   33696  19  30    1     0     1    0
3   20525   8  31    1     1     0    0
4   18000   3  11    1     0     0    0



Shape of the test data (16, 7)
	intercept = 15993.519137775615
	coefficient = [3906.6927383  9313.3643144   397.55544627]


f_mean : 17119.92623555448
m_mean : 23530.649097614947
[3906.6927383  9313.3643144   397.55544627]


In [37]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics


# -------------------------------------------------------------------------------

def normalize(train_data, test_data, col_class, method='mean_std'):
    """
    Normalizes all the features by linear transformation *except* for the target class specified as `col_class`.
    Two normalization methods are implemented:
      -- `mean_std` shifts by the mean and divides by the standard deviation
      -- `maxmin` shifts by the min and divides by the difference between max and min
      *Note*: mean/std/max/min are computed on the training data only and then
      applied to both the training and the test data.
      *Note*: a feature that is constant in the training data has a scale of 0
      (std or max - min); its scale is replaced by 1 so the feature is only
      shifted instead of producing NaN/inf values.
    The function returns a pair normalized_train, normalized_test. For example,
    if you had `train` and `test` pandas DataFrames with the class stored in column `Col`, you can do

        train_norm, test_norm = normalize(train, test, 'Col')

    to get the normalized `train_norm` and `test_norm`. In both returned frames
    the (unscaled) class column comes first, followed by the scaled features.

    Raises:
        ValueError: if `method` is neither 'mean_std' nor 'maxmin'.
    """
    # removing the class column so that it is not scaled
    no_class_train = train_data.drop(col_class, axis=1)
    no_class_test = test_data.drop(col_class, axis=1)

    # per-column shift and scale, computed once from the training data
    if method == 'mean_std':
        shift = no_class_train.mean()
        scale = no_class_train.std()
    elif method == 'maxmin':
        shift = no_class_train.min()
        scale = no_class_train.max() - shift
    else:
        # a bare string cannot be raised in Python 3; use a real exception type
        raise ValueError(f"Unknown method {method!r}; expected 'mean_std' or 'maxmin'")

    # constant training columns would divide by zero; leave them shifted only
    scale = scale.where(scale != 0, 1)

    # scaling
    normalized_train = (no_class_train - shift) / scale
    normalized_test = (no_class_test - shift) / scale

    # gluing back the class column and returning
    return (
        pd.concat([train_data[col_class], normalized_train], axis=1),
        pd.concat([test_data[col_class], normalized_test], axis=1),
    )

# -------------------------------------------------------------------------------

# Load the training split; report its shape and preview its first row.
tr_dat = pd.read_csv("../csv/maisons_original_train.csv", header=0)
print('Shape of the train data {}'.format(tr_dat.shape))
print(tr_dat.head(1))


print('\n\n')
# Load the test split and report its shape.
te_dat = pd.read_csv("../csv/maisons_original_test.csv", header=0)
print('Shape of the test data {}'.format(te_dat.shape))

# Keep the raw frames under the generic names used below, and build a second
# pair in which every column except "price" is normalized (default method).
train_dataset = tr_dat
test_dataset = te_dat
train_dataset_norm, test_dataset_norm = normalize(tr_dat, te_dat, "price")

def fit(train_dataset):
    """Fit a 1-nearest-neighbour regressor predicting the "price" column.

    All columns of `train_dataset` other than "price" are used as features.
    Returns the fitted `KNeighborsRegressor`.
    """
    model = KNeighborsRegressor(n_neighbors=1, algorithm='kd_tree', weights='distance')
    features = train_dataset.drop(columns="price")
    prices = train_dataset["price"]
    model.fit(features, prices)
    return model

# One model trained on the raw features, one on the normalized features.
regressor = fit(train_dataset)
regressor_norm = fit(train_dataset_norm)

# Test features / targets for each variant ("price" is the target column).
tX = test_dataset.drop("price", axis=1)
tX_norm = test_dataset_norm.drop("price", axis=1)
ty = test_dataset["price"]
ty_norm = test_dataset_norm["price"]

y_pred = regressor.predict(tX)
y_pred_norm = regressor_norm.predict(tX_norm)

print("\n")
y_test = np.array(ty)
y_test_norm = np.array(ty_norm)
X_test = np.array(tX)
X_test_norm = np.array(tX_norm)

# for a, b in zip(y_test, y_pred):
#     print(f'  true value: {a} \t predicted value: {b}')

# Report the same four error metrics for both variants.
for heading, truth, guess in (
    ("# original:", y_test, y_pred),
    ("#normalized:", y_test_norm, y_pred_norm),
):
    mse = metrics.mean_squared_error(truth, guess)
    print(heading)
    print('Mean Absolute Error:', metrics.mean_absolute_error(truth, guess))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('R2 Score:', metrics.r2_score(truth, guess))
    print('\n')


Shape of the train data (33, 8)
   price  sqft  age  feats  ne  cust  cor  tax
0    699  1400   45      1   0     1    1  481



Shape of the test data (33, 8)


# original:
Mean Absolute Error: 160.3030303030303
Mean Squared Error: 72206.78787878787
Root Mean Squared Error: 268.71320748855624
R2 Score: 0.6115929277395976


#normalized:
Mean Absolute Error: 179.84848484848484
Mean Squared Error: 58988.454545454544
Root Mean Squared Error: 242.87538892496815
R2 Score: 0.682695580287729


