# 1.4. AirBnB Project Template

## Imports

In [None]:
import numpy as np # conda install numpy
import tensorflow as tf
import matplotlib.pyplot as plt # conda install matplotlib
import pandas as pd # conda install pandas
import warnings
import seaborn as sns # conda install seaborn - Python data visualization library based on matplotlib

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/TensorFlow deprecation notices - consider narrowing it.
warnings.filterwarnings(action='ignore')

# Record which TensorFlow release produced the outputs below
tf_version = tf.__version__
print(tf_version)

## Load data and take a look at it

In [None]:
# Load the listings and shuffle the rows: sample(frac=1) returns the whole frame in random order,
# so the positional train/test split done later is a random split.
# The fixed seed makes the shuffle (and therefore the split) reproducible on "Restart & Run All".
SEED = 42
data = pd.read_csv('airbnb new york.csv').sample(frac=1, random_state=SEED)

# Only the LAST expression of a cell is rendered automatically, so the head has to be
# printed explicitly; a bare `data.head()` in the middle of the cell shows nothing.
print('\n*** Data head\n')
print(data.head())
print('\n*** Data describe\n')
data.describe()


## Data preprocessing

In [None]:
# Select the model inputs. `.copy()` makes `features` an independent frame, so the fillna
# assignment below modifies it directly instead of writing into a slice of `data`
# (which is what triggers pandas' SettingWithCopyWarning).
features = data[['neighbourhood_group', 'room_type', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']].copy()
print('*** Columns\n', features.head())
# the print above shows that neighbourhood_group and room_type hold categorical values

print('\n*** Missing values\n', features.isna().sum())
# isna() returns a same-sized boolean object where NA values (None or numpy.NaN) map to True;
# sum() then counts the missing values per column.
# Per the original author's run, 'reviews_per_month' has several thousand NA (missing) values.

features['reviews_per_month'] = features['reviews_per_month'].fillna(0) # fill NA missing values with 0s
print('\n*** Cleaned data\n', features.isna().sum()) # 'reviews_per_month' should now report 0 missing values

# get one-hot encoding with pd.get_dummies()
# dtype=float keeps the encoded columns numeric on every pandas version (recent versions default
# to bool, which would turn `features.values` into an object array).
onehot_neighborhood_group = pd.get_dummies(features['neighbourhood_group'], dtype=float)
onehot_room_type = pd.get_dummies(features['room_type'], dtype=float)
# show only the first rows - the full frames have one row per listing
print('\n*** onehot encoded data\n', onehot_neighborhood_group.head())
print(onehot_room_type.head())

features = features.drop(columns=['neighbourhood_group', 'room_type']) # drop columns with categorical data
features = pd.concat([features, onehot_neighborhood_group, onehot_room_type], axis=1) # concatenate dataframe with onehot encoded columns
print('\n*** Processed data\n', features.head()) # observe updated features

targets = data['price'] # get the targets (same row order as `features`, both come from `data`)

train_size = int(0.7 * len(data)) # 70% of the data will be used for training purposes

# first 70% of rows (all columns) go to X_train, the remaining 30% of rows (all columns) go to X_test
X_train, X_test = features.values[:train_size, :], features.values[train_size:, :]
y_train, y_test = targets.values[:train_size], targets.values[train_size:]

# sanity checks: the split must cover every row once and keep inputs and targets aligned
assert len(X_train) + len(X_test) == len(features)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# shape[1] is the number of feature columns (also works when the training split is empty)
print('\nTotal number of columns\n', X_train.shape[1])

## Data visualization and analysis

## The TensorFlow 2 Machine Learning Approaches
### Linear Regression
#### Model definition

In [None]:
# reuses the LinearModel class from the Linear regression notebook, extended to multiple features
class LinearModel:
    """Multi-feature linear regression, y_pred = X @ W + b, fitted with plain gradient descent.

    The weights ``W`` (shape ``[n_features, 1]``) and bias ``b`` (shape ``[1]``) are created
    inside ``train``, so ``train`` must be called before ``predict``.
    """

    def __init__(self):
        # Initializers define how the initial random weights (note the plural) are drawn.
        # The model moved from a single-variable to a multi-variable linear regression,
        # so W is a matrix created later in train(), once the number of features is known.
        self.initializer = tf.keras.initializers.GlorotUniform()

    # loss function
    def loss(self, y, y_pred):
        """Return the mean absolute error (MAE) between targets ``y`` and predictions ``y_pred``."""
        # MAE is used instead of mean squared error (MSE):
        # MAE treats all errors equally, minimizing the impact of outliers on the loss function.
        # MSE provides faster convergence BUT is susceptible to outliers, which makes it
        #     less suitable for datasets containing anomalies.
        # https://medium.com/@nirajan.acharya666/choosing-between-mean-squared-error-mse-and-mean-absolute-error-mae-in-regression-a-deep-dive-c16b4eeee603
        # MSE alternative: tf.reduce_mean(tf.square(y - y_pred))
        return tf.reduce_mean(tf.abs(y - y_pred))

    # train function
    def train(self, X, y, lr=0.00001, epochs=20, verbose=True):
        """Fit ``W`` and ``b`` with full-batch gradient descent on the MAE loss.

        Args:
            X: 2-D array-like of shape [n_instances, n_features]; converted to float32.
            y: 1-D array-like of targets; converted to float32 and reshaped to [n_instances, 1].
            lr: learning rate applied to every gradient step.
            epochs: number of gradient steps (each step uses the whole of X).
            verbose: when True, print the inputs before/after conversion and the loss per epoch.
        """
        # asarray - converts the input to an array and ensures we are using numpy float32 arrays
        if verbose:
            print('\n*** X before array\n', X)
        X = np.asarray(X, dtype=np.float32)
        if verbose:
            print('\n*** X after array\n', X)

        # reshape() - gives a new shape to an array without changing its data
        if verbose:
            print('\n*** y before array and reshape()\n', y)
        y = np.asarray(y, dtype=np.float32).reshape((-1, 1)) # [1,2,3,4] -> [[1],[2],[3],[4]]
        if verbose:
            print('\n*** y after array and reshape()\n', y)

        # use the initializer from the constructor to (re)initialize the weights and the bias;
        # every call to train() therefore starts from fresh random values
        n_features = X.shape[1]
        self.W = tf.Variable(initial_value=self.initializer(shape=(n_features, 1), dtype='float32'))
        self.b = tf.Variable(initial_value=self.initializer(shape=(1,), dtype='float32'))

        def train_step():
            # record the forward pass so the tape can differentiate the loss w.r.t. W and b
            with tf.GradientTape() as t:
                current_loss = self.loss(y, self.predict(X))

            dW, db = t.gradient(current_loss, [self.W, self.b])
            self.W.assign_sub(lr * dW) # W -= lr * dW
            self.b.assign_sub(lr * db) # b -= lr * db

            return current_loss

        for epoch in range(epochs):
            current_loss = train_step()
            if verbose:
                print(f'Epoch {epoch}: loss: {current_loss.numpy()}') # <3 eager execution

    def predict(self, X):
        """Return predictions of shape [n_instances, 1] for ``X`` of shape [n_instances, n_features]."""
        # Coerce the input to float32 so it matches the dtype of W: tf.matmul rejects mixed
        # dtypes, and arrays taken straight from a DataFrame are float64 (or object).
        if tf.is_tensor(X):
            X = tf.cast(X, tf.float32)
        else:
            X = np.asarray(X, dtype=np.float32)

        # [a, b] x [b, c] -> [a, c]
        # X -> [n_instances, n_features] x [n_features, 1]
        return tf.matmul(X, self.W) + self.b

#### Model instantiation

In [None]:
# Hyperparameters for the gradient-descent run
LEARNING_RATE = 0.00001
N_EPOCHS = 100

# Build the model and fit it on the training split
model = LinearModel()
model.train(X_train, y_train, lr=LEARNING_RATE, epochs=N_EPOCHS)

## Conclusions