# Neural Network Classifiers

In [2]:
import pandas as pd

# Read both CSV files, ordering each one by its 'id' column
train = pd.read_csv("../data/train.csv").sort_values('id')
test_df = pd.read_csv("../data/test.csv").sort_values('id')

# Split the training frame into the target values and the remaining columns
train_prices = train['price'].values
train_df = train.drop(columns='price')

In [4]:
from data_processing import preprocessing

# Stack the test rows on top of the train rows so that both sets go through a
# single preprocessing pass (important for one-hot encoding)
combined_df = pd.concat([test_df, train_df], ignore_index=True)

# Columns that are left out of the analysis
to_remove = [
    'id', 'scrape_id', 'last_scraped', 'name', 'description',
    'picture_url', 'host_id', 'host_name', 'calendar_last_scraped',
    'bathrooms_text',
]

# Start from every column and strike the excluded ones.
# list.remove raises ValueError if a name is not present in the frame.
selected_features = combined_df.columns.tolist()
for col in to_remove:
    selected_features.remove(col)

# Hand preprocessing a copy so combined_df itself is not the object passed in
processed_data = preprocessing(combined_df.copy(), selected_features)

# Undo the stacking: the first len(test_df) rows are test, the rest are train.
# NOTE(review): assumes preprocessing keeps row order and row count — confirm.
n_test_rows = len(test_df)
test_processed = processed_data.iloc[:n_test_rows]
train_processed = processed_data.iloc[n_test_rows:]

### Limited features testing

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Select 7 prominent features to train on
selected_features = ['latitude', 'longitude', 'accommodates',
                     'amenities', 'availability_365',
                     'number_of_reviews_ltm', 'host_since']

# Number of price classes; shared by the label encoding and the output layer
# so the two cannot drift apart.
n_classes = 6

X_train = train_processed[selected_features].copy()
# Convert the price labels to one-hot rows of width n_classes
y_train = tf.keras.utils.to_categorical(train_prices, num_classes=n_classes)

# Build the neural network model.
# The input width is read from X_train instead of being hard-coded, so editing
# selected_features cannot leave the first layer with a mismatched input size.
model = Sequential()
model.add(Dense(10, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(10, activation='sigmoid'))
model.add(Dense(n_classes, activation='softmax'))  # Output layer with softmax activation for classification

# Compile the model
model.compile(loss='categorical_crossentropy',  # Use categorical crossentropy for multi-class classification
              optimizer='adam',
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

2023-12-05 18:45:16.298378: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10


2023-12-05 18:45:37.923716: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff3650d57c0>

### Larger model

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Select 15 features to train on (the 7 used in the limited model plus 8 more)
selected_features = ['latitude', 'longitude', 'accommodates',
                     'amenities', 'availability_365',
                     'number_of_reviews_ltm', 'host_since', 'host_listings_count',
                     'host_is_superhost', 'host_identity_verified',
                     'neighbourhood_cleansed', 'room_type', 'has_availability',
                     'availability_60', 'calculated_host_listings_count']

# Number of price classes; shared by the label encoding and the output layer
# so the two cannot drift apart.
n_classes = 6

X_train = train_processed[selected_features].copy()
# Convert output data to one-hot encoding
y_train = tf.keras.utils.to_categorical(train_prices, num_classes=n_classes)

# Build the neural network model.
# The input width is read from X_train instead of being hard-coded, so editing
# selected_features cannot leave the first layer with a mismatched input size.
model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='sigmoid'))
model.add(Dense(16, activation='sigmoid'))
model.add(Dense(12, activation='sigmoid'))
model.add(Dense(8, activation='sigmoid'))
model.add(Dense(n_classes, activation='softmax'))  # Output layer with softmax activation for classification

# Compile the model
model.compile(loss='categorical_crossentropy',  # Use categorical crossentropy for multi-class classification
              optimizer='adam',
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=15, batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fdbe45a8ee0>

### Attempts to include images and descriptions

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, concatenate, Embedding, Flatten
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input, decode_predictions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import requests
from io import BytesIO
from PIL import Image
import tqdm

# Function to load and preprocess images
def load_and_preprocess_images(ids, max_samples=None,
                               image_dir='../data/test_images',
                               target_size=(224, 224)):
    """Load ``<image_dir>/<id>.jpg`` for each id and prepare it for MobileNetV2.

    Each image is resized to ``target_size``, converted to an array and passed
    through ``preprocess_input``. Images that cannot be opened or processed,
    or whose array is not 3-channel at the target size, are skipped.

    Args:
        ids: Sliceable collection of ids; only ``ids[:max_samples]`` is read.
        max_samples: Optional cap on how many ids are attempted (None = all).
        image_dir: Directory holding the ``<id>.jpg`` files. Defaults to the
            path that was previously hard-coded.
        target_size: ``(width, height)`` handed to ``Image.resize``.

    Returns:
        A tuple ``(image_data, inclusion_mask)``. ``image_data`` is an array
        stacking the accepted images. ``inclusion_mask`` is a list with one
        bool per attempted id, True where the image was accepted, so callers
        can filter parallel data (descriptions, prices) to the same rows.
    """
    # PIL sizes are (width, height); array shapes are (height, width, channels)
    expected_shape = (target_size[1], target_size[0], 3)
    image_data = []
    inclusion_mask = []

    for listing_id in tqdm.tqdm(ids[:max_samples], desc="Loading Images", unit="image"):
        try:
            # The context manager releases the file handle once the resized
            # copy has been produced.
            with Image.open(f'{image_dir}/{listing_id}.jpg') as img:
                resized = img.resize(target_size)
            img_array = preprocess_input(image.img_to_array(resized))
        except Exception:
            # Deliberate best effort: a missing or unreadable image is
            # recorded as excluded rather than aborting the whole load.
            inclusion_mask.append(False)
            continue

        # Ensure the image array has the correct shape (e.g. non-RGB images
        # do not produce 3 channels and are excluded)
        is_usable = img_array.shape == expected_shape
        if is_usable:
            image_data.append(img_array)
        inclusion_mask.append(is_usable)

    if not image_data:
        # np.array([]) would have shape (0,); return an empty 4-D batch so the
        # result has the same rank as a non-empty one.
        # NOTE(review): float32 assumed to match img_to_array's default — confirm.
        return np.empty((0,) + expected_shape, dtype='float32'), inclusion_mask

    return np.array(image_data), inclusion_mask

# Cap on how many ids are attempted when loading images
max_samples = 1000

# NOTE(review): the ids come from the training frame, while the loader reads
# from a directory named "test_images" — confirm this is the intended folder.
image_data, inclusion_mask = load_and_preprocess_images(
    ids=train_df['id'],
    max_samples=max_samples,
)

Loading Images: 100%|██████████| 1000/1000 [02:36<00:00,  6.38image/s]


In [16]:
# Function to tokenize and pad sequences for descriptions
def preprocess_descriptions(descriptions, max_samples=None, maxlen=None):
    """Tokenize text descriptions and pad them to a common length.

    Args:
        descriptions: Sliceable collection of descriptions. Every item is
            passed through ``str()``, so a missing value such as NaN becomes
            the literal text ``'nan'``.
        max_samples: Optional cap; only ``descriptions[:max_samples]`` is used.
        maxlen: Optional fixed sequence length forwarded to ``pad_sequences``.
            None (the default) keeps the previous behaviour of padding to the
            longest sequence in this call. Pass a fixed value when another
            set of texts must be padded to the same width.

    Returns:
        A tuple ``(padded_sequences, tokenizer)``: the padded integer
        sequences and the ``Tokenizer`` fitted on these descriptions.
    """
    # Truncate exactly once. The earlier code re-applied [:max_samples] to the
    # already truncated list, which was redundant and, for a negative
    # max_samples, dropped extra items.
    texts = [str(desc) for desc in descriptions[:max_samples]]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=maxlen)
    return padded_sequences, tokenizer

# Keep only the rows whose image was accepted: take the same leading
# max_samples entries the image loader saw, then apply its inclusion mask.
kept_descriptions = train_df['description'][:max_samples][inclusion_mask]
description_data, tokenizer = preprocess_descriptions(kept_descriptions)

# Apply the identical slice and mask to the prices so they stay row-aligned
kept_prices = train_prices[:max_samples][inclusion_mask]
price_data = np.array(kept_prices)

In [17]:
# Split the data into training and testing sets (same shuffle for all three
# arrays, so images, descriptions and prices stay row-aligned)
image_train, image_test, desc_train, desc_test, price_train, price_test = train_test_split(
    image_data, description_data, price_data, test_size=0.2, random_state=42
)

# Load pre-trained MobileNetV2 model for image processing
image_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of the pre-trained image model
for layer in image_model.layers:
    layer.trainable = False

# Create the text processing branch
sequence_length = description_data.shape[1]
vocabulary_size = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding
desc_input = Input(shape=(sequence_length,))
desc_embedding = Embedding(input_dim=vocabulary_size, output_dim=100, input_length=sequence_length)(desc_input)
desc_flatten = Flatten()(desc_embedding)

# Flatten the output of the MobileNetV2 layer
image_flatten = Flatten()(image_model.output)

# Combine the flattened image and text features
combined = concatenate([image_flatten, desc_flatten])

# Add additional layers for the final prediction
x = Dense(128, activation='relu')(combined)
# Single linear unit: the price is predicted as one continuous value
output = Dense(1, activation='linear')(x)

# Create the final model
model = Model(inputs=[image_model.input, desc_input], outputs=output)

# Compile the model.
# 'accuracy' was dropped from the metrics: with a linear output trained on
# mean squared error the reported accuracy is not a meaningful score, so only
# MAE is tracked alongside the loss.
# NOTE(review): other cells treat price as 6 classes; if classification is
# intended here too, the head and loss need to change — confirm.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model. train_test_split already returns arrays for array inputs,
# so the splits are passed through without extra np.asarray wrappers.
model.fit(
    [image_train, desc_train],
    price_train,
    epochs=10,
    batch_size=32,
    validation_data=([image_test, desc_test], price_test),
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdb6aaa47c0>