## Assignment 3 Airbnb

In [1]:
# Import gurobi and numpy
from gurobipy import *
import pandas as pd
import numpy as np
import csv

In [2]:
# Load training dataset and testing dataset
df_train = pd.read_csv('/Users/zhengyaojin/Desktop/AirbnbTrain.csv')
df_test = pd.read_csv('/Users/zhengyaojin/Desktop/AirbnbTest.csv')

In [12]:
df_train.head()

Unnamed: 0,latitude,longitude,Entire home,accommodates,bathrooms,bedrooms,beds,cleaning_fee,minimum_nights,number_of_reviews,review_scores_rating,instant_bookable,price
0,34.103701,-118.332241,1,13,2.0,3,2,150,2,1,100,1,350
1,34.099484,-118.331645,1,8,2.0,2,4,150,1,11,96,1,190
2,34.104321,-118.329662,1,4,1.0,0,1,55,1,1,80,0,85
3,34.101028,-118.317848,0,2,1.0,1,1,20,1,8,98,0,75
4,34.098292,-118.32498,1,2,1.0,1,1,20,1,11,96,0,130


In [5]:
# Model 1
mod = Model()

# Exclude the price column from the training data
train_data = df_train.drop(columns=['price'])

# Define number of datapoints and number of features
n = train_data.shape[0]
m = train_data.shape[1]

# Define target variable y, the price of Airbnb listing
y = df_train['price']

# Define decision variable, the coefficients and deviation
c = mod.addVars(m)

d = mod.addVars(n)

# Construct constraints, deviation
for i in range(n):
    value = sum(train_data.iloc[i, j] * c[j] for j in range(m))
    mod.addConstr((y[i] - value) <= d[i])
    mod.addConstr(-(y[i] - value) <= d[i])

# Define objective function 
mod.setObjective(sum(d[i] for i in range(n)), GRB.MINIMIZE)


mod.update()
mod.optimize()

# Get optimal coefficients
if mod.status == GRB.OPTIMAL:
    coefficients = [c[i].x for i in range(m)]



# Exclude the price column from the testing data and extract it for comparison
test_data = df_test.drop(columns=['price'])
y_test = df_test['price']
n_test = test_data.shape[0]
m_test = test_data.shape[1]


# Apply the model on test set and print out the prediction error
predict_price = [sum(test_data.iloc[i, j] * coefficients[j] for j in range(m_test)) for i in range(n_test)]
predict_error = [abs(y_test[i] - predict_price[i]) for i in range(n_test)]
mean_predict_error = sum(predict_error) / n_test

print(f"Mean Prediction Error on Test Set:$ {mean_predict_error:.4f} /night")



Gurobi Optimizer version 11.0.0 build v11.0.0rc2 (mac64[arm] - Darwin 22.6.0 22G74)

CPU model: Apple M2 Pro
Thread count: 12 physical cores, 12 logical processors, using up to 12 threads

Optimize a model with 3400 rows, 1712 columns and 41372 nonzeros
Model fingerprint: 0x858af64d
Coefficient statistics:
  Matrix range     [5e-01, 5e+02]
  Objective range  [1e+00, 1e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+01, 2e+03]
Presolve time: 0.01s
Presolved: 3400 rows, 1712 columns, 41372 nonzeros

Concurrent LP optimizer: primal simplex, dual simplex, and barrier
Showing barrier log only...

Ordering time: 0.00s

Barrier statistics:
 Dense cols : 12
 AA' NZ     : 2.995e+04
 Factor NZ  : 3.260e+04 (roughly 2 MB of memory)
 Factor Ops : 4.141e+05 (less than 1 second per iteration)
 Threads    : 1

                  Objective                Residual
Iter       Primal          Dual         Primal    Dual     Compl     Time
   0   3.62399801e+06  0.00000000e+00  1.36e+03 0.00e

In [11]:
# Model 2

mod_2 = Model()

# Exclude the price column from the training data 
train_data = df_train.drop(columns=['price'])

# Extract column names
column_names = train_data.columns

# Define number of datapoints and number of features
n = train_data.shape[0]
m = train_data.shape[1]

# Define target variable y, the price of Airbnb listing
y = df_train['price']

# Define decision variable, the coefficients, deviation, and whether the coefficient is selected
c = mod_2.addVars(m)

d = mod_2.addVars(n)

s = mod_2.addVars(m, vtype = GRB.BINARY)

# Construct constraints, deviation
for i in range(n):
    value = sum(train_data.iloc[i, j] * c[j] for j in range(m))
    mod_2.addConstr((y[i] - value) <= d[i])
    mod_2.addConstr(-(y[i] - value) <= d[i])

# Select at most three coefficients
mod_2.addConstr(sum(s[i] for i in range(m)) <= 3)

# Coefficients selected must be greater than 0 
for j in range(m):
    mod_2.addConstr((s[j] == 1) >> (c[j] <= 1e4))
    mod_2.addConstr((s[j] == 1) >> (c[j] >= -1e4))
    mod_2.addConstr((s[j] == 0) >> (c[j] == 0))


# Define objective function 
mod_2.setObjective(sum(d[i] for i in range(n)), GRB.MINIMIZE)


mod_2.update()
mod_2.optimize()

# Get optimal coefficients
if mod_2.status == GRB.OPTIMAL:
    coefficients = [c[i].x for i in range(m)]



# Exclude the price column from the testing data and extract it for comparison
test_data = df_test.drop(columns=['price'])
y_test = df_test['price']
n_test = test_data.shape[0]
m_test = test_data.shape[1]


# Apply the model on test set and print out the prediction error
predict_price = [sum(test_data.iloc[i, j] * coefficients[j] for j in range(m_test)) for i in range(n_test)]
predict_error = [abs(y_test[i] - predict_price[i]) for i in range(n_test)]
mean_predict_error = sum(predict_error) / n_test


# Get the name of the selected coefficients name and their corresponding value
for i in range(m):
    if(c[i].x > 0):
        coefficient_name = column_names[i]
        coefficient_value = c[i].x
        print(f'{coefficient_name}: {coefficient_value:.4f}')
    
    
# Print mean prediction error
print(f"Mean Prediction Error on Test Set:$ {mean_predict_error:.4f} /night")

Gurobi Optimizer version 11.0.0 build v11.0.0rc2 (mac64[arm] - Darwin 22.6.0 22G74)

CPU model: Apple M2 Pro
Thread count: 12 physical cores, 12 logical processors, using up to 12 threads

Optimize a model with 3401 rows, 1724 columns and 41384 nonzeros
Model fingerprint: 0xe40f858b
Model has 36 general constraints
Variable types: 1712 continuous, 12 integer (12 binary)
Coefficient statistics:
  Matrix range     [5e-01, 5e+02]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [3e+00, 2e+03]
  GenCon rhs range [1e+04, 1e+04]
  GenCon coe range [1e+00, 1e+00]
Found heuristic solution: objective 3.400000e+12
Presolve removed 816 rows and 414 columns
Presolve time: 0.03s
Presolved: 2585 rows, 1310 columns, 31274 nonzeros
Found heuristic solution: objective 246446.00000
Variable types: 1298 continuous, 12 integer (12 binary)

Root relaxation: objective 6.192462e+04, 1411 iterations, 0.10 seconds (0.32 work units)

    Nodes    |    Current Node    |     

In [15]:
# Model 3

mod_3 = Model()

# Exclude the price column and bed index from the training data 
train_data = df_train.drop(columns=['price'])
bed_index = train_data.columns.get_loc("beds")

# Extract column names
column_names = train_data.columns

# Define number of datapoints and number of features
n = train_data.shape[0]
m = train_data.shape[1]

# Define target variable y, the price of Airbnb listing
y = df_train['price']

# Define decision variable, the coefficients, deviation, and whether the coefficient is selected
c = mod_3.addVars(m)

d = mod_3.addVars(n)

s = mod_3.addVars(m, vtype = GRB.BINARY)

# Construct constraints, deviation
for i in range(n):
    value = sum(train_data.iloc[i, j] * c[j] for j in range(m))
    mod_3.addConstr((y[i] - value) <= d[i])
    mod_3.addConstr(-(y[i] - value) <= d[i])

# Select at most three coefficients
mod_3.addConstr(sum(s[i] for i in range(m)) <= 3)

# Must select bed as a coefficient
mod_3.addConstr(s[bed_index] == 1)

# Coefficients selected must be greater than 0 
for j in range(m):
    mod_3.addConstr((s[j] == 1) >> (c[j] <= 1e4))
    mod_3.addConstr((s[j] == 1) >> (c[j] >= -1e4))
    mod_3.addConstr((s[j] == 0) >> (c[j] == 0))


# Define objective function 
mod_3.setObjective(sum(d[i] for i in range(n)), GRB.MINIMIZE)


mod_3.update()
mod_3.optimize()

# Get optimal coefficients
if mod_3.status == GRB.OPTIMAL:
    coefficients = [c[i].x for i in range(m)]



# Exclude the price column from the testing data and extract it for comparison
test_data = df_test.drop(columns=['price'])
y_test = df_test['price']
n_test = test_data.shape[0]
m_test = test_data.shape[1]


# Apply the model on test set and print out the prediction error
predict_price = [sum(test_data.iloc[i, j] * coefficients[j] for j in range(m_test)) for i in range(n_test)]
predict_error = [abs(y_test[i] - predict_price[i]) for i in range(n_test)]
mean_predict_error = sum(predict_error) / n_test


# Get the name of the selected coefficients name and their corresponding value
for i in range(m):
    if(c[i].x > 0):
        coefficient_name = column_names[i]
        coefficient_value = c[i].x
        print(f'{coefficient_name}: {coefficient_value:.4f}')
    
    
# Print mean prediction error
print(f"Mean Prediction Error on Test Set:$ {mean_predict_error:.4f} /night")

Gurobi Optimizer version 11.0.0 build v11.0.0rc2 (mac64[arm] - Darwin 22.6.0 22G74)

CPU model: Apple M2 Pro
Thread count: 12 physical cores, 12 logical processors, using up to 12 threads

Optimize a model with 3402 rows, 1724 columns and 41385 nonzeros
Model fingerprint: 0x158561b6
Model has 36 general constraints
Variable types: 1712 continuous, 12 integer (12 binary)
Coefficient statistics:
  Matrix range     [5e-01, 5e+02]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 2e+03]
  GenCon rhs range [1e+04, 1e+04]
  GenCon coe range [1e+00, 1e+00]
Presolve removed 818 rows and 415 columns
Presolve time: 0.03s
Presolved: 2584 rows, 1309 columns, 31271 nonzeros
Variable types: 1298 continuous, 11 integer (11 binary)
Found heuristic solution: objective 82615.795918

Root relaxation: objective 6.192462e+04, 1411 iterations, 0.10 seconds (0.32 work units)

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  