In [None]:
#https://public.opendatasoft.com/explore/dataset/airbnb-listings/export/?disjunctive.host_verifications&disjunctive.amenities&disjunctive.features&q=sydney&refine.country=Australia&refine.city=Sydney&location=10,-33.85368,151.12545&basemap=jawg.light
print('initialising')
#Displays "initialising" when collecting all the data
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#Importing modules
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
#Imports classes from libraries



In [None]:
# Load the raw Sydney Airbnb listings export; the file is decoded as latin-1.
df = pd.read_csv(
    "assets/sydneyAirBnB.csv",
    encoding='latin-1',
)

# Handy while exploring the raw frame (uncomment as needed):
#   len(df.columns)  -> number of columns
#   df.dtypes        -> dtype of every column
#   df.info()        -> dtype and non-null count of every column

Drop the columns that are less relevant to the expected outcome (price), then drop rows with missing values

In [None]:
# Columns removed before modelling: identifiers, URLs, free text, host
# details, location strings and calendar/availability bookkeeping.
DROPPED_COLUMNS = [
    # location / geography
    "Neighbourhood", "Smart Location", "Country", "Country Code", "Market",
    "Zipcode", "State", "Neighbourhood Cleansed",
    "Neighbourhood Group Cleansed", "City", "Street", "Geolocation",
    # calendar, availability and alternative price columns
    "Square Feet", "Calendar Updated", "Calendar last Scraped",
    "Availability 30", "Availability 60", "Availability 90",
    "Weekly Price", "Monthly Price", "Has Availability",
    "First Review", "Last Review",
    # policy / legal
    "Cancellation Policy", "Jurisdiction Names", "License",
    # listing identifiers, URLs and free text
    'ID', 'Listing Url', 'Scrape ID', 'Last Scraped', 'Name', 'Summary',
    'Space', 'Description', 'Experiences Offered', "Neighborhood Overview",
    "Notes", "Transit", "Access", "Interaction", "House Rules",
    "Thumbnail Url", "Medium Url", "Picture Url", "XL Picture Url",
    "Features",
    # host details
    "Host Acceptance Rate", "Host Verifications", "Host Response Time",
    "Host Thumbnail Url", "Host Picture Url", "Host Neighbourhood",
    "Host ID", "Host URL", "Host Name", "Host Since", "Host Location",
    "Host About",
]

# Remove the columns above, then discard every row that still has a missing value.
reduced = df.drop(columns=DROPPED_COLUMNS).dropna()

# Summarise the remaining columns (dtype and non-null count).
reduced.info()

Pearson Correlation of Features to see what has higher influence on price

In [None]:
# Pearson correlation heatmap of the numeric features in `reduced`.
#
# `reduced` still contains text columns at this point. On pandas >= 2.0 calling
# `.corr()` on such a frame raises, because `numeric_only` now defaults to
# False. Selecting numeric/bool columns first gives what older pandas computed
# implicitly and works on every pandas version.
numeric_features = reduced.select_dtypes(include=["number", "bool"])

fig, ax = plt.subplots(figsize=(14, 12))
ax.set_title('Pearson Correlation of Features')
sns.heatmap(
    numeric_features.corr(),
    vmin=-1, vmax=1,  # fix the diverging palette so white means "no correlation"
    linewidths=1,
    square=True,
    cmap='RdBu',
    linecolor='white',
    annot=True,
    ax=ax,
)
plt.show()

Dummifying (one-hot encoding) some key categorical features, such as "Property Type"

In [None]:
# One-hot encode the categorical columns of `reduced`.
#
# "Amenities" is left out on purpose: it is a free-text list, so encoding it
# directly would create one column per distinct amenities string. It is turned
# into per-amenity indicator columns by keyword extraction in a later cell,
# which reads it straight from `reduced`.
#
# `dtype="int64"` makes every generated dummy column an integer 0/1 column.
# This avoids a hard-coded list of column names, which would raise a KeyError
# whenever one of those categories is absent after `dropna()`.
dummified = pd.get_dummies(reduced.drop(columns=["Amenities"]), dtype="int64")

# Defensive: there should be no gaps left after the earlier dropna().
dummified = dummified.fillna(0)

# Price is modelled as a whole number.
dummified = dummified.astype({"Price": "int64"})

The following is a different way of dummifying: keywords are extracted from the "Amenities" column and used to create indicator (dummy) columns.

1. Extract the keywords
2. Turn them into a frequency table
3. Sort in descending order, so the top X most frequent amenities can be found
4. Create new columns: if a listing has a given amenity the value is 1, otherwise 0

In [None]:
# Number of most frequent amenities that get their own indicator column.
TOP_AMENITY_COUNT = 20


def _parse_amenities(raw):
    """Split one raw "Amenities" cell into a set of clean amenity names.

    Braces and double quotes are removed, the text is split on commas, and
    surrounding whitespace and empty tokens are discarded. This handles both
    `{TV,"Cable TV"}` and plain `TV,Cable TV` cells.
    """
    cleaned = raw.replace('{', '').replace('}', '').replace('"', '')
    return {token.strip() for token in cleaned.split(',') if token.strip()}


# Parse each listing separately, so amenities from neighbouring rows can never
# be fused together and matching can be exact rather than by substring.
amenity_sets = reduced["Amenities"].map(_parse_amenities)

# Frequency table: in how many listings does each amenity appear?
counter = Counter()
for listing_amenities in amenity_sets:
    counter.update(listing_amenities)

# most_common() already returns descending order and simply returns fewer rows
# when there are fewer than TOP_AMENITY_COUNT distinct amenities.
freq_table = pd.DataFrame(
    counter.most_common(TOP_AMENITY_COUNT), columns=["Amenities", "freq"]
)

# One 0/1 column per top amenity. Exact set membership is used instead of
# `str.contains`, which is a regex substring search: "Internet" would also
# flag "Wireless Internet", and names with "(" or ")" would be misread.
for amenity in freq_table["Amenities"]:
    has_amenity = amenity_sets.map(lambda listing, name=amenity: name in listing)
    dummified["amenities: " + amenity] = has_amenity.astype("int64")

# Defensive: rows of `dummified` without a matching row in `reduced` get 0.
dummified = dummified.fillna(0)

dummified.dtypes

Final data inspection - the data is exported to `data.csv` by the first line below (`dummified.to_csv`); comment that line out to skip the export

In [None]:
# Write the prepared feature table to disk without the row index,
# then show the dtype of every column as the cell output.
export_path = 'data.csv'
dummified.to_csv(export_path, index=False)

dummified.dtypes

This is the heart of this analysis - machine learning.

The following uses the simplest machine learning - Linear Regression.

"Accuracy" here is the $R^2$ value (coefficient of determination) on the test set -> the closer to 1, the better.

The plot is the predicted value vs actual value.

In [None]:
# Fit a linear regression that predicts "Price" from every other column of
# `dummified`, report R^2 on the held-out 20 %, and plot predicted vs. true
# values for the train and test splits.

# NOTE(review): the scaler is created but never applied to the features below.
scaler = MinMaxScaler()
targetVar = "Price"

x = dummified.drop(targetVar, axis=1)
y = dummified[targetVar]

# Fixed random_state keeps the split (and so the reported score) reproducible.
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=42,test_size=0.2)

model = LinearRegression()
model.fit(x_train, y_train)

# Evaluate the model on the test data; for a regressor, score() is R^2.
accuracy = model.score(x_test, y_test)

print("accuracy: ")
print(accuracy)

# Predict Response corresponding to Predictors
trainPredictionLR = model.predict(x_train)
testPredictionLR = model.predict(x_test)

# Plot the Predictions vs the True values; the green line marks a perfect
# prediction (predicted == true).
f, axes = plt.subplots(1, 2, figsize=(24, 12))
panels = [
    (axes[0], y_train, trainPredictionLR, "blue", "Train"),
    (axes[1], y_test, testPredictionLR, "red", "Test"),
]
for ax, truth, predicted, colour, split_name in panels:
    ax.scatter(truth, predicted, color=colour)
    ax.plot(truth, truth, 'g-', linewidth=1)
    ax.set_title(f"Predicted vs true {targetVar} ({split_name})")
    ax.set_xlabel(f"True values of the Response Variable ({split_name})")
    ax.set_ylabel(f"Predicted values of the Response Variable ({split_name})")

plt.show()



Can you improve from this?

- inspect initial columns omitted - could some of them have additional value?
- Following Pearson Correlation - are there some minor factors that can be brought back?

- We only considered top 20 amenities. What are some other columns that can be further dummified?