In [None]:
# Imports of necessary libraries
import pandas as pd
import numpy as np
import random as rd
import seaborn as sns
from sklearn.cluster import KMeans 
import matplotlib.pyplot as plt

In [None]:
# Read the CSV file 'AirbnbIstanbul.csv' (path is relative to the notebook's
# working directory) into the DataFrame `df` used by every later cell
df = pd.read_csv('AirbnbIstanbul.csv')

In [None]:
# Display the first 5 rows of the DataFrame (head() defaults to 5 rows)
# as a quick sanity check of the loaded data
df.head()

In [None]:
# Print a summary of the DataFrame's columns: names, non-null counts and dtypes
df.info()

In [None]:
# Count the missing values in every column, listing the columns with the
# most missing values first
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Remove the columns that are not used in this analysis, in place
df.drop(columns=["neighbourhood_group", "last_review"], inplace=True)

In [None]:
# Discard, in place, every row whose `name` or `host_name` is missing
df.dropna(axis=0, how="any", subset=["name", "host_name"], inplace=True)

In [None]:
# Fill missing reviews_per_month values with 0.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `df["reviews_per_month"]` selection: that
# chained in-place form operates on an intermediate object, emits a
# FutureWarning on pandas 2.x, and does not modify `df` at all once
# Copy-on-Write is enabled.
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

In [None]:
# Count fully duplicated rows (rows identical to an earlier row in every column);
# the first occurrence of each duplicate group is not counted
df.duplicated(keep="first").sum()

In [None]:
# Preview the first rows again after the column/row cleaning above
df.head()

In [None]:
# Re-check the column summary (non-null counts and dtypes) after cleaning

df.info()

In [None]:
# Descriptive statistics of the price column (count, mean, std, min, quartiles, max)

df['price'].describe()

In [None]:
# Inspect the listings whose price is exactly 0
free_accommodations = df.loc[df["price"] == 0]
free_accommodations.head()

In [None]:
# A price of 0 (or below) is not a plausible listing price and distorts the
# price statistics, so those rows are removed in place before re-checking
# the summary
df.drop(index=df.index[df["price"] <= 0], inplace=True)
df["price"].describe()

In [None]:
# Show the 10 most expensive listings among those priced above 10000
(
    df.loc[df["price"] > 10000]
    .sort_values(by="price", ascending=False)
    .head(10)
)

In [None]:
# Removing price outliers using the IQR method: keep only listings whose
# price lies strictly between Q1 - 1.5*IQR and Q3 + 1.5*IQR.

# Calculate the 25th and 75th percentiles
q25 = df["price"].quantile(0.25)
q75 = df["price"].quantile(0.75)

# Calculate the interquartile range
iqr = q75 - q25

# Define the acceptable range for the price data
lower_bound = q25 - (1.5 * iqr)
upper_bound = q75 + (1.5 * iqr)

# Remove outliers from the price data.
# .copy() makes `df` an independent DataFrame rather than a slice of the
# previous one; later cells mutate it (drop(inplace=True), new columns), which
# would otherwise raise SettingWithCopyWarning.
df = df.loc[(df["price"] > lower_bound) & (df["price"] < upper_bound)].copy()

# Checking the price data
df['price'].describe()


In [None]:
# Count how many listings there are of each room type (most frequent first)

df["room_type"].value_counts()

In [None]:
# Bar chart of the 10 listing names that occur most often in the data
chart1 = df["name"].value_counts()
chart1.iloc[:10].plot.bar(title="Accommodations with most listings")

In [None]:
# Pie chart of the 10 neighbourhoods containing the most listings
chart2 = (
    df.groupby('neighbourhood')
    .size()
    .sort_values(ascending=False)
)
chart2.iloc[:10].plot.pie(title="Neighbourhoods with most accommodations")

In [None]:
# Mean listing price per neighbourhood, most expensive neighbourhood first
chart3 = (
    df.groupby("neighbourhood")["price"]
    .mean()
    .to_frame(name="mean")
    .sort_values(by="mean", ascending=False)
)
chart3.plot.bar(title="Mean Price of Neighbourhoods")

In [None]:
# Busiest listings: the (host_id, name, room_type) combinations with the
# highest number of reviews. Review count is used as a proxy for busyness.
# Note that the grouping is per listing, not per host.

chart4 = df.groupby(['host_id', 'name', 'room_type'])['number_of_reviews'].max().sort_values(ascending=False)
# Give the figure a title and a y label so it stands alone, like the other charts
chart4.head(10).plot(kind="bar", title="Listings with the most reviews").set_ylabel("Number of reviews")

In [None]:
# Scatter every listing by its coordinates, coloured by neighbourhood
plt.figure(figsize=(15, 15))
sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='neighbourhood',
).set_title('Density of Hotels')

In [None]:
# Comparing price to the busyness
#
# Busyness is measured as the mean number of reviews of the listings at each
# price point. The previous .count() only counted how many listings share a
# price, which says nothing about how many reviews they received.

chart5 = df.groupby("price")["number_of_reviews"].mean().reset_index()

fig = plt.figure(figsize = (10, 5))
plt.scatter(chart5["price"], chart5["number_of_reviews"])

# The x axis is the price (it was previously mislabelled "Area")
plt.xlabel("Price")
plt.ylabel("Mean number of reviews")
plt.title("Price vs Busyness")
plt.show()

In [None]:
# Mean availability_365 per neighbourhood, sorted ascending; the bar chart
# shows the 10 neighbourhoods with the lowest mean availability
chart6 = (
    df.groupby("neighbourhood")["availability_365"]
    .mean()
    .sort_values()
    .reset_index()
)
chart6.iloc[:10].plot.bar(
    x="neighbourhood",
    y="availability_365",
    title="Mean availability of neighbourhoods",
)

In [None]:
# Plot the pairwise correlations of the numeric attributes

df_correlation = df[["latitude", "longitude", "price", "minimum_nights", "number_of_reviews", "reviews_per_month", "calculated_host_listings_count", "availability_365"]]

plt.figure(figsize=(8,8))
# Correlations are continuous values in [-1, 1], so use a diverging colormap
# centred on 0 with a fixed scale. The qualitative 'Accent' palette assigns
# unrelated colours to neighbouring values, which makes the heatmap unreadable.
sns.heatmap(df_correlation.corr(), cmap='coolwarm', vmin=-1, vmax=1, center=0, annot=True)
plt.title('Heatmap Showing Correlations of Numbered Attributes')

In [None]:
# Collect the distinct neighbourhoods and room types, then print both arrays
unique_neighbourhoods = df['neighbourhood'].unique()
unique_roomtypes = df['room_type'].unique()

print(unique_neighbourhoods, unique_roomtypes, sep="\n")

In [None]:
# Descriptive statistics of all numeric columns after the price cleaning
df.describe()

In [None]:
# Line plot of minimum_nights across listings (x axis is the DataFrame index),
# used to spot extreme values
df['minimum_nights'].plot().set(
    title='Minimum Nights',
    xlabel='Listing',
    ylabel='Minimum Nights',
)
plt.show()

In [None]:
# Drop, in place, the listings that require a minimum stay of more than 365 nights
df.drop(index=df.index[df["minimum_nights"] > 365], inplace=True)

In [None]:
# Re-plot minimum_nights after removing the outliers.
# The figure must be created BEFORE plotting: calling plt.figure() after
# Series.plot() opens a new, empty figure, so the title and axis labels would
# end up on a blank canvas instead of on the plot.
plt.figure(figsize=(8,8))
df['minimum_nights'].plot()
plt.title('Minimum Nights')
plt.xlabel('Listing')
plt.ylabel('Minimum Nights')
plt.show()

In [None]:
# Drop, in place, the listings with availability_365 equal to 0 (never
# available), then re-check the summary statistics
df.drop(index=df.index[df["availability_365"] == 0], inplace=True)

df.describe()

In [None]:
# Add a normalized price column: price divided by minimum_nights
# NOTE(review): this is per-night only if `price` covers the whole minimum stay;
# confirm against the dataset definition
df['price_normalized'] = df['price'].div(df['minimum_nights'])

df.head(20)

In [None]:
# Cluster listings by the availability of the property, to understand the busiest and
# slowest times of the year and how this impacts the availability of listings

In [None]:
# Select the two feature columns used for clustering: price and availability_365
x = df.loc[:, ['price', 'availability_365']]

In [None]:
# Elbow method: fit K-Means for k = 2..10 and record the within-cluster sum
# of squares (inertia_) for each k.
k_values = [2, 3, 4, 5, 6, 7, 8, 9, 10]
wcss_values = []

for k in k_values:
    # A fixed random_state makes the centroid initialisation, and therefore the
    # WCSS curve, reproducible across kernel restarts.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(x)
    wcss = kmeans.inertia_
    wcss_values.append(wcss)

In [None]:
# Show the tested k values alongside their WCSS.
# The previous `wcss = kmeans.inertia_` was removed: it only repeated the last
# loop iteration's assignment, and it raised AttributeError whenever this cell
# was re-run after `kmeans` had been rebound to a not-yet-fitted model.
print(k_values)
print(wcss_values)

In [None]:
# Plot WCSS against the number of clusters to locate the "elbow"
plt.plot(k_values, wcss_values)
plt.gca().set(
    title='Elbow Method',
    xlabel='Number of Clusters',
    ylabel='WCSS',
)
plt.show()

In [None]:
# Create a K-Means clustering model with the
# number of clusters set to 4 based on the elbow method.
# random_state is fixed so that the cluster assignments (and the numeric
# cluster labels used by the later cells) are the same on every run.

kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

In [None]:
# Fit the K-Means model to the feature frame `x` (price and availability_365)

kmeans.fit(x)

In [None]:
# Predict the cluster label of each listing, using the same feature frame `x`
# the model was fitted on

clusters = kmeans.predict(x)

In [None]:
# Add each record's cluster label to the dataset as a new 'cluster' column

df['cluster'] = clusters

In [None]:
# Average availability_365 of the listings in each cluster
avg_availability_by_cluster = (
    df['availability_365']
    .groupby(df['cluster'])
    .mean()
)

In [None]:
# Bar chart of the average availability of each cluster
avg_availability_by_cluster.plot.bar().set(
    title='Average Availability by Cluster',
    xlabel='Cluster',
    ylabel='Availability (days per year)',
)
plt.show()

In [None]:
# Pick the cluster labels with the highest and the lowest average availability.
# NOTE: despite the variable names, these are NOT the clusters with the most /
# fewest listings: idxmax()/idxmin() are applied to avg_availability_by_cluster,
# which holds the mean availability_365 per cluster.
most_listings_cluster = avg_availability_by_cluster.idxmax()
least_listings_cluster = avg_availability_by_cluster.idxmin()

In [None]:
# Mean price of the listings in `most_listings_cluster`, i.e. the cluster with
# the highest average availability (idxmax of avg_availability_by_cluster)
df[df.cluster == most_listings_cluster]["price"].mean()

In [None]:
# Mean price of the listings in `least_listings_cluster`, i.e. the cluster with
# the lowest average availability (idxmin of avg_availability_by_cluster)
df[df.cluster == least_listings_cluster]["price"].mean()