In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the listings table from 'listings.csv' (path is relative to the
# notebook's working directory) into the frame every later cell works on.
data = pd.read_csv(filepath_or_buffer='listings.csv')

In [None]:
# Data-quality overview: duplicated rows, missing values, landSize outliers
# and the share of non-null values per column.

# Number of rows that are exact repeats of an earlier row
duplicates = data.duplicated().sum()

# Missing values per column, plus the grand total over all columns
nulls = data.isnull().sum()
totalNulls = nulls.sum()

# landSize outliers by the 1.5 * IQR rule: anything outside the two fences
landSizeColumn = data['landSize']
Q1 = landSizeColumn.quantile(0.25)
Q3 = landSizeColumn.quantile(0.75)
IQR = Q3 - Q1
lowerFence = Q1 - 1.5 * IQR
upperFence = Q3 + 1.5 * IQR
outliers = data[landSizeColumn.lt(lowerFence) | landSizeColumn.gt(upperFence)]

# Percentage of non-null entries in each column
usablePercentage = data.notnull().sum().div(len(data)).mul(100)

# Report: each section is a heading line, the value, then a one-space spacer line
print("Duplicates: ", duplicates, " ", sep="\n")
print("Nulls: ", nulls, sep="\n")
print("totalNulls: ", totalNulls)
print(" ")
print("Outliers", outliers, " ", sep="\n")
print("Usable percentage: ", usablePercentage, sep="\n")

In [None]:
# Build a copy of the listings in which missing values read as 'NA'.
#
# ``fillna`` returns a new object instead of modifying its input, so the
# result has to be kept to have any effect. It is stored in a separate frame
# on purpose: ``data`` itself must keep its real NaNs, because the statistical
# summary relies on ``isnull().sum()`` for the 'NA' counts and on
# ``mean()``/``median()`` over the numeric columns, which would stop working
# once those columns contain the string 'NA'.
naFillColumns = [
    '_ptype', 'bedRooms', 'bathRooms', 'landSize', 'buildingSize',
    'price', 'garages', 'kabkoId', 'sold',
]
dataFilled = data.assign(
    **{column: data[column].fillna('NA') for column in naFillColumns}
)

# Show a small preview rather than the whole table
dataFilled.head()

In [None]:
# Statistical summary
def _countCategories(column, categories):
    """Count how often each expected category occurs in ``column``.

    Parameters
    ----------
    column : pandas.Series
        Column to summarise.
    categories : dict
        Maps the label used in the result to the raw value to look up in
        ``column`` (for example ``{'House': 'house'}``).

    Returns
    -------
    dict
        One entry per label in ``categories`` (0 when the value never occurs,
        instead of raising ``KeyError``), followed by an ``'NA'`` entry holding
        the number of missing values.
    """
    counts = column.value_counts()  # computed once, reused for every lookup
    summary = {label: counts.get(rawValue, 0) for label, rawValue in categories.items()}
    summary['NA'] = column.isnull().sum()
    return summary


def _describeNumeric(column):
    """Return the mean, median, min and max of ``column`` as a dict."""
    return {
        'mean': column.mean(),
        'median': column.median(),
        'min': column.min(),
        'max': column.max(),
    }


# Categorical columns: occurrences per known category plus missing values
propertyTypeSummary = _countCategories(data['_ptype'], {'House': 'house', 'Apartment': 'apartment'})
listingTypeSummary = _countCategories(data['_ltype'], {'Sale': 'sale', 'Rent': 'rent'})
kabkoSummary = _countCategories(data['kabkoId'], {'608': 608, '618': 618})
garagesizeSummary = _countCategories(data['garages'], {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4})
soldSummary = _countCategories(data['sold'], {'sold': 'yes', 'notSold': 'no'})

# Numeric columns: mean / median / min / max
bedroomsSummary = _describeNumeric(data['bedRooms'])
bathroomsSummary = _describeNumeric(data['bathRooms'])
landsizeSummary = _describeNumeric(data['landSize'])
buildingsizeSummary = _describeNumeric(data['buildingSize'])
priceSummary = _describeNumeric(data['price'])

# Each label is printed next to the summary it names
print("propertySummary: ", propertyTypeSummary)
print(" ")
print("listingTypeSummary: ", listingTypeSummary)
print(" ")
print("kabkoSummary: ")
print(kabkoSummary)
print(" ")
print("bedroomsSummary: ", bedroomsSummary)
print(" ")
print("bathroomsSummary: ", bathroomsSummary)
print(" ")
print("landsizeSummary: ", landsizeSummary)
print(" ")
print("buildingsizeSummary: ", buildingsizeSummary)
print(" ")
print("garagesizeSummary: ", garagesizeSummary)
print(" ")
print("priceSummary: ", priceSummary)
print(" ")
print("soldSummary: ", soldSummary)

In [None]:
# Property type: bar chart of the counts stored in propertyTypeSummary
labels = ['House', 'Apartment', 'NA']
values = [propertyTypeSummary[key] for key in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'pink', 'green'])

# Write each count just above its bar, centred horizontally
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.0f}',
            va='bottom', ha='center')

ax.set_title('Summary of Property Type')
ax.set_xlabel('Property Type')
ax.set_ylabel('Amount')
ax.set_ylim(0, max(values) + 50)  # extra headroom above the tallest bar
plt.show()

In [None]:
# Listing type: bar chart of the counts stored in listingTypeSummary
labels = ['Sale', 'Rent', 'NA']
values = [listingTypeSummary[key] for key in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'pink', 'green'])

# Write each count just above its bar, centred horizontally
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.0f}',
            va='bottom', ha='center')

ax.set_title('Summary of Listing Type')
# The x axis shows listing types (sale / rent), so it is labelled accordingly
ax.set_xlabel('Listing Type')
ax.set_ylabel('Amount')
ax.set_ylim(0, max(values) + 50)  # extra headroom above the tallest bar
plt.show()

In [None]:
# kabkoId: bar chart of the counts stored in kabkoSummary
labels = ['608', '618', 'NA']
values = [kabkoSummary[key] for key in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'pink', 'green'])

# Write each count just above its bar, centred horizontally
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.0f}',
            va='bottom', ha='center')

ax.set_title('Summary of kabkoId')
ax.set_xlabel('kabkoId')
ax.set_ylabel('Amount')
ax.set_ylim(0, max(values) + 50)  # extra headroom above the tallest bar
plt.show()

In [None]:
# Bedrooms: bar chart of the mean / median / min / max in bedroomsSummary
labels = ['Mean', 'Median', 'Min', 'Max']
values = [bedroomsSummary[label.lower()] for label in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'orange', 'green', 'pink'])

# Annotate each bar with its value. Whole numbers are shown without decimals;
# fractional values (typically the mean) keep two decimals so the label does
# not round away the difference from the bar's actual height.
for rect in bars:
    height = rect.get_height()
    text = f'{height:.0f}' if float(height).is_integer() else f'{height:.2f}'
    ax.text(rect.get_x() + rect.get_width() / 2, height, text,
            va='bottom', ha='center')

ax.set_title('Summary of Number of Bedrooms')
ax.set_ylabel('Number of Bedrooms')
ax.set_ylim(0, max(values) + 1)  # extra headroom above the tallest bar
plt.show()

In [None]:
# Bathrooms: bar chart of the mean / median / min / max in bathroomsSummary
labels = ['Mean', 'Median', 'Min', 'Max']
values = [bathroomsSummary[label.lower()] for label in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'orange', 'green', 'pink'])

# Annotate each bar with its value. Whole numbers are shown without decimals;
# fractional values (typically the mean) keep two decimals so the label does
# not round away the difference from the bar's actual height.
for rect in bars:
    height = rect.get_height()
    text = f'{height:.0f}' if float(height).is_integer() else f'{height:.2f}'
    ax.text(rect.get_x() + rect.get_width() / 2, height, text,
            va='bottom', ha='center')

ax.set_title('Summary of Number of Bathrooms')
ax.set_ylabel('Number of Bathrooms')
ax.set_ylim(0, max(values) + 1)  # extra headroom above the tallest bar
plt.show()

In [None]:
# Land size: bar chart of the mean / median / min / max in landsizeSummary
labels = ['Mean', 'Median', 'Min', 'Max']
values = [landsizeSummary[label.lower()] for label in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'orange', 'green', 'pink'])

# Write each value (two decimals) just above its bar, centred horizontally
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.2f}',
            va='bottom', ha='center')

ax.set_title('Summary of Land Size')
ax.set_ylabel('Land Size')
ax.set_ylim(0, max(values) + 1100)  # extra headroom above the tallest bar
plt.show()

In [None]:
# Building size: bar chart of the mean / median / min / max in buildingsizeSummary
labels = ['Mean', 'Median', 'Min', 'Max']
values = [buildingsizeSummary[label.lower()] for label in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'orange', 'green', 'pink'])

# Write each value (two decimals) just above its bar, centred horizontally
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.2f}',
            va='bottom', ha='center')

ax.set_title('Summary of Building Size')
ax.set_ylabel('Building Size')
ax.set_ylim(0, max(values) + 500)  # extra headroom above the tallest bar
plt.show()

In [None]:
# Garages: bar chart of the counts stored in garagesizeSummary
labels = ['0', '1', '2', '3', '4', 'NA']
values = [garagesizeSummary[key] for key in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values,
              color=['lightblue', 'orange', 'red', 'pink', 'green', 'darkblue'])

# Write each count just above its bar, centred horizontally
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.0f}',
            va='bottom', ha='center')

ax.set_title('Summary of Garage')
ax.set_xlabel('Total of Garage')
ax.set_ylabel('Count')
ax.set_ylim(0, max(values) + 50)  # extra headroom above the tallest bar
plt.show()

In [None]:
# Price: bar chart of the mean / median / min / max in priceSummary
labels = ['Mean', 'Median', 'Min', 'Max']
values = [priceSummary[label.lower()] for label in labels]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'orange', 'green', 'pink'])

# Plain tick labels on the y axis instead of scientific notation
ax.ticklabel_format(style='plain', axis='y')

# Write each value above its bar as 'Rp' plus a thousands-separated integer
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'Rp{height:,.0f}',
            va='bottom', ha='center')

ax.set_title('Summary of Property Price')
ax.set_ylabel('Price (Rp)')
ax.set_xlabel('Price Statistics')
plt.show()

In [None]:
# Sold status: bar chart of the counts stored in soldSummary
# NOTE(review): the title reads 'Detail Type' and the x-label 'Available'
# although the bars show sold / not sold counts - confirm the intended wording.
labels = ['Sold', 'Not Sold', 'NA']
values = [soldSummary[key] for key in ('sold', 'notSold', 'NA')]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, values, color=['lightblue', 'pink', 'green'])

# Write each count just above its bar, centred horizontally
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, f'{height:.0f}',
            va='bottom', ha='center')

ax.set_title('Summary of Detail Type')
ax.set_xlabel('Available')
ax.set_ylabel('Count')
ax.set_ylim(0, max(values) + 50)  # extra headroom above the tallest bar
plt.show()