In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Loading CSV Files

In [3]:
# Read the apartment listings table from disk and preview its first ten rows.
apartment_data_path = "data/apartments.csv"
apartment_df = pd.read_csv(filepath_or_buffer=apartment_data_path)
apartment_df.head(n=10)

Unnamed: 0,id,title,source,price,currency,listing_created_on,is_active,last_modified_timestamp
0,1,"Johnson, Fowler and Johnson",Realtor,4020.95,EUR,27/03/2024,True,27/12/2021
1,2,Kelley-Johnson,Craigslist,1390.01,INR,23/07/2023,True,10/02/2023
2,3,"Gibson, Delgado and Austin",Craigslist,3082.67,INR,15/04/2024,True,02/01/2020
3,4,"Kelly, Smith and Gibson",Zillow,4724.2,USD,27/10/2020,False,23/06/2023
4,5,Carroll-Burch,Airbnb,3867.11,USD,01/02/2021,False,25/07/2022
5,6,James LLC,Zillow,2850.27,USD,20/03/2025,True,16/02/2021
6,7,"Crane, Rodriguez and Charles",Airbnb,4900.15,USD,30/01/2020,True,06/01/2021
7,8,Stewart-Abbott,Realtor,3085.36,USD,28/05/2023,False,21/08/2023
8,9,"Dean, Johnson and Stanley",Zillow,1778.69,USD,01/12/2023,False,07/04/2023
9,10,Riley Inc,Craigslist,2794.78,EUR,14/11/2021,True,20/03/2025


In [4]:
# Read the per-apartment attributes table and preview its first five rows
# (`head()` defaults to five rows).
apartment_attributes_path = "data/apartment_attributes.csv"
apartment_attr_df = pd.read_csv(filepath_or_buffer=apartment_attributes_path)
apartment_attr_df.head()

Unnamed: 0,id,category,body,amenities,bathrooms,bedrooms,fee,has_photo,pets_allowed,price_display,price_type,square_feet,address,cityname,state,latitude,longitude
0,1,2BHK,Happy product model process necessary. Only fo...,"Balcony, Air Conditioning",1,1,168.84,True,False,$1588.42,Monthly,1463,"92525 Holt Turnpike Lake Keith, KY 50153",New York,California,20.457092,0.46622
1,2,Penthouse,Technology past much. Shoulder collection appr...,"Garden, Pet-friendly, Balcony",2,2,202.99,False,True,$3585.32,Yearly,722,"89610 Chang Lane Apt. 295 New Ericaland, NC 28192",Austin,Texas,38.182993,-129.769256
2,3,3BHK,Firm agreement shake design sort. Size source ...,Gym,2,1,488.93,False,False,$1961.69,Monthly,2208,"134 Bryan Island Leefort, VT 48142",Chicago,Illinois,61.585391,-92.597924
3,4,1BHK,Next may hear camera. Heart dinner onto increa...,Garden,3,3,376.1,True,False,$4845.75,Yearly,589,"13660 Amanda Isle East Michaelchester, CT 62513",San Antonio,Texas,-59.01545,6.296552
4,5,1BHK,Space necessary each statement sport early. Fi...,Pet-friendly,2,2,465.79,False,True,$2599.65,One-time,1590,"7915 Laurie Manor Port John, MN 73309",Los Angeles,California,-48.009442,-33.172952


In [5]:
# Read the user viewing events table and preview its first five rows
# (`head()` defaults to five rows).
user_viewings_path = "data/user_viewing.csv"
user_viewings_df = pd.read_csv(filepath_or_buffer=user_viewings_path)
user_viewings_df.head()

Unnamed: 0,user_id,apartment_id,viewed_at,is_wishlisted,call_to_action
0,5353,180678,23/01/2023,False,Reported
1,8341,12140,14/05/2021,True,Reported
2,5517,72896,27/06/2023,True,Contact Agent
3,1710,19727,15/10/2022,True,Shortlisted
4,896,44550,06/02/2020,False,Reported


In [6]:
# Read the bookings table and preview its first five rows
# (`head()` defaults to five rows).
bookings_path = "data/bookings.csv"
bookings_df = pd.read_csv(filepath_or_buffer=bookings_path)
bookings_df.head()

Unnamed: 0,booking_id,user_id,apartment_id,booking_date,checkin_date,checkout_date,total_price,currency,booking_status
0,1,2920,130940,11/11/2022,07/09/2024,30/06/2023,2923.67,EUR,confirmed
1,2,2788,191497,17/05/2024,07/07/2024,06/09/2021,4645.29,EUR,pending
2,3,9285,139117,26/11/2022,12/02/2025,07/04/2023,1683.99,INR,canceled
3,4,8161,118131,09/07/2020,25/10/2024,08/02/2021,1870.62,USD,confirmed
4,5,6733,43470,08/05/2024,31/01/2022,29/10/2024,3379.44,INR,pending


## Dataset Analysis

In [7]:
# Summarise the listings table: column dtypes, non-null counts and memory usage.
apartment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       200000 non-null  int64  
 1   title                    200000 non-null  object 
 2   source                   200000 non-null  object 
 3   price                    200000 non-null  float64
 4   currency                 200000 non-null  object 
 5   listing_created_on       200000 non-null  object 
 6   is_active                200000 non-null  bool   
 7   last_modified_timestamp  200000 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 10.9+ MB


In [8]:
# Print the dtype pandas inferred for every column of the listings table.
# Note the two date columns are read as plain `object` (string) columns.
print(apartment_df.dtypes)

id                           int64
title                       object
source                      object
price                      float64
currency                    object
listing_created_on          object
is_active                     bool
last_modified_timestamp     object
dtype: object


In [8]:
# Summarise the attributes table: column dtypes and non-null counts per column.
apartment_attr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             200000 non-null  int64  
 1   category       200000 non-null  object 
 2   body           200000 non-null  object 
 3   amenities      200000 non-null  object 
 4   bathrooms      200000 non-null  int64  
 5   bedrooms       200000 non-null  int64  
 6   fee            200000 non-null  float64
 7   has_photo      200000 non-null  bool   
 8   pets_allowed   70956 non-null   object 
 9   price_display  200000 non-null  object 
 10  price_type     200000 non-null  object 
 11  square_feet    200000 non-null  int64  
 12  address        200000 non-null  object 
 13  cityname       200000 non-null  object 
 14  state          200000 non-null  object 
 15  latitude       200000 non-null  float64
 16  longitude      200000 non-null  float64
dtypes: bool(1), float64(3), int64

In [9]:
# Summarise the user viewings table: column dtypes, non-null counts and memory usage.
user_viewings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   user_id         300000 non-null  int64 
 1   apartment_id    300000 non-null  int64 
 2   viewed_at       300000 non-null  object
 3   is_wishlisted   300000 non-null  bool  
 4   call_to_action  300000 non-null  object
dtypes: bool(1), int64(2), object(2)
memory usage: 9.4+ MB


In [10]:
# Summarise the bookings table: column dtypes, non-null counts and memory usage.
bookings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   booking_id      500000 non-null  int64  
 1   user_id         500000 non-null  int64  
 2   apartment_id    500000 non-null  int64  
 3   booking_date    500000 non-null  object 
 4   checkin_date    500000 non-null  object 
 5   checkout_date   500000 non-null  object 
 6   total_price     500000 non-null  float64
 7   currency        500000 non-null  object 
 8   booking_status  500000 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 34.3+ MB


In [11]:
# List the distinct currency codes that appear in the bookings table.
unique_currency = pd.unique(bookings_df["currency"])
print(unique_currency)

['EUR' 'INR' 'USD']


### Changing all currencies to USD

In [12]:
# Relabel every booking as USD, then confirm a single currency code remains.
# NOTE(review): only the `currency` label is overwritten here; `total_price`
# is left untouched, so amounts originally recorded in EUR or INR keep their
# original magnitudes. Confirm this is intended, or convert the amounts first.
bookings_df["currency"] = "USD"
unique_currency = pd.unique(bookings_df["currency"])
print(unique_currency)

['USD']


In [14]:
# Relabel every listing as USD, then confirm a single currency code remains.
# NOTE(review): only the `currency` label is overwritten here; `price` is left
# untouched, so listings originally priced in EUR or INR keep their original
# magnitudes. Confirm this is intended, or convert the amounts first.
apartment_df["currency"] = "USD"
unique_currency = pd.unique(apartment_df["currency"])
print(unique_currency)


['USD']


### Handling Missing Values

In [15]:
# Count missing values per object-dtype column of the bookings table and keep
# only the columns that have at least one missing value.
object_nulls = (
    bookings_df.select_dtypes(include="object")
    .isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
)

print("Null values in object columns:\n", object_nulls)

Null values in object columns:
 Series([], dtype: int64)


In [16]:
# Count missing values per object-dtype column of the user viewings table and
# keep only the columns that have at least one missing value.
object_nulls_user = (
    user_viewings_df.select_dtypes(include="object")
    .isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
)

print("Null values in object columns:\n", object_nulls_user)


Null values in object columns:
 Series([], dtype: int64)


In [18]:
# Count missing values per object-dtype column of the listings table and keep
# only the columns that have at least one missing value.
object_nulls_ap = (
    apartment_df.select_dtypes(include="object")
    .isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
)

print("Null values in object columns:\n", object_nulls_ap)

Null values in object columns:
 Series([], dtype: int64)


In [17]:
# Count missing values per object-dtype column of the attributes table and
# keep only the columns that have at least one missing value.
object_nulls_att = (
    apartment_attr_df.select_dtypes(include="object")
    .isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
)

print("Null values in object columns:\n", object_nulls_att)

Null values in object columns:
 pets_allowed    129044
dtype: int64


In [20]:
# Impute missing values with each column's most frequent value (its mode).
#
# Only columns that actually contain nulls are processed, so the mode is not
# computed for complete columns such as the free-text `body` and `address`
# fields. Re-running this cell is a no-op once nothing is missing.
#
# NOTE(review): `pets_allowed` is the only column reported with nulls
# (129,044 of 200,000 rows). Filling that many rows with the mode is a strong
# assumption — confirm it is acceptable for the downstream analysis.
columns_with_nulls = apartment_attr_df.columns[apartment_attr_df.isna().any()]
for column in columns_with_nulls:
    column_values = apartment_attr_df[column]
    modes = column_values.mode()
    if modes.empty:
        # Entirely-null column: there is no mode to fill with, leave it as is.
        continue
    most_frequent = modes.iloc[0]
    if pd.api.types.infer_dtype(column_values, skipna=True) == "boolean":
        # True/False/NaN columns are stored as `object`. Go through the
        # nullable "boolean" dtype so the result is a real bool column without
        # relying on fillna's deprecated silent downcasting of object columns.
        apartment_attr_df[column] = (
            column_values.astype("boolean").fillna(most_frequent).astype(bool)
        )
    else:
        apartment_attr_df[column] = column_values.fillna(most_frequent)

In [21]:
# Re-inspect the attributes table after the mode fill to confirm that no
# column reports missing values any more.
apartment_attr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             200000 non-null  int64  
 1   category       200000 non-null  object 
 2   body           200000 non-null  object 
 3   amenities      200000 non-null  object 
 4   bathrooms      200000 non-null  int64  
 5   bedrooms       200000 non-null  int64  
 6   fee            200000 non-null  float64
 7   has_photo      200000 non-null  bool   
 8   pets_allowed   200000 non-null  bool   
 9   price_display  200000 non-null  object 
 10  price_type     200000 non-null  object 
 11  square_feet    200000 non-null  int64  
 12  address        200000 non-null  object 
 13  cityname       200000 non-null  object 
 14  state          200000 non-null  object 
 15  latitude       200000 non-null  float64
 16  longitude      200000 non-null  float64
dtypes: bool(2), float64(3), int64

### Checking for duplicate rows in the dataframes

In [22]:
# Count listings rows that exactly repeat an earlier row across every column.
# NOTE(review): the comparison includes the `id` column; if ids are unique this
# count can never be non-zero — consider `duplicated(subset=...)` without `id`.
apartment_df_duplicates = apartment_df.duplicated(keep="first").sum()
print(f"\nTotal Duplicates: {apartment_df_duplicates}")


Total Duplicates: 0


In [23]:
# Count fully repeated rows (all columns equal to an earlier row) in the
# viewings, bookings and attributes tables, then report them in that order.
user_viewings_df_duplicates = user_viewings_df.duplicated(keep="first").sum()
booking_df_duplicates = bookings_df.duplicated(keep="first").sum()
apartment_attr_df_duplicates = apartment_attr_df.duplicated(keep="first").sum()

print(f"\nTotal Duplicates: {user_viewings_df_duplicates}")

print(f"\nTotal Duplicates: {booking_df_duplicates}")

print(f"\nTotal Duplicates: {apartment_attr_df_duplicates}")


Total Duplicates: 0

Total Duplicates: 0

Total Duplicates: 0
