In [2]:
# Import libraries
import os

import numpy as np
import pandas as pd

# Load dataset
# The CSV location can be overridden through the AB_NYC_2019_CSV environment
# variable so the notebook is not tied to one machine's directory layout.
# The original local path is kept as the default, so existing runs are unchanged.
DATA_PATH = os.environ.get(
    "AB_NYC_2019_CSV",
    "C:/Users/dhiya/Downloads/archive (10)/AB_NYC_2019.csv",
)
df = pd.read_csv(DATA_PATH)

In [4]:
# Inspect the data
# ----------------------------
# Quick overview of the loaded frame: dimensions, column dtypes, per-column
# missing-value counts, the info() summary, and the first rows.
print("Shape:", df.shape)
print("\nData types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())
print("\nBasic info:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\nSample rows:\n")
print(df.head())

Shape: (48895, 16)

Data types:
 id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

Missing values:
 id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude               

In [6]:
# Data Integrity & Basic Checks
# ----------------------------
# Flag every row whose 'id' value already appeared in an earlier row, then
# count the flagged rows.
repeated_id_mask = df['id'].duplicated()
duplicate_ids = repeated_id_mask.sum()
print(f"\nDuplicate IDs found: {duplicate_ids}")


Duplicate IDs found: 0


In [8]:
# Ensure latitude and longitude are valid ranges
# between() is inclusive on both ends and evaluates to False for NaN, so rows
# with a missing coordinate are dropped together with out-of-range ones.
latitude_ok = df['latitude'].between(-90, 90)
longitude_ok = df['longitude'].between(-180, 180)
# Take an explicit copy: later cells assign columns on `df`, and doing that on
# a boolean-mask selection can raise SettingWithCopyWarning.
df = df[latitude_ok & longitude_ok].copy()

In [10]:
# Replace missing values column by column with a fixed placeholder:
#   host_name, name   -> "Unknown"
#   reviews_per_month -> 0 (assuming no reviews)
#   last_review       -> 'No Review'
missing_value_fill = {
    'host_name': "Unknown",
    'name': "Unknown",
    'reviews_per_month': 0,
    'last_review': 'No Review',
}
for column_name, placeholder in missing_value_fill.items():
    df[column_name] = df[column_name].fillna(placeholder)

In [14]:
# Remaining missing values per column (isna() is the same check as isnull())
df.isna().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [16]:
# Standardization
# ----------------------------
# Lowercase the host and location/room-type columns for consistency.
# Note: 'name' and 'last_review' are left as they are.
for text_column in ('host_name', 'neighbourhood_group', 'neighbourhood', 'room_type'):
    df[text_column] = df[text_column].str.lower()

In [18]:
# Trim leading/trailing whitespace from host_name and neighbourhood
for padded_column in ('host_name', 'neighbourhood'):
    df[padded_column] = df[padded_column].str.strip()

In [22]:
# Outlier Detection & Handling
# ----------------------------
# IQR rule on price: values more than 1.5 * IQR below the first quartile or
# above the third quartile are treated as outliers.
prices = df['price']
Q1, Q3 = prices.quantile(0.25), prices.quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

print(f"\nPrice range before removing outliers: {prices.min()} - {prices.max()}")


Price range before removing outliers: 0 - 10000


In [24]:
# Remove extreme outliers
# Keep only rows whose price lies inside [lower_limit, upper_limit], both ends
# included. The recorded output shows that 0-priced rows survive this filter.
within_limits = df['price'].between(lower_limit, upper_limit)
df = df[within_limits]

print(f"Price range after removing outliers: {df['price'].min()} - {df['price'].max()}")

Price range after removing outliers: 0 - 334


In [26]:
#  Data Consistency & Final Check
# ----------------------------
# Summarise the cleaned frame and confirm that no missing values remain.
print("\nAfter cleaning:")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the trailing "None" that print(df.info()) emitted.
df.info()
print("\nNull values:\n", df.isnull().sum())


After cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 45923 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              45923 non-null  int64  
 1   name                            45923 non-null  object 
 2   host_id                         45923 non-null  int64  
 3   host_name                       45923 non-null  object 
 4   neighbourhood_group             45923 non-null  object 
 5   neighbourhood                   45923 non-null  object 
 6   latitude                        45923 non-null  float64
 7   longitude                       45923 non-null  float64
 8   room_type                       45923 non-null  object 
 9   price                           45923 non-null  int64  
 10  minimum_nights                  45923 non-null  int64  
 11  number_of_reviews               45923 non-null  int64  
 12  last_review         

In [74]:
# Save the cleaned dataset
# ----------------------------
# Written relative to the current working directory, without the index column.
cleaned_csv_name = "AB_NYC_2019_cleaned.csv"
df.to_csv(cleaned_csv_name, index=False)
print(f"\nCleaned dataset saved as '{cleaned_csv_name}'")



Cleaned dataset saved as 'AB_NYC_2019_cleaned.csv'
