In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Read the raw listings file from the working directory into the notebook-wide frame.
df = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')

In [3]:
# Print a heading, then the frame's schema summary. DataFrame.info() writes its
# report (index range, per-column non-null counts and dtypes) straight to stdout.
print("Initial Data Info:")
df.info()

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review  

In [4]:
# Heading followed by the summary statistics of the numeric columns, each
# written on its own line by a single print call.
print("\nDataset Statistics:", df.describe(), sep="\n")


Dataset Statistics:
                 id       host_id      latitude     longitude         price  \
count  4.889500e+04  4.889500e+04  48895.000000  48895.000000  48895.000000   
mean   1.901714e+07  6.762001e+07     40.728949    -73.952170    152.720687   
std    1.098311e+07  7.861097e+07      0.054530      0.046157    240.154170   
min    2.539000e+03  2.438000e+03     40.499790    -74.244420      0.000000   
25%    9.471945e+06  7.822033e+06     40.690100    -73.983070     69.000000   
50%    1.967728e+07  3.079382e+07     40.723070    -73.955680    106.000000   
75%    2.915218e+07  1.074344e+08     40.763115    -73.936275    175.000000   
max    3.648724e+07  2.743213e+08     40.913060    -73.712990  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count    48895.000000       48895.000000       38843.000000   
mean         7.029962          23.274466           1.373221   
std         20.510550          44.550582           1.680442   
min          1.

In [6]:
# Force price to a numeric dtype; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df['price'] = df['price'].pipe(pd.to_numeric, errors='coerce')


In [7]:
# Heading followed by the per-column count of missing values.
print("\nMissing Data Summary:", df.isna().sum(), sep="\n")



Missing Data Summary:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64


In [10]:

# Replace missing prices with the mean of the observed prices; only the
# 'price' column is targeted by the fill mapping.
df = df.fillna(value={'price': df['price'].mean()})


In [12]:
# Keep only rows that have at least (number of columns - 1) non-null values,
# i.e. rows with at most one missing field. The result is rebound to `df`
# instead of mutating with inplace=True, matching the other filtering cells.
# NOTE(review): the missing-data summary reports 10052 missing values in both
# last_review and reviews_per_month; if those are the same rows (presumably
# listings that have no reviews), every one of them falls below the threshold
# and is dropped here — confirm that this is intended.
df = df.dropna(thresh=len(df.columns) - 1)

In [13]:
# Heading followed by the number of rows that exactly repeat an earlier row.
print("\nNumber of Duplicate Rows:", df.duplicated().sum(), sep="\n")



Number of Duplicate Rows:
0


In [14]:
# Normalise text columns: trim surrounding whitespace, then lower-case the
# room type and title-case the host name.
df = df.assign(
    room_type=df['room_type'].str.strip().str.lower(),
    host_name=df['host_name'].str.strip().str.title(),
)


In [15]:
# Keep rows whose coordinates are inside the valid ranges:
# |latitude| <= 90 and |longitude| <= 180 (bounds inclusive).
df = df.loc[df['latitude'].abs().le(90) & df['longitude'].abs().le(180)]

In [17]:
# Outlier detection: standardise the prices and retain only the rows whose
# absolute z-score is strictly below 3.
z_scores = stats.zscore(df['price'])
df = df.loc[np.abs(z_scores) < 3]

In [18]:
# First and third quartiles of price, and the interquartile range between them.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1


In [19]:
# Discard rows whose price lies more than 1.5 * IQR below Q1 or above Q3.
df = df.loc[~(df['price'].lt(Q1 - 1.5 * IQR) | df['price'].gt(Q3 + 1.5 * IQR))]

In [20]:
# Final inspection of the cleaned dataset: print a heading, then the schema
# summary (row count, per-column non-null counts and dtypes) of the frame as it
# stands after the filtering cells. DataFrame.info() writes directly to stdout.
print("\nCleaned Data Info:")
df.info()


Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 36707 entries, 0 to 48852
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              36707 non-null  int64  
 1   name                            36701 non-null  object 
 2   host_id                         36707 non-null  int64  
 3   host_name                       36691 non-null  object 
 4   neighbourhood_group             36707 non-null  object 
 5   neighbourhood                   36707 non-null  object 
 6   latitude                        36707 non-null  float64
 7   longitude                       36707 non-null  float64
 8   room_type                       36707 non-null  object 
 9   price                           36707 non-null  int64  
 10  minimum_nights                  36707 non-null  int64  
 11  number_of_reviews               36707 non-null  int64  
 12  last_review      

In [23]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='cleaned_dataset.csv', index=False)

In [24]:
# Report completion. The filename in the message is a hard-coded literal, so it
# must be kept in step by hand with the path passed to df.to_csv.
print("Data cleaning process complete! Cleaned data saved to 'cleaned_dataset.csv'.")

Data cleaning process complete! Cleaned data saved to 'cleaned_dataset.csv'.
