In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("clening data.csv")
print(df.info())   # Check data types
print(df.describe())  # Summary stats for validation

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [15]:
# Drop rows with missing values
df = df.dropna()   # drops any row with at least one NaN
# Or impute with median
# Median of numeric columns
medians = df.median(numeric_only=True)

# Fill NaN with median values
df.fillna(medians, inplace=True)

In [16]:
df = df.drop_duplicates()

In [21]:
# Safe way for categorical column
df.loc[:, 'neighbourhood'] = df['neighbourhood'].str.lower()
# Create the scaler object
scaler = StandardScaler()

# Safe way for numeric scaling
df.loc[:, ['price','host_id']] = scaler.fit_transform(df[['price','host_id']])
cols_to_scale = ['price']   # only continuous variables
df.loc[:, cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

In [22]:
# IQR method
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

outliers = df[(df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))]
print("Outliers detected:", outliers)

# Remove outliers
df = df[~df.index.isin(outliers.index)]

Outliers detected:              id                                               name   host_id  \
20         7801                   Sweet and Spacious Brooklyn Loft -0.843682   
82        19169                   Entire 2 Bedroom - Large & Sunny -0.842988   
191       48719  Designer 1 BR Duplex w/ Terrace- Spectacular V... -0.840999   
223       57754                       Stylish Large Gramercy Loft! -0.840281   
254       62925                        Beautiful Landmarked Duplex -0.839864   
...         ...                                                ...       ...   
47792  35916694        New Sophisticated 4BR/2.5BA NYC Midtown Apt  0.696030   
47840  35943649     SoHa Bliss - Central Park North 2 Bed / 1 Bath  2.769880   
47994  36029624            Lovely Two Bedroom Next to Central Park  2.083722   
48057  36062290          True 1 bedroom, with add'l pull out couch -0.717168   
48096  36077745        AMAZING LARGE ONE BEDROOM IN WILLIAMSBURG!!  2.722599   

      host_name neig

In [23]:
df.to_csv("cleaned_dataset.csv", index=False)