# Data cleaning

In [1]:
import kagglehub

# Fetch the most recent copy of the NYC Airbnb dataset through kagglehub and
# keep the returned location of the dataset files in `path`.
path = kagglehub.dataset_download(
    "dgomonov/new-york-city-airbnb-open-data"
)

# Show where the dataset files ended up.
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'new-york-city-airbnb-open-data' dataset.
Path to dataset files: /kaggle/input/new-york-city-airbnb-open-data


In [6]:
import pandas as pd
import kagglehub

# Load the NYC Airbnb 2019 listings, inspect them, apply basic cleaning rules,
# and write the cleaned table to a CSV file in the working directory.

# Download the dataset
path = kagglehub.dataset_download("dgomonov/new-york-city-airbnb-open-data")
print("Dataset yuklab olindi:", path)

# Location of the main CSV file
file_path = f"{path}/AB_NYC_2019.csv"

# Read the CSV file
df = pd.read_csv(file_path)

# ------------------------------
# 1. First look at the data
# ------------------------------
print(df.head())            # First 5 rows
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()                   # General information about the columns
print(df.isnull().sum())    # Number of missing values per column

# ------------------------------
# 2. Remove duplicated rows
# ------------------------------
# Rows sharing the same `id` are treated as duplicates; the first one is kept.
df = df.drop_duplicates(subset='id', keep='first')

# ------------------------------
# 3. Handle missing (NaN) values
# ------------------------------

# Fill missing values in reviews_per_month with 0
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# host_name is not needed for the analysis - drop it.
# errors='ignore' means no error is raised if the column is already absent.
df = df.drop(columns=['host_name'], errors='ignore')

# Convert last_review to datetime; unparseable values become NaT
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')

# ------------------------------
# 4. Fix data types
# ------------------------------
# Non-numeric prices become NaN (and are then removed by the price filter below,
# because NaN > 0 evaluates to False).
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# ------------------------------
# 5. Remove invalid or impossible values
# ------------------------------

# Drop rows whose price is less than or equal to 0
df = df[df['price'] > 0]

# Drop rows with a very large minimum_nights (treated as outliers);
# only values strictly below 365 are kept.
df = df[df['minimum_nights'] < 365]

# availability_365 must lie in the 0-365 range
df = df[df['availability_365'].between(0, 365)]

# ------------------------------
# 6. Save the cleaned dataset
# ------------------------------
cleaned_path = "AB_NYC_2019_cleaned.csv"  # Saved to a location the user can access
df.to_csv(cleaned_path, index=False)

print("Tozalash jarayoni yakunlandi!")
print("Toza dataset saqlandi:", cleaned_path)


Using Colab cache for faster access to the 'new-york-city-airbnb-open-data' dataset.
Dataset yuklab olindi: /kaggle/input/new-york-city-airbnb-open-data
     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -