# 🧹 Project 4: Data Cleaning — Airbnb NYC 2019
**Internship:** Oasis Infobyte — Data Analytics  
**Level:** 1  

**Goal:** Clean the Airbnb NYC dataset by handling missing values, duplicates, outliers, and incorrect data types so it is ready for analysis.

## Step 1: Import Libraries

In [None]:
# pandas holds the tabular data; matplotlib and seaborn draw the charts below.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Visible confirmation that the import cell ran to completion.
print('Libraries imported!')

## Step 2: Upload and Load the Dataset

In [None]:
# Colab helper module (presumably unavailable outside Google Colab).
from google.colab import files
# Ask for the dataset file to be uploaded into the runtime. The result is kept
# in `uploaded`, which the cells shown here never read again.
uploaded = files.upload()

In [None]:
# Read the listings CSV into the working DataFrame used by every later cell.
df = pd.read_csv('AB_NYC_2019.csv')

# Report the (rows, columns) shape, then preview the first rows as cell output.
print('Shape of dataset:', df.shape)
df.head()

## Step 3: First Look at the Data

In [None]:
# List every column together with the dtype pandas assigned to it on load
print(df.dtypes)

In [None]:
# Summary statistics, shown as the cell output
df.describe()

In [None]:
# Count the null entries in every column
missing_per_column = df.isna().sum()
print('Missing values per column:')
print(missing_per_column)

In [None]:
# Flag rows that exactly repeat an earlier row, then count the flags
duplicate_flags = df.duplicated()
print('Duplicate rows:', duplicate_flags.sum())

In [None]:
# Draw the boolean null mask as a heatmap (one stripe per column); per the
# title, missing cells show up in yellow.
null_mask = df.isna()

plt.figure(figsize=(12, 5))
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
plt.title('Missing Values Heatmap (Yellow = Missing)')
plt.tight_layout()

# Save before show() so the rendered figure is what lands in the PNG.
plt.savefig('missing_values.png', dpi=150)
plt.show()

## Step 4: Handle Missing Values

In [None]:
# Placeholder used for each column that contains missing values:
#   name, host_name   -> 'Unknown'
#   last_review       -> 'No Review'
#   reviews_per_month -> 0 (no reviews means 0 per month)
fill_values = {
    'name': 'Unknown',
    'host_name': 'Unknown',
    'last_review': 'No Review',
    'reviews_per_month': 0,
}
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)

# Re-count nulls per column to confirm the fills took effect
print('Missing values after cleaning:')
print(df.isnull().sum())

## Step 5: Fix Data Types

In [None]:
# Parse last_review into datetimes. errors='coerce' turns anything that cannot
# be parsed as a date (including the 'No Review' placeholder from Step 4) into NaT.
parsed_last_review = pd.to_datetime(df['last_review'], errors='coerce')
df['last_review'] = parsed_last_review

# Show the dtypes again so the new last_review type is visible
print(df.dtypes)

## Step 6: Remove Duplicates

In [None]:
# Row count going in (no rows have been dropped before this step)
before = df.shape[0]

# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()
after = df.shape[0]

print('Rows before:', before)
print('Rows after :', after)
print('Removed    :', before - after, 'duplicate rows')

## Step 7: Detect and Handle Outliers

In [None]:
# Histogram of nightly prices as loaded, before any outlier filtering
prices = df['price']

plt.figure(figsize=(10, 4))
plt.hist(prices, bins=100, color='steelblue', edgecolor='white')
plt.title('Price Distribution (Before Cleaning)')
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('price_before.png', dpi=150)
plt.show()

# Two quick numbers that motivate the filters in the next cell
print('Max price:', prices.max())
print('Listings with price = 0:', int((prices == 0).sum()))

In [None]:
# Keep listings priced above $0 (a zero price is not valid) and at most
# $1000 per night (anything higher is treated as an extreme outlier).
has_valid_price = df['price'] > 0
within_price_cap = df['price'] <= 1000
df = df[has_valid_price & within_price_cap]

print('Rows after removing price outliers:', len(df))

In [None]:
# Drop listings whose minimum stay exceeds 365 nights (treated as unrealistic)
stay_within_a_year = df['minimum_nights'] <= 365
df = df[stay_within_a_year]

print('Rows after removing minimum_nights outliers:', len(df))

In [None]:
# Same histogram as before, now on the filtered prices, for a before/after comparison
cleaned_prices = df['price']

plt.figure(figsize=(10, 4))
plt.hist(cleaned_prices, bins=100, color='#2ecc71', edgecolor='white')
plt.title('Price Distribution (After Cleaning)')
plt.xlabel('Price ($)')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('price_after.png', dpi=150)
plt.show()

## Step 8: Standardize Text Columns

In [None]:
# Make sure text columns have consistent formatting (strip + title case).
def _title_case(text):
    """Strip whitespace and title-case a string Series, apostrophe-safe.

    Plain ``str.title()`` treats an apostrophe as a word boundary, so a name
    like "Hell's Kitchen" would come out as "Hell'S Kitchen". Here each run of
    letters (with an optional apostrophe suffix) is capitalised as one word.
    """
    return text.str.strip().str.replace(
        # [^\W\d_] matches any single letter, including non-ASCII ones.
        r"[^\W\d_]+('[^\W\d_]+)?",
        lambda word: word.group(0).capitalize(),
        regex=True,
    )


for text_column in ('neighbourhood_group', 'neighbourhood', 'room_type'):
    df[text_column] = _title_case(df[text_column])

# Check unique values
print('Neighbourhood Groups:', df['neighbourhood_group'].unique())
print('Room Types:', df['room_type'].unique())

## Step 9: Quick EDA on Clean Data

In [None]:
# Number of listings in each neighbourhood group, largest first
# (value_counts sorts by count in descending order)
area_counts = df['neighbourhood_group'].value_counts()
area_names = area_counts.index
listing_totals = area_counts.values

plt.figure(figsize=(9, 5))
plt.bar(area_names, listing_totals, color='steelblue')
plt.title('Number of Listings by Area')
plt.xlabel('Neighbourhood Group')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('listings_by_area.png', dpi=150)
plt.show()

In [None]:
# Mean nightly price per neighbourhood group, most expensive group first
price_by_group = df.groupby('neighbourhood_group')['price']
avg_price = price_by_group.mean().sort_values(ascending=False)

plt.figure(figsize=(9, 5))
plt.bar(avg_price.index, avg_price.values, color='#e67e22')
plt.title('Average Price by Neighbourhood Group')
plt.xlabel('Area')
plt.ylabel('Average Price ($)')
plt.tight_layout()
plt.savefig('avg_price_by_area.png', dpi=150)
plt.show()

In [None]:
# Share of listings per room type, drawn as a pie chart
room_counts = df['room_type'].value_counts()
slice_colors = ['#3498db', '#e74c3c', '#2ecc71']

plt.figure(figsize=(7, 5))
plt.pie(
    room_counts.values,
    labels=room_counts.index,
    autopct='%1.1f%%',  # label each slice with its percentage, one decimal
    colors=slice_colors,
    startangle=90,
)
plt.title('Room Type Distribution')
plt.tight_layout()
plt.savefig('room_types.png', dpi=150)
plt.show()

In [None]:
# Boxplot — price by room type
# Create the axes first and hand them to pandas. When `by=` is given without
# `ax`, DataFrame.boxplot opens its own figure, which would leave a separate
# 9x5 figure empty and ignore the requested size.
fig, ax = plt.subplots(figsize=(9, 5))
df.boxplot(column='price', by='room_type', ax=ax)

ax.set_title('Price by Room Type')
# pandas adds an automatic "Boxplot grouped by ..." suptitle; blank it out.
fig.suptitle('')
ax.set_xlabel('Room Type')
ax.set_ylabel('Price ($)')
fig.tight_layout()
fig.savefig('price_by_room.png', dpi=150)
plt.show()

## Step 10: Save the Cleaned Dataset

In [None]:
# Write the cleaned table to disk without the DataFrame index column
cleaned_csv = 'AB_NYC_2019_cleaned.csv'
df.to_csv(cleaned_csv, index=False)
print(f'Cleaned dataset saved as {cleaned_csv}')

# Hand the file to the browser as a download (Colab helper)
from google.colab import files
files.download(cleaned_csv)

## Step 11: Cleaning Summary

In [None]:
# Final recap of the cleaning run.
# `before` is the row count captured in Step 6, before any rows were dropped,
# so it is the original size of the loaded dataset. Using it instead of a
# hard-coded figure keeps the summary truthful if the input file changes.
# NOTE: re-running Step 6 after later steps overwrites `before`; run the
# notebook top to bottom for a correct summary.
print('========== DATA CLEANING SUMMARY ==========')
print('Original rows          :', f'{before:,}')
print('Rows after cleaning    :', len(df))
print()
print('Issues Fixed:')
for fixed_issue in (
    '  - Filled missing name and host_name with Unknown',
    '  - Filled missing last_review with No Review',
    '  - Filled missing reviews_per_month with 0',
    '  - Converted last_review to proper date format',
    '  - Removed duplicate rows',
    '  - Removed listings with price = 0',
    '  - Removed price outliers above $1000',
    '  - Removed minimum_nights above 365',
    '  - Standardized text columns (strip + title case)',
):
    print(fixed_issue)
print()
print('Dataset is now clean and ready for analysis!')

---
**✅ Project 4 Complete!**  
*Submitted as part of Oasis Infobyte Data Analytics Internship*