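**Objective:** Analyze and visualize the Amsterdam Airbnb listings dataset with pandas: handle missing values, compute descriptive statistics, and create informative visualizations. (Note: despite the dataset's name, the coordinates and listing names in the rows loaded below are for Albany, New York.)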

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset directory tree and print the full path of every file in it.
# The loop variables are ordinary notebook globals: after the loop finishes,
# `filename` still holds the last file name visited (if any file was found).
for dirname, _, filenames in os.walk('/kaggle/input/amsterdam-airbnb-listings'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amsterdam-airbnb-listings/listings.csv


In [2]:
# Load the listings CSV into the DataFrame `df`.
# The file is named explicitly instead of reusing a loop variable left over
# from an earlier cell, so this cell loads the same file regardless of what
# else is in the directory or which cells ran before it.
df = pd.read_csv(os.path.join('/kaggle/input/amsterdam-airbnb-listings', 'listings.csv'))
# Preview the first five rows
df.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,1489424,Home in Albany · ★4.74 · 1 bedroom · 1 bed · 1...,5294164,Efrat,,FOURTEENTH WARD,42.66719,-73.8158,Private room,50.0,1,239,2023-12-08,1.9,1,201,47,
1,2992450,Rental unit in Albany · ★3.56 · 2 bedrooms · 2...,4621559,Kenneth,,THIRD WARD,42.65789,-73.7537,Entire home/apt,70.0,28,9,2022-08-17,0.08,1,365,0,
2,3820211,Rental unit in Albany · ★4.74 · 1 bedroom · 1 ...,19648678,Terra,,SIXTH WARD,42.65222,-73.76724,Entire home/apt,125.0,2,291,2023-11-19,2.54,10,361,8,
3,5651579,Rental unit in Albany · ★4.51 · Studio · 1 bed...,29288920,Gregg,,SECOND WARD,42.64615,-73.75966,Entire home/apt,68.0,2,332,2023-11-12,3.15,3,138,28,
4,6623339,Rental unit in Albany · ★4.73 · 1 bedroom · 1 ...,19648678,Terra,,SIXTH WARD,42.65222,-73.76724,Entire home/apt,140.0,1,305,2023-12-10,2.92,10,0,4,


In [3]:
# Show the column labels of the dataset (displayed as the cell's output)
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'number_of_reviews_ltm', 'license'],
      dtype='object')

In [4]:
# Summarise the DataFrame: row count, column names, non-null counts and dtypes.
# info() prints the report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              410 non-null    int64  
 1   name                            410 non-null    object 
 2   host_id                         410 non-null    int64  
 3   host_name                       410 non-null    object 
 4   neighbourhood_group             0 non-null      float64
 5   neighbourhood                   410 non-null    object 
 6   latitude                        410 non-null    float64
 7   longitude                       410 non-null    float64
 8   room_type                       410 non-null    object 
 9   price                           404 non-null    float64
 10  minimum_nights                  410 non-null    int64  
 11  number_of_reviews               410 non-null    int64  
 12  last_review                     361 

In [5]:
# Descriptive statistics for the numeric columns, transposed so that each
# column of df becomes one row. Left as the cell's last expression so the
# notebook renders the full table instead of a wrapped plain-text dump.
df.describe().T

                                count          mean           std  \
id                              410.0  4.865518e+17  4.224464e+17   
host_id                         410.0  2.211516e+08  1.813861e+08   
neighbourhood_group               0.0           NaN           NaN   
latitude                        410.0  4.265764e+01  9.688474e-03   
longitude                       410.0 -7.377665e+01  1.935449e-02   
price                           404.0  1.229282e+02  1.041410e+02   
minimum_nights                  410.0  5.134146e+00  2.147095e+01   
number_of_reviews               410.0  5.336341e+01  9.638242e+01   
reviews_per_month               361.0  2.051690e+00  2.084601e+00   
calculated_host_listings_count  410.0  5.009756e+00  4.802699e+00   
availability_365                410.0  2.131585e+02  1.361395e+02   
number_of_reviews_ltm           410.0  1.600976e+01  2.218967e+01   
license                           0.0           NaN           NaN   

                                 

In [6]:
# Count the missing entries in every column of df
missing_per_column = df.isna().sum()
print(missing_per_column)

id                                  0
name                                0
host_id                             0
host_name                           0
neighbourhood_group               410
neighbourhood                       0
latitude                            0
longitude                           0
room_type                           0
price                               6
minimum_nights                      0
number_of_reviews                   0
last_review                        49
reviews_per_month                  49
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
license                           410
dtype: int64


In [7]:
# Select the int64/float64 columns and count the missing entries in each one
numerical_cols = df.select_dtypes(include=['int64', 'float64'])
missing_numerical = numerical_cols.isna().sum(axis=0)
print(missing_numerical)

id                                  0
host_id                             0
neighbourhood_group               410
latitude                            0
longitude                           0
price                               6
minimum_nights                      0
number_of_reviews                   0
reviews_per_month                  49
calculated_host_listings_count      0
availability_365                    0
number_of_reviews_ltm               0
license                           410
dtype: int64


In [8]:
# Count missing entries per object-dtype column, then show only the columns
# that actually have some
category_missing = df.select_dtypes(include=['object']).isna().sum()
has_missing = category_missing.gt(0)
print(category_missing[has_missing])

last_review    49
dtype: int64


In [9]:
 warnings.simplefilter("ignore")
# Impute missing values in numerical columns using median-->consider other methods like mean, mode
for col in missing_numerical.index:
    df[col] = df[col].fillna(df[col].median())

In [10]:
 warnings.simplefilter("ignore")
# Fill missing values in categorical columns with mode (consider other methods like dropping, encoding)
category_missing.fillna(category_missing.mode(0), inplace=True)

In [11]:
# Re-count missing entries per object-dtype column after imputation and show
# only the columns that still have some
cat_missing = df.select_dtypes(include=['object']).isna().sum()
still_missing = cat_missing.gt(0)
print(cat_missing[still_missing])

last_review    49
dtype: int64


In [12]:
# Final check: prints True while any value in df is still missing,
# and False once every missing value has been handled
print(df.isna().to_numpy().any())

True
