## Data Cleaning with Python

by: Serhii Viskushenko

## Load libraries

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt

## Load data

In [8]:
# data from github repo: https://github.com/dyvenia/data_cleaning
url = 'https://raw.githubusercontent.com/dyvenia/lms/data_cleaning/data_cleaning/dataset.csv?token=GHSAT0AAAAAABZPXVXN6MSAUMN73MYDSIMIY2X6XLQ'
df = pd.read_csv(url)

## Explore data

In [6]:
#This function returns the first n rows (default 5) for the object based on position. It is useful for quickly testing if your object has the right type of data in it.
df.head()

Unnamed: 0,id,NamE,host ID,host_name,neighbourhood_group,neighbourhood,latitude,longitude,Room Type,price,minimum_nights,number_of_reviews,LAST review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,,2787,,Brooklyn,Kensington,40.64749,-73.97237,Private room,$149,1,9,43392,,6,365
1,2595,Skylit Midtown Castle,2845,,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,$225,1,45,43606,0.38,2,355
2,3831,,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,$89,1,270,43651,,1,194
3,5022,,7192,Laura,Manhatan,East Harlem,40.79851,-73.94399,Entire home/apt,$80,10,9,43423,,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,$200,3,74,43638,0.59,1,129


In [6]:
#This function returns last n rows (default 5) from the object based on position. It is useful for quickly verifying data, for example, after sorting or appending rows.
df.tail()

Unnamed: 0,id,NamE,host ID,host_name,neighbourhood_group,neighbourhood,latitude,longitude,Room Type,price,minimum_nights,number_of_reviews,LAST review,reviews_per_month,calculated_host_listings_count,availability_365
38842,36425863,Lovely Privet Bedroom with Privet Restroom,83554966,Rusaa,Manhattan,Upper East Side,40.78099,-73.95366,Private room,$129,1,1,43653,1.0,1,147
38843,36427429,No.2 with queen size bed,257683179,H Ai,Queens,Flushing,40.75104,-73.81459,Private room,$45,1,1,43653,1.0,6,339
38844,36438336,Seas The Moment,211644523,Ben,Staten Island,Great Kills,40.54179,-74.14275,Private room,$235,1,1,43653,1.0,1,87
38845,36442252,1B-1B apartment near by Metro,273841667,Blaine,Bronx,Mott Haven,40.80787,-73.924,Entire home/apt,$100,1,2,43653,2.0,1,40
38846,36455809,"Cozy Private Room in Bushwick, Brooklyn",74162901,Christine,Brooklyn,Bushwick,40.69805,-73.92801,Private room,$30,1,1,43654,1.0,1,1


In [7]:
#Return a random sample of items from an axis of object. (default 1)
df.sample(5)

Unnamed: 0,id,NamE,host ID,host_name,neighbourhood_group,neighbourhood,latitude,longitude,Room Type,price,minimum_nights,number_of_reviews,LAST review,reviews_per_month,calculated_host_listings_count,availability_365
7518,6571441,Brooklyn Brownstone Master Bedroom,31458100,Drew,Brooklyn,Bedford-Stuyvesant,40.68527,-73.95307,Private room,$53,2,11,42471,0.23,1,0
32030,30084221,"Live in the heart of East Village, Manhatan",44642799,Judith,Manhattan,East Village,40.72818,-73.98075,Entire home/apt,$225,3,2,43468,0.31,1,0
38670,35916489,Sun-drenched LGBT Cozy Private QueenRoom @Bush...,135563904,Michael,Queens,Ridgewood,40.7055,-73.9096,Private room,$180,1,2,43647,2.0,3,78
34207,31898844,Eddies place #2,98214533,Edwin’S Place,Staten Island,Huguenot,40.53982,-74.1728,Entire home/apt,$75,2,20,43653,4.38,2,229
28593,27018851,Bright and Spacious Apartment,32866463,Bassam,Brooklyn,Bay Ridge,40.63328,-74.01978,Entire home/apt,$145,1,41,43639,3.63,2,139


In [8]:
#Return a tuple representing the dimensionality of the DataFrame.
df.shape

(38847, 16)

In [9]:
#This method prints information about a DataFrame including the index dtype and columns, non-null values and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38847 entries, 0 to 38846
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              38847 non-null  int64  
 1   NamE                            37133 non-null  object 
 2   host ID                         38847 non-null  int64  
 3   host_name                       38522 non-null  object 
 4   neighbourhood_group             38847 non-null  object 
 5   neighbourhood                   38847 non-null  object 
 6   latitude                        38847 non-null  float64
 7   longitude                       38847 non-null  float64
 8   Room Type                       38847 non-null  object 
 9   price                           38847 non-null  object 
 10  minimum_nights                  38847 non-null  int64  
 11  number_of_reviews               38847 non-null  int64  
 12  LAST review                     

In [10]:
#Return the dtypes in the DataFrame.
df.dtypes

id                                  int64
NamE                               object
host ID                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
Room Type                          object
price                              object
minimum_nights                      int64
number_of_reviews                   int64
LAST review                         int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

## Renaming columns

In [11]:
# The column labels of the DataFrame.
df.columns

Index(['id', 'NamE', 'host ID', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'Room Type', 'price',
       'minimum_nights', 'number_of_reviews', 'LAST review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [12]:
# Turn into list
df.columns.to_list()

['id',
 'NamE',
 'host ID',
 'host_name',
 'neighbourhood_group',
 'neighbourhood',
 'latitude',
 'longitude',
 'Room Type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'LAST review',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

### String methods

In [13]:
# replace whitespace with underscore '_'
df.columns = df.columns.str.replace(' ', "_")

# lower case column names
df.columns = df.columns.str.lower()

# See the result
df.columns.to_list()

['id',
 'name',
 'host_id',
 'host_name',
 'neighbourhood_group',
 'neighbourhood',
 'latitude',
 'longitude',
 'room_type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'last_review',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

### Renaming columns

In [14]:
#The rename() method allows you to relabel an axis based on some mapping (a dict or Series) or an arbitrary function.
#Requires a dict with 'old_name':'new_name'
new_names = {'last_review':'last_review_date',
             'calculated_host_listings_count':'listings_count'}

df.rename(columns = new_names).columns 

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review_date',
       'reviews_per_month', 'listings_count', 'availability_365'],
      dtype='object')

In [15]:
# Without inplace=True, not updating dataframe. Only returning output.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [16]:
#The rename() method also provides an inplace named parameter that is by default False and copies the underlying data. Pass inplace=True to rename the data in place.

df.rename(columns = new_names, inplace = True)
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review_date',
       'reviews_per_month', 'listings_count', 'availability_365'],
      dtype='object')

## Remove junk from values

We will remove dollar sign from the price field

In [17]:
# looking at the data
df['price'].head()

0    $149
1    $225
2     $89
3     $80
4    $200
Name: price, dtype: object

In [18]:
# Removing $ sign
df['price'] = df['price'].str.replace("$", "", regex=True)
df['price'].head()

0    149
1    225
2     89
3     80
4    200
Name: price, dtype: object

In [19]:
# check price column data type
df['price'].dtype

dtype('O')

## Converting data types
[Article about data types](https://pbpython.com/pandas_dtypes.html)

| Pandas dtype    | Python type  | NumPy type                                                      | Usage                                        |
| --------------- | ------------ | --------------------------------------------------------------- | -------------------------------------------- |
| object          | str or mixed | string\_, unicode\_, mixed types                                | Text or mixed numeric and non-numeric values |
| int64           | int          | int\_, int8, int16, int32, int64, uint8, uint16, uint32, uint64 | Integer numbers                              |
| float64         | float        | float\_, float16, float32, float64                              | Floating point numbers                       |
| bool            | bool         | bool\_                                                          | True/False values                            |
| datetime64      | NA           | datetime64\[ns\]                                                | Date and time values                         |
| timedelta\[ns\] | NA           | NA                                                              | Differences between two datetimes            |
| category        | NA           | NA                                                              | Finite list of text values                   |



### Converting to numeric

In [20]:
df['price'] = pd.to_numeric(df['price'])
df['price'].dtype

dtype('int64')

In [21]:
df.sample(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review_date,reviews_per_month,listings_count,availability_365
12089,11343196,Charming Studio in Great Location,5368671,Kevin,Brooklyn,Cobble Hill,40.68963,-73.99259,Entire home/apt,94,1,39,43618,0.98,1,4
20101,19424191,Lovely studio in Midtown Manhattan,120762452,Stanley,Manhattan,Murray Hill,40.74857,-73.97722,Entire home/apt,150,30,5,43519,0.26,50,344
20705,19914209,Luxury Private Bedroom (near JFK),141052003,Nicole,Queens,Jamaica,40.68558,-73.78018,Private room,65,1,8,43369,0.42,4,155
26462,24695406,Spacious Private Basement Apartment,98157128,Michael,Bronx,Wakefield,40.88764,-73.8558,Entire home/apt,100,2,38,43635,2.66,1,107
7237,6322937,,80447,Francesca,Queens,Long Island City,40.7645,-73.93181,Entire home/apt,115,5,6,42659,0.13,1,0


In [22]:
# view dtype of all columns
df.dtypes

id                       int64
name                    object
host_id                  int64
host_name               object
neighbourhood_group     object
neighbourhood           object
latitude               float64
longitude              float64
room_type               object
price                    int64
minimum_nights           int64
number_of_reviews        int64
last_review_date         int64
reviews_per_month      float64
listings_count           int64
availability_365         int64
dtype: object

### Convert to category

- `neighbourhood_group`
- `neighbourhood`
- `room_type`

Guidelines for conversion
- to reduce memory and increase performance with operations related to categorical data
- make sure data is clean before converting it

[More on this](https://pbpython.com/pandas_dtypes_cat.html)

In [23]:
# check unique values in categorical column
df['neighbourhood_group'].unique()

array(['Brooklyn', 'Manhattan', 'Manhatan', 'Broklyn', 'Queens',
       'Staten Island', 'Bronx'], dtype=object)

In [24]:
# show amount of categories in column
len(df['neighbourhood'].unique())

218

In [25]:
df['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Shared room'], dtype=object)

In [26]:
df.memory_usage(deep=True).sum()

17402573

In [27]:
# for categorical data with small amounts of categories, converting them to dtype category can save memory and make some operations more efficient 

df['room_type'] = df['room_type'].astype('category')

In [28]:
df['room_type'].dtype

CategoricalDtype(categories=['Entire home/apt', 'Private room', 'Shared room'], ordered=False)

In [29]:
df['room_type'].unique()

['Private room', 'Entire home/apt', 'Shared room']
Categories (3, object): ['Entire home/apt', 'Private room', 'Shared room']

In [30]:
df.memory_usage(deep=True).sum()

14701138

### Convert to datetime 

In [31]:
df['last_review_date'].sample(10)

25592    43633
10312    43640
13739    42961
2493     43652
20703    42961
34053    43647
7129     43589
8293     43461
4649     43498
8614     42283
Name: last_review_date, dtype: int64

In [32]:
df['last_review_date'] = pd.to_datetime(df['last_review_date'], unit='D', origin='1899-12-30')

Python strftime cheatsheet https://strftime.org/

In [33]:
df.dtypes

id                              int64
name                           object
host_id                         int64
host_name                      object
neighbourhood_group            object
neighbourhood                  object
latitude                      float64
longitude                     float64
room_type                    category
price                           int64
minimum_nights                  int64
number_of_reviews               int64
last_review_date       datetime64[ns]
reviews_per_month             float64
listings_count                  int64
availability_365                int64
dtype: object

In [34]:
df['last_review_date'].head()

0   2018-10-19
1   2019-05-21
2   2019-07-05
3   2018-11-19
4   2019-06-22
Name: last_review_date, dtype: datetime64[ns]

### Changing numeric types

ex: changing int types (int8 | int16 | int32 | int64)

- the numbers stands for bit
- int8 can store integers from -128 to 127.
- int16 can store integers from -32768 to 32767.
- int64 can store integers from -9223372036854775808 to 9223372036854775807.

In [35]:
# get maximum from column
df['minimum_nights'].max()

1250

In [36]:
# get minimum from column
df['minimum_nights'].min()

1

In [37]:
df['minimum_nights'].dtype

dtype('int64')

In [38]:
df.memory_usage(deep=True).sum()

14701138

In [39]:
df['minimum_nights'] = df['minimum_nights'].astype('int16')
df['minimum_nights'].dtype

dtype('int16')

In [40]:
df.memory_usage(deep=True).sum()

14468056

## Missing data

In [41]:
# Checking which columns have missing data
df.isnull().sum()

id                        0
name                   1714
host_id                   0
host_name               325
neighbourhood_group       0
neighbourhood             0
latitude                  0
longitude                 0
room_type                 0
price                     0
minimum_nights            0
number_of_reviews         0
last_review_date          0
reviews_per_month        35
listings_count            0
availability_365          0
dtype: int64

### percentage of missing data

In [42]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
id,id,0.0
name,name,4.412181
host_id,host_id,0.0
host_name,host_name,0.836615
neighbourhood_group,neighbourhood_group,0.0
neighbourhood,neighbourhood,0.0
latitude,latitude,0.0
longitude,longitude,0.0
room_type,room_type,0.0
price,price,0.0


### How to deal with missing data?

Dealing with missing data is not simple task, you have need to consider why the data is missing in the first place, and domain knowledge to know what to impute. 

There also isn't a specific threshold for what percentage of missing data is accepted, it depends on the data.

If you mess it up, you will introduce bias to your data.  


#### Techniques 
1. Drop feature 
1. Drop the rows
1. Impute missing values (manually or automatically)

Read this [article](https://towardsdatascience.com/how-to-handle-missing-data-8646b18db0d4) to go deeper into this topic

In [43]:
# copying original data set is good practice when you're experimenting with things

df_ori = df.copy()

#### Dropping columns with missing values

- usually worst strategy unless it has a lot of missing data (over 80 or 90%), or feature is not useful

In [44]:
# Create list of columns you want to drop
colsToDrop = ['name','host_name','reviews_per_month']

# axis = 1 / 'columns' stands for column
# axis = 0 / 'index' stands for row
df.drop(colsToDrop, axis = 'columns', inplace = True)

In [45]:
df.isnull().sum()

id                     0
host_id                0
neighbourhood_group    0
neighbourhood          0
latitude               0
longitude              0
room_type              0
price                  0
minimum_nights         0
number_of_reviews      0
last_review_date       0
listings_count         0
availability_365       0
dtype: int64

In [46]:
print(df.shape)

(38847, 13)


#### Remove rows with missing values
- Losing even more information (from other columns), so not the best method

In [47]:
# restoring original dataset
df = df_ori.copy()

# before .dropna
print(df.shape) 

# after .dropna
print(df.dropna().shape)

(38847, 16)
(36781, 16)


#### Impute missing values


- A constant value related to the data, such as 0 for `reviews_per_month`, or "None" for `name`.
- The value before or after the data point (backward fill, forward fill)
- Summary statistics such as mean, median or mode value for the column.
- A value estimated by algorithms or ML models like KNN.

More detailed imputation techniques in this [article](https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779)  

In [48]:
df.head()

0     NaN
1    0.38
2     NaN
3     NaN
4    0.59
Name: reviews_per_month, dtype: float64

#### Mean of reviews_per_month

In [51]:
# restoring original dataset
df = df_ori.copy()

# calculating the mean
reviews_mean = df['reviews_per_month'].mean()
print(reviews_mean)

# filling missing values with mean
df = df.fillna(value = {'reviews_per_month' : reviews_mean})
df['reviews_per_month'].head()

1.3736321240853344


0    1.373632
1    0.380000
2    1.373632
3    1.373632
4    0.590000
Name: reviews_per_month, dtype: float64

#### median of reviews_per_month

In [56]:
# restoring original dataset
df = df_ori.copy()

# calculating the median
reviews_median = df['reviews_per_month'].median()
print(reviews_median)

# filling missing values with median
# df.fillna(value = {'reviews_per_month' : reviews_median}).head()['reviews_per_month']

df = df.fillna(value = {'reviews_per_month' : reviews_median})
df['reviews_per_month'].head()

0.72


0    0.72
1    0.38
2    0.72
3    0.72
4    0.59
Name: reviews_per_month, dtype: float64

In [58]:
# restoring original dataset
df = df_ori.copy()

##### Imputing with bfill and ffill

bfill
- bfill stands for backwards fill
- means filling in missing value with value after it (fill it backwards)

ffill
- ffill stands for forward fill
- means filling in missing value with value before it (fill it forwards)


In [59]:
df['reviews_per_month'].head()

0     NaN
1    0.38
2     NaN
3     NaN
4    0.59
Name: reviews_per_month, dtype: float64

In [60]:
# imputing with bfill
df['reviews_per_month'].bfill().head()

0    0.38
1    0.38
2    0.59
3    0.59
4    0.59
Name: reviews_per_month, dtype: float64

In [61]:
# imputing with ffill
df['reviews_per_month'].ffill().head()

0     NaN
1    0.38
2    0.38
3    0.38
4    0.59
Name: reviews_per_month, dtype: float64

In [63]:
# restoring original dataset
df = df_ori.copy()

In [None]:
##### Imputing manually with pandas fillna

In [64]:
# We will deal with missing values manually
missing_val = { 'reviews_per_month': 0,
                'name':'None',
                'host_name':'None'}

df = df.fillna(value = missing_val)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review_date,reviews_per_month,listings_count,availability_365
0,2539,,2787,,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.0,6,365
1,2595,Skylit Midtown Castle,2845,,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3831,,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,0.0,1,194
3,5022,,7192,Laura,Manhatan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.0,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


## Split Date column

In [65]:
df['last_review_date'].head()

0   2018-10-19
1   2019-05-21
2   2019-07-05
3   2018-11-19
4   2019-06-22
Name: last_review_date, dtype: datetime64[ns]

In [66]:
# Int64 are pandas data types that can handle missing values
df['year'] = df['last_review_date'].dt.year.astype('Int64')
df['month'] = df['last_review_date'].dt.month.astype('Int64')
df['day'] = df['last_review_date'].dt.day.astype('Int64')

In [67]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review_date,reviews_per_month,listings_count,availability_365,year,month,day
0,2539,,2787,,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.0,6,365,2018,10,19
1,2595,Skylit Midtown Castle,2845,,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,2019,5,21
2,3831,,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,0.0,1,194,2019,7,5
3,5022,,7192,Laura,Manhatan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.0,1,0,2018,11,19
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,2019,6,22


## Data Inconsistencies

#### Are the values in the column `availability_365` within 365 ?

In [68]:
# show statistics of a column
df['availability_365'].describe()

count    38847.000000
mean       116.172858
std        135.411287
min         -1.000000
25%          0.000000
50%         55.000000
75%        229.000000
max       5205.000000
Name: availability_365, dtype: float64

In [70]:
print('There are {} values greater then 365.'.format(df['availability_365'][df['availability_365'] > 365].count()))
print('There are {} values lesser then 0.'.format(df['availability_365'][df['availability_365'] < 0].count()))


There are 155 values greater then 365.
There are 9 values lesser then 0.


In [71]:
df.loc[df['availability_365'] > 365, 'availability_365'] = 365
df.loc[df['availability_365'] < 0, 'availability_365'] = 0
df['availability_365'].describe()

count    38847.000000
mean       115.091050
std        129.921701
min          0.000000
25%          0.000000
50%         55.000000
75%        229.000000
max        365.000000
Name: availability_365, dtype: float64

### Categorical data inconsistency

In [72]:
# show all neighbourhood_groups
df['neighbourhood_group'].unique()

array(['Brooklyn', 'Manhattan', 'Manhatan', 'Broklyn', 'Queens',
       'Staten Island', 'Bronx'], dtype=object)

In [73]:
wrong = ['Brokln', 'Manhatan']
right = ['Brooklyn', 'Manhattan']

# replace the wrong spelling with right spelling
df['neighbourhood_group'].replace(to_replace = wrong, 
                                  value = right, 
                                  inplace = True)

# changing values removes the category date type
df['neighbourhood_group'].unique()

array(['Brooklyn', 'Manhattan', 'Broklyn', 'Queens', 'Staten Island',
       'Bronx'], dtype=object)

This was a simple example with only 6 categories. In real life scenarios there could be hundreds of categories. To deal with that you can use these techniques.

1. Preprocess the text (lowercase, strip whitespace)
1. Use fuzzy matching to find similar words, and replace them. ([example](https://www.kaggle.com/rtatman/data-cleaning-challenge-inconsistent-data-entry/?scriptVersionId=3012975&cellId=14))

### Duplicate rows

In [74]:
# Check if there any duplicate rows?
df.duplicated().any()

True

In [75]:
# How many rows are duplicated?
df.duplicated().sum()

4

In [76]:
# Print duplicated rows
df[df.duplicated()]

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review_date,reviews_per_month,listings_count,availability_365,year,month,day
11101,9920941,Jewel On Parkside,11546043,Liz And Cesar,Brooklyn,Prospect-Lefferts Gardens,40.65586,-73.95934,Private room,65,28,11,2019-06-01,0.26,3,320,2019,6,1
11108,9924665,Private Room in Greenpoint,26714887,Anthony,Brooklyn,Greenpoint,40.73036,-73.95453,Private room,60,1,3,2016-08-23,0.07,2,0,2016,8,23
11129,9935992,Lux Studio Apt in Downtown NYC - Gym included!,30283594,Kara,Manhattan,Financial District,40.70749,-74.01413,Entire home/apt,169,30,2,2017-11-17,0.06,121,364,2017,11,17
11280,10035098,Upper West ... Museums/Parks/Subway,51513795,Doug,Manhattan,Upper West Side,40.77845,-73.97985,Entire home/apt,175,2,1,2016-01-03,0.02,1,0,2016,1,3


In [77]:
# Remove duplicated rows
df.drop_duplicates(inplace = True)

In [78]:
# Check again if there any duplicate rows?
df.duplicated().any()

False

## Outliers

> You should be very carefull handling outliers and have a good knowledge about what the data represents and information about where the data come from and what they mean ; it also depends on what analysis one is planning to perform.

Some articles about outliers:
- https://www.freecodecamp.org/news/what-is-an-outlier-definition-and-how-to-find-outliers-in-statistics/
- https://www.pluralsight.com/guides/cleaning-up-data-from-outliers

In [None]:
# Let's look at the price field
df['price'].describe()

In [None]:
# Boxplot could help us to visualise outliers
df.boxplot(column=['price'],figsize=(5,10));
plt.show()

In [None]:
# Let's say we decided to treat everything above 99 percentile and everything lower 1 percentile as outliers
small = df['price'].quantile(0.01)
big = df['price'].quantile(0.99)
print(small)
print(big)

In [None]:
# Trimming the outliers
index = df[(df['price'] >= big)|(df['price'] <= small)].index
df.drop(index, inplace=True)

In [None]:
# Let's plot again to see the result
df.boxplot(column=['price'],figsize=(5,10));
plt.show()

## Save final cleaned dataset

In [None]:
df.to_csv("clean_data")