# Applying Feature Engineering for AirBNB data:

In [1]:
import pandas as pd
# Read the raw listings export into a DataFrame, then print its first five rows as text.
df = pd.read_csv("airbnb prices.csv")
print(df.head(n=5))

    room_id  survey_id   host_id    room_type  country       city  borough  \
0  10176931       1476  49180562  Shared room      NaN  Amsterdam      NaN   
1   8935871       1476  46718394  Shared room      NaN  Amsterdam      NaN   
2  14011697       1476  10346595  Shared room      NaN  Amsterdam      NaN   
3   6137978       1476   8685430  Shared room      NaN  Amsterdam      NaN   
4  18630616       1476  70191803  Shared room      NaN  Amsterdam      NaN   

              neighborhood  reviews  overall_satisfaction  accommodates  \
0  De Pijp / Rivierenbuurt        7                   4.5             2   
1             Centrum West       45                   4.5             4   
2          Watergraafsmeer        1                   0.0             3   
3             Centrum West        7                   5.0             4   
4   De Baarsjes / Oud West        1                   0.0             2   

   bedrooms  bathrooms  price  minstay  \
0       1.0        NaN  156.0      NaN

In [2]:
# Report the frame's dimensions as a (rows, columns) tuple.
df.shape

(18723, 20)

### Shape: 20 columns and 18723 rows

In [3]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

room_id                     0
survey_id                   0
host_id                     0
room_type                   0
country                 18723
city                        0
borough                 18723
neighborhood                0
reviews                     0
overall_satisfaction        0
accommodates                0
bedrooms                    0
bathrooms               18723
price                       0
minstay                 18723
name                       52
last_modified               0
latitude                    0
longitude                   0
location                    0
dtype: int64

### From the observation above, we can see that the country, borough, bathrooms and minstay columns consist entirely of missing (NaN) values, so I will simply drop those columns

In [4]:
# Remove the four columns that contain no data at all.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df = df.drop(columns=['country', 'borough', 'bathrooms', 'minstay'], errors="ignore")
print(df.head())

    room_id  survey_id   host_id    room_type       city  \
0  10176931       1476  49180562  Shared room  Amsterdam   
1   8935871       1476  46718394  Shared room  Amsterdam   
2  14011697       1476  10346595  Shared room  Amsterdam   
3   6137978       1476   8685430  Shared room  Amsterdam   
4  18630616       1476  70191803  Shared room  Amsterdam   

              neighborhood  reviews  overall_satisfaction  accommodates  \
0  De Pijp / Rivierenbuurt        7                   4.5             2   
1             Centrum West       45                   4.5             4   
2          Watergraafsmeer        1                   0.0             3   
3             Centrum West        7                   5.0             4   
4   De Baarsjes / Oud West        1                   0.0             2   

   bedrooms  price                                            name  \
0       1.0  156.0        Red Light/ Canal view apartment (Shared)   
1       1.0  126.0  Sunny and Cozy Living room i

In [5]:
# Dimensions as (rows, columns), checked again after the column drop.
df.shape

(18723, 16)

### Shape: 16 columns and 18723 rows

In [6]:
# Per-column tally of missing (NaN) values in the current frame.
df.isna().sum()

room_id                  0
survey_id                0
host_id                  0
room_type                0
city                     0
neighborhood             0
reviews                  0
overall_satisfaction     0
accommodates             0
bedrooms                 0
price                    0
name                    52
last_modified            0
latitude                 0
longitude                0
location                 0
dtype: int64

### Here we can observe that the name column still has 52 missing (NaN) values

# Handling (dropping) missing values in the name column:

In [7]:
# Discard every row that has a missing value in any column, then preview the result.
df = df.dropna(axis=0, how="any")
print(df.head(n=5))

    room_id  survey_id   host_id    room_type       city  \
0  10176931       1476  49180562  Shared room  Amsterdam   
1   8935871       1476  46718394  Shared room  Amsterdam   
2  14011697       1476  10346595  Shared room  Amsterdam   
3   6137978       1476   8685430  Shared room  Amsterdam   
4  18630616       1476  70191803  Shared room  Amsterdam   

              neighborhood  reviews  overall_satisfaction  accommodates  \
0  De Pijp / Rivierenbuurt        7                   4.5             2   
1             Centrum West       45                   4.5             4   
2          Watergraafsmeer        1                   0.0             3   
3             Centrum West        7                   5.0             4   
4   De Baarsjes / Oud West        1                   0.0             2   

   bedrooms  price                                            name  \
0       1.0  156.0        Red Light/ Canal view apartment (Shared)   
1       1.0  126.0  Sunny and Cozy Living room i

In [8]:
# Dimensions as (rows, columns), checked again after the row drop.
df.shape

(18671, 16)

### Shape: 16 columns and 18671 rows

In [9]:
# Final check: number of missing (NaN) values remaining in each column.
df.isna().sum()

room_id                 0
survey_id               0
host_id                 0
room_type               0
city                    0
neighborhood            0
reviews                 0
overall_satisfaction    0
accommodates            0
bedrooms                0
price                   0
name                    0
last_modified           0
latitude                0
longitude               0
location                0
dtype: int64

# The raw data is now fully processed and contains no missing values, so it can be exported to a CSV file for analysis in any BI tool (Tableau / Power BI)

### Converting df to a CSV file

In [10]:
# Write the cleaned frame to disk without the row index column.
df.to_csv(path_or_buf="AirBNB.csv", index=False)