<h1>I. Import libraries </h1>

In [None]:
import pandas as pd
import warnings
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")

<h1> II. Define functions</h1>

Define a helper function that draws a horizontal bar plot.

In [None]:
def draw_barplot_horizontal(name,value,fig_width=16,fig_height=20,title=''):
    """Draw an annotated horizontal bar plot and display it.

    Parameters
    ----------
    name : sequence
        Bar labels, passed to ``Axes.barh`` as the y positions.
    value : sequence of numbers
        Bar lengths, passed to ``Axes.barh`` as the widths.
    fig_width, fig_height : float, optional
        Figure size handed to ``plt.subplots(figsize=...)``.
    title : str, optional
        Title text, aligned to the left of the axes.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(fig_width, fig_height))

    # Horizontal bar plot
    ax.barh(name, value)

    # Hide all four axes spines
    for side in ('top', 'bottom', 'left', 'right'):
        ax.spines[side].set_visible(False)

    # Hide the tick marks on both axes (labels stay visible)
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')

    # Add padding between axes and tick labels
    ax.xaxis.set_tick_params(pad=5)
    ax.yaxis.set_tick_params(pad=10)

    # Add x, y gridlines. The on/off flag is passed positionally because its
    # keyword was renamed from `b` to `visible` in Matplotlib 3.5 and the old
    # spelling no longer works on current releases.
    ax.grid(True, color='grey', linestyle='-.', linewidth=0.5, alpha=0.2)

    # Put the first entry at the top of the chart
    ax.invert_yaxis()

    # Annotate each bar with its value, placed just past the end of the bar
    for bar in ax.patches:
        ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
                str(round(bar.get_width(), 2)),
                fontsize=10, fontweight='bold',
                color='grey')

    ax.set_title(title, loc='left')

    # Add text watermark in figure coordinates
    fig.text(0.9, 0.15, 'MiBA_A10', fontsize=12,
             color='grey', ha='right', va='bottom',
             alpha=0.7)

    # Show plot
    plt.show()

<h1>III. Import dataset</h1>

Load the dataset and take a first look at it.

In [None]:
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
listings = pd.read_csv("./Dataset/listings.csv")
# Preview the first five rows.
listings.head(5)

<h1>IV. Data cleaning</h1>

In [None]:
# Show the column labels of the raw listings frame (the next cell keeps a subset of them).
listings.columns

In [None]:
# Columns retained for the analysis; every other column of `listings` is left out.
keep_var = [
    'id', 'name', 'description',
    'neighborhood_overview',
    'host_is_superhost', 'neighbourhood',
    'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
    'longitude', 'property_type', 'room_type', 'accommodates',
    'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
    'minimum_nights', 'number_of_reviews', 'availability_365',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'reviews_per_month',
]
# Take an explicit copy so the cleaning cells below work on an independent
# frame and never on a slice of `listings`.
listings_cleaned = listings.loc[:, keep_var].copy()
listings_cleaned.shape

Check whether there are duplicated rows.

In [None]:
# Count rows that duplicate an earlier row across all kept columns.
listings_cleaned.duplicated().sum()

Check for missing values in each column.

In [None]:
# Number of missing values per column.
listings_cleaned.isnull().sum() 

In [None]:
# Discard every row that is missing any of these four fields.
listings_cleaned = listings_cleaned.dropna(
    subset=['name', 'bathrooms_text', 'bedrooms', 'beds'],
)

In [None]:
# Replace missing free-text fields with an empty string.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selected with df[col] is chained assignment, which is deprecated and
# does not update the parent frame when copy-on-write is active.
text_cols = ['neighborhood_overview', 'neighbourhood', 'description']
listings_cleaned[text_cols] = listings_cleaned[text_cols].fillna('')

In [None]:
# Fill a missing superhost flag with 'f'.
# Assigned back explicitly instead of a chained in-place fillna, which does
# not reliably write through to the frame.
listings_cleaned['host_is_superhost'] = listings_cleaned['host_is_superhost'].fillna('f')

In [None]:
# Inspect the price column before it is converted to float in the next cell.
listings_cleaned['price']

In [None]:
# Strip the currency symbol and thousands separators, then convert to float.
# The character class and regex=True are spelled out because the default of
# `regex` differs between pandas versions and a bare '$' is a regex anchor.
# NOTE: this cell is not re-runnable; once `price` is float, `.str` raises.
listings_cleaned['price'] = (
    listings_cleaned['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)
listings_cleaned['price']

In [None]:
# Preview the first number (integer or decimal) found in each bathrooms_text
# value. The result is only displayed, not stored on the frame.
listings_cleaned['bathrooms_text'].str.extract(r'(\d+(?:\.\d+)?)')


In [None]:
# Summary statistics of the numeric price column.
listings_cleaned['price'].describe()

In [None]:
# Marker style for the outlier points: small red dots.
red_square = {'markerfacecolor': 'r', 'markeredgecolor': 'r', 'marker': '.'}
# Horizontal box plot of price, with the x axis clipped to 0-1000.
listings_cleaned['price'].plot.box(
    xlim=(0, 1000),
    vert=False,
    flierprops=red_square,
    figsize=(16, 2),
);

In [None]:
# Mean price per neighbourhood, highest first, as a {neighbourhood: mean} dict.
# `price` is selected before aggregating: calling mean() on the whole grouped
# frame tries to average the text columns too, which raises on pandas >= 2.0.
# NOTE(review): this runs before the price-outlier filter in the next cell, so
# the averages include those rows. Confirm that is intended.
neighbourhood_cleansed = (
    listings_cleaned
    .groupby('neighbourhood_cleansed')['price']
    .mean()
    .sort_values(ascending=False)
    .to_dict()
)

In [None]:
# Remove listings priced above 400 or at exactly 0.
price_out_of_range = (listings_cleaned['price'] > 400) | (listings_cleaned['price'] == 0)
listings_cleaned.drop(index=listings_cleaned.loc[price_out_of_range].index, inplace=True)


In [None]:
# Plot the per-neighbourhood mean prices computed above (title typo fixed).
draw_barplot_horizontal(
    list(neighbourhood_cleansed.keys()),
    list(neighbourhood_cleansed.values()),
    title='Apartment average price in different Neighbourhood',
)

In [None]:
# Number of listings per property type, most frequent first.
listings_cleaned['property_type'].value_counts()

In [None]:
# Keep only the two rental-unit property types; drop every other row.
kept_property_types = ['Entire rental unit', 'Private room in rental unit']
other_property_type = ~listings_cleaned['property_type'].isin(kept_property_types)
listings_cleaned.drop(index=listings_cleaned.loc[other_property_type].index, inplace=True)

In [None]:
# Pull a size figure out of the free-text description: the first run of 2-3
# digits that is followed (optionally after one whitespace character) by
# s/m/S/M. Only the digits are captured, so no second pass is needed to strip
# the unit. The earlier `.str.replace("\D", "")` was a literal replace on
# pandas >= 2.0 and left the unit in place.
# Raw string avoids invalid-escape warnings for `\d` and `\s`.
# NOTE(review): the pattern also matches text such as "10 minutes". Confirm
# this heuristic is acceptable. Values stay strings (NaN when nothing matches).
listings_cleaned['size'] = listings_cleaned['description'].str.extract(
    r'(\d{2,3})\s?[smSM]', expand=False
)
listings_cleaned['size']

In [None]:
# Recount missing values per column after the cleaning steps above.
listings_cleaned.isnull().sum() 

<h1>V. Data engineering</h1>

In [None]:
# Subset of columns used for the size-related work.
# The columns are passed to .loc as a single list; separate positional labels
# are read as extra indexers and raise.
# NOTE(review): the source frame is assumed to be `listings_cleaned`, because
# `listings_cleaned_size` is not defined anywhere earlier in the notebook.
listings_cleaned_size = listings_cleaned.loc[
    :, ['bedrooms', 'beds', 'accommodates', 'size', 'property_type']
].copy()