In [None]:
%run data_profiling.ipynb
%run data_transformation.ipynb
%run package_import.ipynb

# Read data

Data source: Kaggle (https://www.kaggle.com/airbnb/seattle/data)

***Content***

The following Airbnb activity is included in this Seattle dataset:

***Listings***, including full descriptions and average review score

***Reviews***, including unique id for each reviewer and detailed comments

***Calendar***, including listing id and the price and availability for that day

## Read calendar.csv

In [None]:
# Load the per-day calendar data; the path is relative to the notebook's working directory.
df_calendar = pd.read_csv('Seattle_Airbnb/calendar.csv')

## Data profiling

### Rows and columns

* The number of rows is 1,393,570
* The number of columns is 4

In [None]:
# populate_rows_columns is not defined in this notebook; presumably it reports the
# frame's row and column counts — confirm against its definition.
populate_rows_columns(df_calendar)

### Data quality check

* Only the column "price" has null values, accounting for 33%. Rows with Null values in column "price" are to be deleted.

In [None]:
# Convert the "price" column from string to float.
# transform_prices_column is defined outside this notebook; presumably it strips "$" and ","
# before casting — confirm against its definition.
df_calendar['price'] = transform_prices_column(df_calendar,'price')

In [None]:
# Parse the "date" column into pandas datetime values.
# The conversion stays best-effort (the cell never aborts the notebook), but only the
# errors that a missing column or unparseable values can raise are caught, and the
# reason is printed instead of being silently discarded.
try:
    df_calendar['date'] = pd.to_datetime(df_calendar['date'])
except (KeyError, TypeError, ValueError) as exc:
    print("Could not convert df_calendar['date'] to datetime:", repr(exc))

In [None]:
# Profile the calendar frame with the external main_data_profiling helper; as the
# cell's last expression, its return value is rendered as the cell output.
main_data_profiling(df_calendar)

### Check outliers

* Number of rows with price less than the minimum outlier value (-52.5): 0 rows
* Number of rows with price greater than the maximum outlier value (287.5): 66,000 rows

In [None]:
# Outlier summary for "price" via the external populate_outlier helper.
# NOTE(review): the bounds quoted in the markdown above look like 1.5 x IQR fences — confirm.
populate_outlier(df_calendar, 'price')

* boxplot showing outliers

In [None]:
# Box plot of the "price" column to visualise its spread and outliers.
df_calendar.boxplot(column=['price'])

#### check prices identified as outliers to see whether they are actual outliers or not

* check how many rows with outliers vs no outliers

In [None]:
# NOTE(review): this entire cell is commented out and is not executed. If re-enabled it
# would flag prices above the upper outlier bound and compute, per date, the number of
# priced rows, the number of outliers, and their ratio ('%_outliers').
# df_outliers = populate_outlier(df_calendar, 'price')
# outlier_max_value = df_outliers.iloc[1,0]
# df_calendar['date'] = pd.to_datetime(df_calendar['date'])
# df_calendar['month']= df_calendar['date'].dt.month

# df_calendar['outlier_flag'] = df_calendar['price'].apply(lambda x: 1 if pd.notna(x) and x>outlier_max_value else 0)

# # monthly_price = df_calendar[(df_calendar['price'].notna())].groupby(by=['date'], as_index=False)['price'].count()
# daily_price = df_calendar[(df_calendar['price'].notna())].groupby(by=['date'], as_index=False).agg({'outlier_flag': ['count', 'sum']})
# daily_price.columns = ['date', 'rows', 'outliers']
# daily_price['%_outliers'] = daily_price['outliers']/daily_price['rows']

In [None]:
# NOTE(review): disabled cell (not executed). It would draw a bar chart of '%_outliers'
# per date from daily_price, which is itself only built by commented-out code.
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=(25,10))
# daily_price.plot.bar(x='date', y='%_outliers', ax=ax)

In [None]:
# NOTE(review): disabled cell (not executed). It would scatter-plot the per-date outlier
# count against the per-date row count and annotate every point with its date.
# x = daily_price['rows']
# y = daily_price['outliers']
# text = daily_price['date'].dt.date

# fig, ax = plt.subplots(figsize=(25,20))
# ax.scatter(x, y)
# plt.title('Total number of prices vs outliers')
# plt.xlabel('# prices ')
# plt.ylabel('# outliers')
# for i, txt in enumerate(text):
#     ax.annotate(txt, (x[i],y[i]))

### Check Duplication

* No duplicated rows were found when comparing across all columns

In [None]:
# Duplicate-row check on the calendar frame; check_duplication is defined outside this
# notebook, so its exact output is not visible here.
check_duplication(df_calendar)

## Read listings.csv

In [None]:
# Load the listings data; the path is relative to the notebook's working directory.
df_listings = pd.read_csv(r'Seattle_Airbnb/listings.csv')

In [None]:
# Treat the literal string "none" in "experiences_offered" as a missing value so the
# column is counted as null by the profiling below.
# Still best-effort when the column is absent, but only that case is tolerated and it
# is reported; any other error now surfaces instead of being silently swallowed.
try:
    df_listings['experiences_offered'] = df_listings['experiences_offered'].replace("none", np.nan)
except KeyError as exc:
    print("Column not found, skipping the 'none' -> NaN replacement:", repr(exc))
    

In [None]:
# Run the shared price-cleaning helper over each price-like column of the listings
# frame, one column at a time and in the same order as before.
for money_col in ('price', 'weekly_price', 'monthly_price',
                  'security_deposit', 'cleaning_fee', 'extra_people'):
    df_listings[money_col] = transform_prices_column(df_listings, money_col)

### Data profiling

* ***Null values***: the columns "license" and "experiences_offered" are 100% null; the column "square_feet" has 97% null values. Columns with over 97% null values need to be removed in the ETL process
* ***only one unique value***: ['scrape_id', 'last_scraped', 'experiences_offered', 'market', 'country_code', 'country', 'has_availability', 'calendar_last_scraped',
 'requires_license',
 'license',
 'jurisdiction_names']. They are unlikely to be useful and should be removed in the ETL process

In [None]:
# Profile the listings frame and keep the result; later cells read its 'column_names',
# '%_null_values', 'unique_count', 'num_lower_outliers' and 'num_higher_outliers' columns.
df_listings_profiling = main_data_profiling(df_listings)

In [None]:
# Display the profiling results for all columns, sorted by the profiled column's name
df_listings_profiling.sort_values(by=['column_names'])

#### null value check

In [None]:
# Names of the profiled columns whose null ratio is exactly 1 (i.e. 100% null)
fully_null = df_listings_profiling['%_null_values'] == 1
list(df_listings_profiling.loc[fully_null, 'column_names'])

#### only one unique value check

In [None]:
# list columns with only one unique value
# Select the profiled column names from the 'column_names' field; taking `.columns` of
# the filtered frame would only return the profiling table's own column labels.
list(df_listings_profiling.loc[df_listings_profiling['unique_count'] == 1, 'column_names'])

#### use boxplot to check the columns with outliers

In [None]:
# Pick the profiled columns for which both outlier counts are present and at least
# one of them is positive, then hand those column names to the box-plot helper.
lower_counts = df_listings_profiling['num_lower_outliers']
higher_counts = df_listings_profiling['num_higher_outliers']
counts_present = lower_counts.notna() & higher_counts.notna()
any_outliers = (lower_counts > 0) | (higher_counts > 0)
outliers_col = list(df_listings_profiling.loc[counts_present & any_outliers, 'column_names'])
boxplot_outliers(df_listings, outliers_col)

In [None]:
# Duplicate-row check on the listings frame (helper defined outside this notebook).
check_duplication(df_listings)

## Read reviews.csv

In [None]:
# Load the reviews data; the path is relative to the notebook's working directory.
df_reviews = pd.read_csv(r'Seattle_Airbnb/reviews.csv')

In [None]:
# Presumably reports the row and column counts of the reviews frame (external helper — confirm).
populate_rows_columns(df_reviews)

In [None]:
# Parse the review "date" column into pandas datetime values before profiling.
df_reviews['date'] = pd.to_datetime(df_reviews['date'])

In [None]:
# Profile the reviews frame; as the cell's last expression, the returned value is
# rendered as the cell output.
main_data_profiling(df_reviews)

In [None]:
# Duplicate-row check on the reviews frame (helper defined outside this notebook).
check_duplication(df_reviews)