In [None]:
import pandas as pd
import datetime
from re import sub
from decimal import Decimal

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')

from langdetect import detect

from sklearn import preprocessing

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Raise pandas' display limits so long and wide frames are not truncated
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Importing Data 

In [None]:
# Read the four raw CSV files. The listings key column `id` is renamed to
# `listing_id` as soon as the file is read.
df_listings = pd.read_csv(
    "../../data/new-york-city-airbnb-open-data/listings.csv"
).rename(columns={'id': 'listing_id'})
df_neighborhoods = pd.read_csv(
    "../../data/new-york-city-airbnb-open-data/neighbourhoods.csv"
)
df_reviews = pd.read_csv(
    "../../data/new-york-city-airbnb-open-data/reviews.csv"
)
df_calendar = pd.read_csv(
    "../../data/new-york-city-airbnb-open-data/calendar.csv"
)

# Exploring Listings Dataframe

In [None]:
# Quick-look helpers for the listings frame; uncomment a line to inspect it.
# df_listings.shape
# list(df_listings.columns)
# df_listings.describe()
# df_listings.dtypes
# df_listings.head()
# Displayed result of this cell: number of rows per distinct `last_scraped` value.
df_listings.last_scraped.value_counts()
# df_listings.host_response_time

## Dropping columns with majority NaN values

In [None]:
# Percentage of missing cells per column (NaN count * 100 / number of rows)
percent_missing = df_listings.isna().sum().mul(100).div(len(df_listings))

# One row per column; the percentage is truncated to a whole number.
# reset_index() turns the column-name index into an ordinary `index` column.
missing_value_df = pd.DataFrame(
    dict(
        column_name=df_listings.columns,
        percent_missing=percent_missing.astype('int64'),
    )
).reset_index()

In [None]:
# How many columns share each (whole-number) missing percentage
# missing_value_df.iloc[21:40,:]
missing_value_df.percent_missing.value_counts()

In [None]:
# Names of the columns to be dropped: those whose (integer-truncated)
# percentage of missing values is greater than 50
columns_with_nulls_drop = list(
    missing_value_df.loc[missing_value_df['percent_missing'] > 50, 'column_name']
)

In [None]:
# Display the column names collected in columns_with_nulls_drop.
columns_with_nulls_drop

In [None]:
# Remove the columns listed in columns_with_nulls_drop (df_listings is modified in place)
df_listings.drop(columns=columns_with_nulls_drop, inplace=True)

## Encoding binary values

In [None]:
# Encoding true and false values: every cell equal to the string 'f' becomes 0
# and every cell equal to the string 't' becomes 1. The replacement runs over the
# whole frame in place, so it is not restricted to a list of boolean columns.
df_listings.replace({'f': 0, 't': 1}, inplace=True)

In [None]:
# df_listings.head()

## Transform dates to datetime

In [None]:
# Parse the date-like columns; entries that cannot be parsed become NaT
# (errors='coerce').
# Whole-column assignment (df[cols] = ...) is used on purpose: assigning through
# df.loc[:, cols] writes the parsed values into the existing object-dtype columns
# and can leave their dtype as `object`, which would keep these columns out of
# the dtype-based text / non-text split later in the notebook.
date_columns = ['last_scraped', 'host_since', 'calendar_last_scraped',
                'first_review', 'last_review']
df_listings[date_columns] = df_listings[date_columns].apply(pd.to_datetime, errors='coerce')

In [None]:
# df_listings.iloc[0:20,:].dtypes
# Preview the first listing URLs
df_listings['listing_url'].head()

In [None]:
# df_listings.dtypes

## Separating columns for NLP

In [None]:
# Split the listings by dtype: object columns are kept for the NLP work,
# every other dtype goes to the non-text frame.
df_listings_for_NLP = df_listings.select_dtypes(include='object')
df_listings_non_text = df_listings.select_dtypes(exclude='object')

## Working with numerical data

host_listings_count and host_total_listings_count are always the same except in 5 cases where they are NaN. Therefore
those columns will be dropped

In [None]:
# Number of rows where the two host listing counts are not equal
# (a NaN on either side counts as "not equal")
print(
    df_listings_non_text['host_listings_count']
    .ne(df_listings_non_text['host_total_listings_count'])
    .sum()
)

# The rows themselves
df_listings_non_text[
    df_listings_non_text['host_listings_count'].ne(
        df_listings_non_text['host_total_listings_count']
    )
]

In [None]:
# Drop both host listing count columns (in place)
df_listings_non_text.drop(
    columns=['host_listings_count', 'host_total_listings_count'],
    inplace=True,
)

In [None]:
# Plotting the distribution of numerical and boolean categories:
# DataFrame.hist draws a grid of per-column histograms on a 20x20 inch figure.
# The trailing `;` suppresses the text repr of the returned axes.
df_listings_non_text.hist(figsize=(20,20));

In [None]:
# Remove the columns that hold a single category (in place)
df_listings_non_text.drop(
    columns=[
        'has_availability',
        'host_has_profile_pic',
        'is_business_travel_ready',
        'require_guest_phone_verification',
        'require_guest_profile_picture',
        'requires_license',
    ],
    inplace=True,
)

## Cleaning individual columns

**host_since**

In [None]:
# df_listings_non_text.host_since.value_counts()

# Time each host has been active, measured from `host_since` up to a fixed
# reference date. NOTE(review): 2020-03-14 is hard-coded; presumably it is the
# scraping date -- confirm against the `last_scraped` column.
reference_date = datetime.datetime(2020, 3, 14)
host_tenure = reference_date - df_listings_non_text['host_since']

# Printing mean and median (whole days)
print("Mean days as host:", host_tenure.mean().days)
print("Median days as host:", host_tenure.median().days)

# Store the tenure as a plain number of days and replace missing values with
# the median number of days. The column used to hold timedeltas that were then
# filled with an integer, mixing two incompatible types; it was also filled via
# a chained `inplace` call, which is not guaranteed to write back to the frame.
df_listings_non_text['host_days_active'] = host_tenure.dt.days.fillna(
    host_tenure.median().days
)

***

# NLP Analysis

## Sentiment Analysis of review comments

In [None]:
# df_reviews.shape
# Preview the first rows of the reviews frame.
df_reviews.head()

In [None]:
# A single analyzer shared by every call. The previous lambda constructed a new
# SentimentIntensityAnalyzer for each review it scored.
_sentiment_analyzer = SentimentIntensityAnalyzer()


def vader_polarity_compound(x):
    """Return the VADER compound sentiment polarity of the sentence ``x``.

    Parameters
    ----------
    x : str
        Text to score.

    Returns
    -------
    float
        The ``'compound'`` entry of ``polarity_scores(x)``.
    """
    return _sentiment_analyzer.polarity_scores(x)['compound']


# We can retrieve scores for positive, negative or neutral sentiment.
# We will use the compound: a normalized value: norm_score = score / math.sqrt((score * score) + alpha)
print(_sentiment_analyzer.polarity_scores('VADER is smart, handsome, and funny.'))

In [None]:
# Percentage of reviews whose comment text is missing
"{}% of reviews have empty comments".format(
    df_reviews['comments'].isnull().sum() / len(df_reviews['comments']) * 100
)

In [None]:
# The share of empty comments is negligible, so those rows are removed (in place)
df_reviews.dropna(subset=['comments'], inplace=True)

Using the pre-trained VADER sentiment model from NLTK to create polarity scores for all reviews:

In [None]:
# Compound VADER score for every review comment
df_reviews['polarity'] = df_reviews['comments'].map(vader_polarity_compound)

Function that predicts the language of a text. It needs to be passed a string with a decent number of characters, so the length of the string is checked on the fly before detection is attempted.

In [None]:
def predict_lang(x, min_chars=100):
    """Guess the language of ``x``, or return '' when no guess is made.

    Parameters
    ----------
    x : str
        Text to classify. Any value that is not a ``str`` (for example None or
        a NaN standing in for a missing comment) yields ''.
    min_chars : int, default 100
        Detection is only attempted when ``len(x)`` is strictly greater than
        this value.

    Returns
    -------
    str
        The value returned by ``detect(x)``, or '' when ``x`` is not a string,
        is too short, or ``detect`` raised.
    """
    if not isinstance(x, str) or len(x) <= min_chars:
        return ''
    try:
        return detect(x)
    except Exception:
        # Best effort: a text the detector cannot handle is left unlabelled
        # instead of aborting the whole column.
        return ''

In [None]:
# Predicted language of every review comment ('' when no prediction was made)
df_reviews['review_lang'] = df_reviews['comments'].apply(predict_lang)

We observe that a lot of reviews didn't get a language value. This is because they were too short for the detection to be attempted.

In [None]:
# df_reviews.review_lang.value_counts()

Reviews were exported for both English and Spanish. I checked the sentiment of the Spanish reviews and it wasn't accurate, which leads me to believe that this algorithm works best for English text.

In [None]:
# Write the reviews labelled 'en' to a tab-separated file, without the index
df_reviews.loc[df_reviews['review_lang'] == 'en'].to_csv(
    'reviews_with_sentiment_en.csv', sep='\t', index=False
)

## Cleaning individual columns

**host_response_time**

In [None]:
# Frequency of each response-time category, plus the shape of the text frame
df_listings_for_NLP['host_response_time'].value_counts(), df_listings_for_NLP.shape

In [None]:
# Count the rows whose host_response_time is missing, then express that count
# as a percentage of all rows in df_listings_for_NLP, rounded to one decimal.
print("Null values in host response time:", df_listings_for_NLP.host_response_time.isna().sum())
print(f"Proportion to the hosts WITHOUT response time: {round((df_listings_for_NLP.host_response_time.isna().sum()/len(df_listings_for_NLP))*100, 1)}%")

**Is there a connection between hosts not having response time and the reviews they get in terms of communication**?

In [None]:
# Number of rows without a value for host_response_time which have also not yet had a review
df_host_response_review = pd.concat([df_listings_for_NLP['host_response_time'],
                                     df_listings_non_text['first_review']], axis=1)

# Boolean masks. The column label used here previously carried a trailing
# space ('host_response_time '), which does not exist in the frame.
no_response_time = df_host_response_review['host_response_time'].isna()
no_review_yet = df_host_response_review['first_review'].isna()

n_no_response_time = int(no_response_time.sum())
n_no_response_no_review = int((no_response_time & no_review_yet).sum())

print("Hosts with no response time that don't have a review yet:",
      n_no_response_no_review)

# Share of the hosts without a response time that also have no review yet.
# This line used to print the raw count of missing response times.
if n_no_response_time:
    proportion_no_review = n_no_response_no_review / n_no_response_time
else:
    proportion_no_review = float('nan')  # nothing to divide by
print("Proportion to total hosts with no response time:", proportion_no_review)

In [None]:
# df_listings_non_text.review_scores_communication.value_counts()
# df_listings_non_text.review_scores_communication.isna().sum()

# NOTE: plain assignment -- `tmp` is a second name for the same DataFrame
# object as df_listings, not a copy.
tmp = df_listings

In [None]:
# Keep only the three columns needed for the response-time / review comparison
tmp = tmp[['host_response_time', 'first_review', 'review_scores_communication']]

In [None]:
# Histogram of the response-time values; `;` hides the axes repr
tmp['host_response_time'].hist();

**Is there a correlation between response rate and a positive review?**

In [None]:
# calendar_last_scraped                           datetime64[ns]
# number_of_reviews                                        int64
# number_of_reviews_ltm                                    int64
# first_review                                            object
# last_review                                     datetime64[ns]
# review_scores_rating                                   float64
# review_scores_accuracy                                 float64
# review_scores_cleanliness                              float64
# review_scores_checkin                                  float64
# review_scores_communication                            float64
# review_scores_location                                 float64
# review_scores_value                                    float64

**Is there correlation between starring and host response rate?**

In [None]:
# Label missing response times as "unknown", then show the relative frequency
# of every category (value_counts(normalize=True)).
# The result is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on `frame.column` acts on an intermediate Series and
# is not guaranteed to update the frame itself.
df_listings_for_NLP['host_response_time'] = (
    df_listings_for_NLP['host_response_time'].fillna("unknown")
)
df_listings_for_NLP['host_response_time'].value_counts(normalize=True)

***

## Count the number of unique listings in calendar and listings DF

In [None]:
# Number of distinct listing ids present in the calendar frame
df_calendar.listing_id.nunique()

## Joining calendar with listings. I want to know data from what years are available

In [None]:
# df_listings_date = pd.merge(df_listings, df_calendar, how='inner', on=['listing_id', 'listing_id'])

In [None]:
# Parse the calendar date strings, then derive the calendar year of each row
df_calendar['date'] = pd.to_datetime(df_calendar['date'])
df_calendar['year'] = df_calendar['date'].dt.year

## Years of listings available

In [None]:
# Wrap the year column in a Categorical; its repr lists the distinct years as categories.
pd.Categorical(df_calendar['year'])

## Creating season label

In [None]:
# Season number from the month: Dec/Jan/Feb -> 1, Mar-May -> 2, Jun-Aug -> 3,
# Sep-Nov -> 4.
df_calendar['season'] = (pd.DatetimeIndex(df_calendar['date']).month % 12 + 3) // 3

# Map the season number to its label with explicit bin edges:
# (0, 1] -> winter, (1, 2] -> spring, (2, 3] -> summer, (3, 4] -> autum.
# Passing an integer bin count instead makes pd.cut derive equal-width bins from
# the observed min/max, which mislabels rows whenever the data does not contain
# all four seasons. The label spelling "autum" is kept as-is so existing
# consumers of this column keep working.
df_calendar['season_l'] = pd.cut(df_calendar['season'], bins=[0, 1, 2, 3, 4],
                                 labels=["winter", "spring", "summer", "autum"])

In [None]:
# df_calendar[df_calendar['season']==1]
# Preview the first rows of the calendar frame.
df_calendar.head()
# df_calendar.shape

## Understanding the difference between adjusted price and price

In [None]:
# Convert the price text to a float: remove every character that is not a
# digit or '.', then cast.
# Done with the vectorised string accessor instead of a Python-level
# iterrows() loop, which is very slow on a frame this size; missing prices stay
# NaN instead of raising.
df_calendar['price'] = (
    df_calendar['price']
    .str.replace(r'[^\d.]', '', regex=True)
    .astype(float)
)

In [None]:
# Convert the adjusted price text to a float: remove every character that is
# not a digit or '.', then cast.
# Done with the vectorised string accessor instead of a Python-level
# iterrows() loop; missing values stay NaN instead of raising.
df_calendar['adjusted_price'] = (
    df_calendar['adjusted_price']
    .str.replace(r'[^\d.]', '', regex=True)
    .astype(float)
)

In [None]:
# Row-wise difference: price minus adjusted_price
df_calendar['difference_price_adjusted'] = df_calendar['price'].sub(
    df_calendar['adjusted_price']
)

In [None]:
# df_calendar[df_calendar['difference_price_adjusted']<0]
# conclusion: there is a difference between adjusted price and actual price. We need to further investigate

## Which areas have the most Airbnb properties, and which are the most expensive?

In [None]:
# Rows where the adjusted price is higher than the price (negative difference)
df_calendar.loc[df_calendar['difference_price_adjusted'].lt(0)]