In [230]:
import pandas as pd
import sklearn as skl
from sklearn import feature_selection
from load_data import load_data
pd.set_option('display.max_columns', None)
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

ModuleNotFoundError: No module named 'statsmodels'

In [195]:
price, listings, reviews = load_data()

In [204]:
c = ["host_location", "host_is_superhost", "host_identity_verified", "neighbourhood_cleansed", "bathrooms_text", "has_availability", "instant_bookable"]
list_cat = listings.filter(c)

# Inspect columns

First we can have a look at the amount of NAs per feature

In [205]:
list_cat.isna().sum()

host_location             32
host_is_superhost         11
host_identity_verified    11
neighbourhood_cleansed     0
bathrooms_text             6
has_availability           0
instant_bookable           0
dtype: int64

There are only a few missing values for the host_location, host_is_superhost and host_identity_verfied. We will just drop them for the moment.

0       False
1        True
2       False
3       False
4       False
        ...  
6971    False
6972    False
6973    False
6974    False
6975    False
Length: 6976, dtype: bool

In [208]:
na_filter = list_cat.isna().any(axis = 1)
price = price[~na_filter]
list_cat = list_cat.dropna()
list_cat.isna().sum()


host_location             0
host_is_superhost         0
host_identity_verified    0
neighbourhood_cleansed    0
bathrooms_text            0
has_availability          0
instant_bookable          0
dtype: int64

## Lets have a look at each variable

### host_location

In [209]:
pd.unique(list_cat["host_location"])

array(['Dublin  Ireland', 'Dublin, County Dublin, Ireland',
       'Tralee, County Kerry, Ireland', 'Dublin, Dublin, Ireland',
       'Lucan, Co.Dublin. Ireland', 'County Kildare, Ireland',
       'County Dublin, Ireland', 'Salvador Brazil, and Dublin Ireland',
       'Lucan, County Dublin, Ireland', 'Istanbul, İstanbul, Turkey',
       'Ireland', 'Singapore, Singapore', 'Rathfarnham, Dublin, Ireland',
       'Trim, County Meath, Ireland', 'Dublin, Leinster, Ireland',
       'Dublin 2, County Dublin, Ireland', 'Smithfield, Dublin, Ireland',
       'ireland', 'Dublin 8, County Dublin, Ireland',
       'Sallynoggin, County Dublin, Ireland',
       'Luxembourg, Luxembourg District, Luxembourg',
       'New York, New York, United States', 'London',
       'Dublin 6, County Dublin, Ireland', 'Cupramontana, Marche, Italy',
       'Ranelagh, Dublin, Ireland',
       'Tübingen, Baden-Württemberg, Germany', 'Toronto, Ontario, Canada',
       'Glasnevin, Dublin, Ireland', 'County Cork, Ireland',

There are a lot of different entries for Dublin. Might be reasonable to store them in one category.
Therefore turn all values containing Dublin into one category "Dublin, Ireland"

In [212]:
fil = list_cat["host_location"].str.contains("Dublin|DB", case = False, na = False)
list_cat["host_location"][fil] = "Dublin, Ireland"
list_cat["host_location"]

0                     Dublin, Ireland
2                     Dublin, Ireland
3                     Dublin, Ireland
4                     Dublin, Ireland
5       Tralee, County Kerry, Ireland
                    ...              
6971                  Dublin, Ireland
6972                  Dublin, Ireland
6973                  Dublin, Ireland
6974                  Dublin, Ireland
6975                               IE
Name: host_location, Length: 6938, dtype: object

Could be reasonable to use the home country of the owner. Download csv with all countries.

In [213]:
country_abr = pd.read_csv("https://gist.githubusercontent.com/radcliff/f09c0f88344a7fcef373/raw/2753c482ad091c54b1822288ad2e4811c021d8ec/wikipedia-iso-country-codes.csv")
country_list = list(country_abr.iloc[:,0])
abr_list = list(country_abr.iloc[:,1])

In [215]:
list_cat["host_location_country"] = list_cat["host_location"].copy()

for i in list(country_list):
    fil = list_cat["host_location"].str.contains(i, case = False, na = False)
    list_cat["host_location_country"][fil] = i

for i,j in enumerate(list(abr_list)):
    fil = list_cat["host_location"].str.contains(str(j), case = True, na = False)
    list_cat["host_location_country"][fil] = country_list[i]

list_cat["host_location_country"].value_counts()


  fil = list_cat["host_location"].str.contains(i, case = False, na = False)


Ireland                    5902
United Kingdom              406
France                       94
United States                88
Spain                        64
                           ... 
Jordan                        1
Greece                        1
South Africa                  1
Prague, Prague, Czechia       1
Albania                       1
Name: host_location_country, Length: 64, dtype: int64

Surprisingly most of the hosts come from ireland

In [216]:
pd.unique(list_cat["host_location_country"])

array(['Ireland', 'Turkey', 'Singapore', 'Luxembourg', 'United States',
       'London', 'Italy', 'Germany', 'Canada', 'United Kingdom', 'Spain',
       'Belgium', 'Mexico', 'France', 'Sweden', 'United Arab Emirates',
       'Denmark', 'Netherlands', 'Colombia', 'Norway', 'Australia',
       'Switzerland', 'Prague, Prague, Czechia', 'Argentina',
       'South Africa', 'Brazil', 'Greece', 'Austria', 'Jordan', 'Serbia',
       'Poland', 'Thailand', 'Portugal', 'Chad',
       'Jeju-si, Jeju-do, South Korea', 'Chile', 'Kaohsiung City, Taiwan',
       'New Zealand', 'Palau', 'Czech Republic', 'India', 'Iceland',
       'Bulgaria', 'Maracaibo, Venezuela', 'Croatia', 'China', 'Israel',
       'Moscow, Russia', 'Cayman Islands', 'Réunion', 'Lebanon',
       'Gold Coast, Queensland', 'Taichung, Taiwan',
       '53.357852, -6.259787',
       'Ho Chi Minh City, Ho Chi Minh City, Vietnam', "Côte d'Ivoire",
       'Korea, Republic of', 'Cambodia', 'Romania', 'Bahrain', 'Finland',
       'Latvia', '

### host_is_superhost

In [217]:
list_cat["host_is_superhost"].value_counts()

f    5711
t    1227
Name: host_is_superhost, dtype: int64

seems fine

### host_identity_verified

In [218]:
list_cat["host_identity_verified"].value_counts()

t    4946
f    1992
Name: host_identity_verified, dtype: int64

seems fine too

### neighbourhood_cleansed

In [219]:
list_cat["neighbourhood_cleansed"].value_counts()

Dublin City              5309
Dn Laoghaire-Rathdown     726
Fingal                    627
South Dublin              276
Name: neighbourhood_cleansed, dtype: int64

perfect

### bathrooms_text

In [220]:
list_cat["bathrooms_text"].value_counts()

1 bath               2402
1 shared bath        1290
1 private bath       1055
2 baths               774
1.5 baths             428
1.5 shared baths      284
2.5 baths             228
3 baths               125
2 shared baths        119
3.5 baths              49
4 baths                38
2.5 shared baths       28
0 baths                24
Private half-bath      18
0 shared baths         16
3 shared baths         13
4.5 baths              11
Shared half-bath        8
Half-bath               7
5 baths                 5
4 shared baths          4
5.5 baths               3
6 baths                 2
3.5 shared baths        2
7.5 baths               1
6.5 baths               1
8.5 baths               1
9 baths                 1
6 shared baths          1
Name: bathrooms_text, dtype: int64

a lot of different options but everythings seems fine

### has_availability

In [221]:
list_cat["has_availability"].value_counts()

t    6781
f     157
Name: has_availability, dtype: int64

nice

### instant_bookable

In [222]:
list_cat["instant_bookable"].value_counts()

f    4641
t    2297
Name: instant_bookable, dtype: int64

cool cool cool

## ANOVA
Let's put every categorical variable in a ANOVA to find out whether the price is significantly different across two or more categories

In [223]:
pd.get_dummies(list_cat["host_location_new"])

KeyError: 'host_location_new'

In [227]:
pd.Categorical(list_cat["host_is_superhost"])

['t', 't', 'f', 't', 't', ..., 'f', 'f', 'f', 'f', 'f']
Length: 6938
Categories (2, object): ['f', 't']

In [229]:
feature_selection.f_classif(X = pd.Categorical(list_cat["host_is_superhost"]), y = price)

ValueError: could not convert string to float: 't'