In [29]:
import pandas as pd
import sklearn as skl
from sklearn import feature_selection
from load_data import load_data
pd.set_option('display.max_columns', None)
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
from scipy.stats import kruskal



from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
price, listings, reviews = load_data()

  price = price.str.replace("$","")


In [4]:
# Categorical listing attributes examined in this notebook.
c = [
    "host_location",
    "host_is_superhost",
    "host_identity_verified",
    "neighbourhood_cleansed",
    "bathrooms_text",
    "has_availability",
    "instant_bookable",
]
# Restrict the listings table to exactly those columns.
list_cat = listings.filter(items = c)

# Inspect columns

First, let's look at the number of missing values (NAs) per feature.

In [5]:
list_cat.isna().sum()

host_location             32
host_is_superhost         11
host_identity_verified    11
neighbourhood_cleansed     0
bathrooms_text             6
has_availability           0
instant_bookable           0
dtype: int64

There are only a few missing values for host_location, host_is_superhost, host_identity_verified and bathrooms_text. We will just drop the affected rows for the moment.

In [6]:
# Flag every row that has a missing value in at least one categorical column.
na_filter = list_cat.isna().any(axis = 1)
keep_rows = ~na_filter

# Remove the flagged rows from the target and from the features.
# (assumes price shares list_cat's index — the boolean mask is aligned on it)
price = price.loc[keep_rows]
list_cat = list_cat.dropna(axis = 0, how = "any")

# Sanity check: every count should now be zero.
list_cat.isna().sum()


host_location             0
host_is_superhost         0
host_identity_verified    0
neighbourhood_cleansed    0
bathrooms_text            0
has_availability          0
instant_bookable          0
dtype: int64

## Let's have a look at each variable

### host_location

In [7]:
pd.unique(list_cat["host_location"])

array(['Dublin  Ireland', 'Dublin, County Dublin, Ireland',
       'Tralee, County Kerry, Ireland', 'Dublin, Dublin, Ireland',
       'Lucan, Co.Dublin. Ireland', 'County Kildare, Ireland',
       'County Dublin, Ireland', 'Salvador Brazil, and Dublin Ireland',
       'Lucan, County Dublin, Ireland', 'Istanbul, İstanbul, Turkey',
       'Ireland', 'Singapore, Singapore', 'Rathfarnham, Dublin, Ireland',
       'Trim, County Meath, Ireland', 'Dublin, Leinster, Ireland',
       'Dublin 2, County Dublin, Ireland', 'Smithfield, Dublin, Ireland',
       'ireland', 'Dublin 8, County Dublin, Ireland',
       'Sallynoggin, County Dublin, Ireland',
       'Luxembourg, Luxembourg District, Luxembourg',
       'New York, New York, United States', 'London',
       'Dublin 6, County Dublin, Ireland', 'Cupramontana, Marche, Italy',
       'Ranelagh, Dublin, Ireland',
       'Tübingen, Baden-Württemberg, Germany', 'Toronto, Ontario, Canada',
       'Glasnevin, Dublin, Ireland', 'County Cork, Ireland',

There are many different spellings for Dublin, so it is reasonable to store them in one category.
Therefore, we turn all values containing "Dublin" (or the abbreviation "DB") into the single category "Dublin, Ireland".

In [8]:
# Collapse every spelling of a Dublin host location into one label.
# NOTE(review): the match is case-insensitive, so "DB" also hits "db" inside
# unrelated place names (e.g. "Woodbridge") — confirm this is acceptable.
fil = list_cat["host_location"].str.contains("Dublin|DB", case = False, na = False)

# Assign through .loc on the frame itself. The chained form
# list_cat["host_location"][fil] = ... writes into an intermediate Series:
# it raises SettingWithCopyWarning and does not update list_cat at all when
# pandas Copy-on-Write is active.
list_cat.loc[fil, "host_location"] = "Dublin, Ireland"
list_cat["host_location"]

0                     Dublin, Ireland
2                     Dublin, Ireland
3                     Dublin, Ireland
4                     Dublin, Ireland
5       Tralee, County Kerry, Ireland
                    ...              
6971                  Dublin, Ireland
6972                  Dublin, Ireland
6973                  Dublin, Ireland
6974                  Dublin, Ireland
6975                               IE
Name: host_location, Length: 6938, dtype: object

It could be reasonable to use the host's home country instead of the free-text location. To do so, we download a CSV listing all countries and their abbreviations.

In [9]:
# Country lookup table: column 0 is used as the country name, column 1 as its
# abbreviation (presumably the ISO alpha-2 code — verify against the file).
#
# keep_default_na=False stops pandas from parsing the literal code "NA"
# (Namibia in ISO 3166) as a missing value, which would later turn into the
# search string "nan". Only genuinely empty cells are still read as missing.
country_abr = pd.read_csv(
    "https://gist.githubusercontent.com/radcliff/f09c0f88344a7fcef373/raw/2753c482ad091c54b1822288ad2e4811c021d8ec/wikipedia-iso-country-codes.csv",
    keep_default_na = False,
    na_values = [""],
)
country_list = list(country_abr.iloc[:,0])
abr_list = list(country_abr.iloc[:,1])

In [10]:
# Start from the location text; rows that match no country keep it unchanged.
list_cat["host_location_country"] = list_cat["host_location"].copy()

# Pass 1: full country names, matched as literal, case-insensitive substrings.
# regex=False matters here: names containing regex metacharacters such as
# parentheses would otherwise be interpreted as patterns (pandas warns about
# match groups) and never match their own literal text.
for country in country_list:
    fil = list_cat["host_location"].str.contains(str(country), case = False, na = False, regex = False)
    # .loc on the frame instead of chained assignment, which is unreliable and
    # a no-op under pandas Copy-on-Write.
    list_cat.loc[fil, "host_location_country"] = str(country)

# Pass 2: country abbreviations (e.g. "IE"), matched case-sensitively.
# A match here overwrites whatever pass 1 assigned to the row.
for country, abbreviation in zip(country_list, abr_list):
    if pd.isna(abbreviation):
        # A missing abbreviation would become the search string "nan" and
        # mislabel any location that happens to contain those letters.
        continue
    fil = list_cat["host_location"].str.contains(str(abbreviation), case = True, na = False, regex = False)
    list_cat.loc[fil, "host_location_country"] = str(country)

list_cat["host_location_country"].value_counts()


  fil = list_cat["host_location"].str.contains(i, case = False, na = False)


Ireland                    5902
United Kingdom              406
France                       94
United States                88
Spain                        64
                           ... 
Jordan                        1
Greece                        1
South Africa                  1
Prague, Prague, Czechia       1
Albania                       1
Name: host_location_country, Length: 64, dtype: int64

Surprisingly, most of the hosts come from Ireland. However, we still have a lot of different categories/countries, so we group every country with five or fewer hosts into "Others".

In [11]:
# Labels that occur at most five times are merged into a single "Others" bucket.
country_counts = list_cat["host_location_country"].value_counts()
other_filter = country_counts <= 5
other_list = list(country_counts.index[other_filter])

# Exact membership test instead of one substring/regex search per rare label:
# a search would also relabel frequent values that merely contain a rare label,
# and would fail on labels containing regex metacharacters.
fil = list_cat["host_location_country"].isin(other_list)
# .loc on the frame replaces the chained assignment, which is unreliable and a
# no-op under pandas Copy-on-Write.
list_cat.loc[fil, "host_location_country"] = "Others"

In [12]:
list_cat["host_location_country"].value_counts()

Ireland                 5902
United Kingdom           406
France                    94
United States             88
Others                    72
Spain                     64
India                     48
Germany                   40
Italy                     32
Brazil                    32
Australia                 27
Canada                    19
Netherlands               15
Israel                    13
Switzerland               12
53.357852, -6.259787      12
Sweden                    10
Turkey                     9
Portugal                   7
Argentina                  6
Poland                     6
Mexico                     6
Bulgaria                   6
Croatia                    6
Belgium                    6
Name: host_location_country, dtype: int64

Looking up "53.357852, -6.259787" on Google Maps leads to an address in Dublin, so we assign these hosts to Ireland.

In [13]:
# Hosts whose location is given as these raw coordinates are counted as Ireland.
# .loc on the frame replaces the chained assignment, which raises
# SettingWithCopyWarning and is a no-op under pandas Copy-on-Write.
is_dublin_coords = list_cat["host_location_country"] == "53.357852, -6.259787"
list_cat.loc[is_dublin_coords, "host_location_country"] = "Ireland"

### host_is_superhost

In [14]:
list_cat["host_is_superhost"].value_counts()

f    5711
t    1227
Name: host_is_superhost, dtype: int64

seems fine

### host_identity_verified

In [15]:
list_cat["host_identity_verified"].value_counts()

t    4946
f    1992
Name: host_identity_verified, dtype: int64

seems fine too

### neighbourhood_cleansed

In [16]:
list_cat["neighbourhood_cleansed"].value_counts()

Dublin City              5309
Dn Laoghaire-Rathdown     726
Fingal                    627
South Dublin              276
Name: neighbourhood_cleansed, dtype: int64

perfect

### bathrooms_text

In [17]:
list_cat["bathrooms_text"].value_counts()

1 bath               2402
1 shared bath        1290
1 private bath       1055
2 baths               774
1.5 baths             428
1.5 shared baths      284
2.5 baths             228
3 baths               125
2 shared baths        119
3.5 baths              49
4 baths                38
2.5 shared baths       28
0 baths                24
Private half-bath      18
0 shared baths         16
3 shared baths         13
4.5 baths              11
Shared half-bath        8
Half-bath               7
5 baths                 5
4 shared baths          4
5.5 baths               3
6 baths                 2
3.5 shared baths        2
7.5 baths               1
6.5 baths               1
8.5 baths               1
9 baths                 1
6 shared baths          1
Name: bathrooms_text, dtype: int64

Some of the bathroom categories should be recoded. A half-bath can be turned into 0.5, meaning just a toilet and a sink without a shower. Beyond that, one can differentiate between a shared bathroom, a private bathroom (in the room) and a normal bathroom. We construct two variables: one containing the number of bathrooms and another containing the kind of bathroom. Note, however, that "private" or "shared" might indicate that only one of the bathrooms is private or shared.

In [18]:
# Derive the kind of bathroom (shared / private / normal) from the description.
bath = list_cat["bathrooms_text"]

# Case-insensitive keyword flags; anything that is neither counts as "normal".
shared = bath.str.contains("shared", case = False)
private = bath.str.contains("private", case = False)
normal = ~(shared | private)

# Overwrite a copy of the text with the label; a later label wins on overlap.
bath_kind = bath.copy()
for label, mask in (("Shared", shared), ("Private", private), ("Normal", normal)):
    bath_kind[mask] = label

list_cat["bath_kind"] = bath_kind
bath_kind.value_counts()


Normal     4100
Shared     1765
Private    1073
Name: bathrooms_text, dtype: int64

In [19]:
# Extract the number of bathrooms from the description (kept as text, e.g. "1.5").
bath_number = bath.copy()
# A "half-bath" counts as 0.5 bathrooms.
bath_number = bath_number.str.replace("half", "0.5", case = False)
# Raw string with an escaped dot: the previous pattern used a bare "." (which
# matches any character, not just a decimal point) inside a non-raw string
# (where "\d" is an invalid escape sequence). expand=False returns a Series
# rather than a one-column DataFrame.
bath_number = bath_number.str.extract(r'(\d+\.\d|\d+)', expand = False)

list_cat["bath_number"] = bath_number
bath_number.value_counts()

1      4747
2       893
1.5     712
2.5     256
3       138
3.5      51
4        42
0        40
0.5      33
4.5      11
5         5
5.5       3
6         3
6.5       1
7.5       1
8.5       1
9         1
dtype: int64

There are a lot of different options, but everything seems fine.

### has_availability

In [20]:
list_cat["has_availability"].value_counts()

t    6781
f     157
Name: has_availability, dtype: int64

nice

### instant_bookable

In [21]:
list_cat["instant_bookable"].value_counts()

f    4641
t    2297
Name: instant_bookable, dtype: int64

cool cool cool

## t-Test
Let's run a Welch t-test on every binary variable.

In [68]:
# Accumulators passed to the test helpers below: each call appends one test
# statistic, one p-value and one label, which feed the summary table.
stats_val, p_val, names = [], [], []

def t_Test(X, y, stats, p_val, names):
    """Welch t-test of ``y`` between the first category seen in ``X`` and the rest.

    Parameters
    ----------
    X : pandas.Series
        Named grouping variable. The first distinct value (in order of
        appearance) forms sample 1; every other row forms sample 2.
    y : pandas.Series
        Values to compare, indexed like ``X``.
    stats, p_val, names : list
        Accumulators extended in place with the t statistic, the p-value and
        the label ``"t: <X.name>"``.

    Returns
    -------
    tuple
        ``(t, p)`` as returned by ``scipy.stats.ttest_ind(..., equal_var=False)``.
    """
    reference = pd.unique(X)[0]
    in_reference = X == reference

    # equal_var=False selects Welch's test (no equal-variance assumption).
    t, p = ttest_ind(y[in_reference], y[~in_reference], equal_var = False)

    # Record the outcome in the caller's accumulators.
    for collector, value in ((stats, t), (p_val, p), (names, "t: " + X.name)):
        collector.append(value)

    return t, p

### has_availability

In [69]:
t_Test(list_cat["has_availability"], price, stats_val, p_val, names)


(1.0657211124199206, 0.2865872269028238)

not significant

### instant_bookable

In [70]:
t_Test(list_cat["instant_bookable"], price, stats_val, p_val, names)



(-0.9790457910817654, 0.32766049343717996)

not significant

### host_identity_verified

In [71]:
t_Test(list_cat["host_identity_verified"], price, stats_val, p_val, names)



(-0.9530251888678685, 0.34069298191598285)

not significant 

### host_is_superhost

In [72]:
t_Test(list_cat["host_is_superhost"], price, stats_val, p_val, names)



(-1.002327937628808, 0.3162274720597048)

not significant

### host_location_country (Ireland or not)

In [73]:
X = list_cat["host_location_country"].where(list_cat["host_location_country"] == "Ireland", "else")
t_Test(X, price, stats_val, p_val, names)



(0.680093604786456, 0.49647167072191534)

not significant

## ANOVA 
So let's find out whether any of the multicategorical variables has at least one group that differs significantly from the others. We don't use the common one-way ANOVA, as its assumption of equal variances across groups is usually not met. Instead we use the Kruskal-Wallis test, a rank-based alternative implemented in scipy (`scipy.stats.kruskal`); note that its test statistic is H, not F.

### host_location_country

In [74]:
def krus_test(X, y, stats, p_val, names):
    """Kruskal-Wallis test of ``y`` across all categories of ``X``.

    Parameters
    ----------
    X : pandas.Series
        Named grouping variable; each distinct value defines one sample.
    y : pandas.Series
        Values to compare, indexed like ``X``.
    stats, p_val, names : list
        Accumulators extended in place with the test statistic, the p-value
        and the label ``"F: <X.name>"``. The "F: " prefix is only a label;
        ``scipy.stats.kruskal`` returns the Kruskal-Wallis H statistic.

    Returns
    -------
    tuple
        ``(statistic, p)`` as returned by ``scipy.stats.kruskal``.
    """
    # One sample of y values per distinct category, in order of appearance.
    samples = []
    for level in pd.unique(X):
        samples.append(list(y[X == level]))

    F, p = kruskal(*samples)

    # Record the outcome in the caller's accumulators.
    for collector, value in ((stats, F), (p_val, p), (names, "F: " + X.name)):
        collector.append(value)

    return F, p

In [75]:
krus_test(list_cat["host_location_country"], price, stats_val, p_val, names)


(89.51800805489069, 8.563050545434996e-10)

significant

### bath_kind

In [76]:
krus_test(list_cat["bath_kind"], price, stats_val, p_val, names)


(1989.2204637395907, 0.0)

significant

### neighbourhood_cleansed

In [77]:
stat, p = krus_test(list_cat["neighbourhood_cleansed"], price, stats_val, p_val, names)
# End on the result so the cell displays it, like the other test cells do;
# a bare assignment shows no output.
stat, p


significant

# Summary

In [78]:
# One row per recorded test, in the order the tests were run.
d = {
    "Variable": names,
    "t-/F-Statistic": stats_val,
    "p-value": p_val,
}
pd.DataFrame(data = d)

Unnamed: 0,Variable,t-/F-Statistic,p-value
0,t: has_availability,1.065721,0.2865872
1,t: instant_bookable,-0.979046,0.3276605
2,t: host_identity_verified,-0.953025,0.340693
3,t: host_is_superhost,-1.002328,0.3162275
4,t: host_location_country,0.680094,0.4964717
5,F: host_location_country,89.518008,8.563051e-10
6,F: bath_kind,1989.220464,0.0
7,F: neighbourhood_cleansed,134.493109,5.816696e-29


The multicategorical variables seem to explain some of the variation in price and should be taken into account; none of the binary variables shows a significant difference.