In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load airbnb.csv into the DataFrame that every later cell queries.
airbnb = pd.read_csv('airbnb.csv')
# Preview the first rows (head() shows five by default).
airbnb.head()

Unnamed: 0,room_id,survey_id,host_id,room_type,country,city,borough,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,bathrooms,price,minstay,name,last_modified,latitude,longitude,location
0,10176931,1476,49180562,Shared room,,Amsterdam,,De Pijp / Rivierenbuurt,7,4.5,2,1.0,,156.0,,Red Light/ Canal view apartment (Shared),2017-07-23 13:06:27.391699,52.356209,4.887491,0101000020E610000033FAD170CA8C13403BC5AA41982D...
1,8935871,1476,46718394,Shared room,,Amsterdam,,Centrum West,45,4.5,4,1.0,,126.0,,Sunny and Cozy Living room in quite neighbours,2017-07-23 13:06:23.607187,52.378518,4.89612,0101000020E6100000842A357BA095134042791F477330...
2,14011697,1476,10346595,Shared room,,Amsterdam,,Watergraafsmeer,1,0.0,3,1.0,,132.0,,Amsterdam,2017-07-23 13:06:23.603546,52.338811,4.943592,0101000020E6100000A51133FB3CC613403543AA285E2B...
3,6137978,1476,8685430,Shared room,,Amsterdam,,Centrum West,7,5.0,4,1.0,,121.0,,Canal boat RIDE in Amsterdam,2017-07-23 13:06:22.689787,52.376319,4.890028,0101000020E6100000DF180280638F134085EE92382B30...
4,18630616,1476,70191803,Shared room,,Amsterdam,,De Baarsjes / Oud West,1,0.0,2,1.0,,93.0,,One room for rent in a three room appartment,2017-07-23 13:06:19.681469,52.370384,4.852873,0101000020E6100000CD902A8A57691340187B2FBE682F...


## Q1 - What is the Pearson correlation between reviews and overall_satisfaction (rounded to 2 decimal places)?

In [3]:
# Pairwise Pearson correlations between the numeric columns.
# Text columns (e.g. 'room_type', 'name') are excluded explicitly: since
# pandas 2.0 DataFrame.corr() no longer drops them silently and raises
# instead. `cor` is kept as a notebook-level name because the price
# correlation question further down reads it.
cor = airbnb.select_dtypes(include=[np.number]).corr(method='pearson')

# The matrix is symmetric, so swapping the row and column labels would
# return the same value.
correlation_value = cor.loc['reviews', 'overall_satisfaction']

print("The correlation value, rounded to 2 decimal cases is {:.2f}".format(correlation_value))

The correlation value, rounded to 2 decimal cases is 0.32


## Q2 - The most frequent neighborhood is...

In [4]:
# Tally how many listings fall in each neighborhood.
CountNeighborhood = airbnb.loc[:, 'neighborhood'].value_counts()
# The index label carrying the largest tally is the most frequent neighborhood.
Max = CountNeighborhood.idxmax()
neighborhood_message = "The most common neighborhood is {}"
print(neighborhood_message.format(Max))

The most common neighborhood is De Baarsjes / Oud West


## Q3 - What is the mean price for houses with an overall_satisfaction of 5.0 (rounded to 1 decimal place)?

In [5]:
# Method 1: keep only the rows rated exactly 5.0, then average their prices.
M1 = airbnb.loc[airbnb['overall_satisfaction'] == 5.0, 'price'].mean()
print("1. Method : The mean price for overall_satisfaction is {:.1f}".format(M1))

# Method 2: average the price per rating first, then pick the 5.0 group.
# The group is selected by its exact label; a substring match on the
# stringified labels (filter(like='5.')) would also select a label such as
# 15.0 and silently return whichever match came first.
M2 = airbnb.groupby('overall_satisfaction')['price'].mean().loc[5.0]
print("2. Method : The mean price for overall_satisfaction is {:.1f}".format(M2))

1. Method : The mean price for overall_satisfaction is 170.2
2. Method : The mean price for overall_satisfaction is 170.2


## Q4 - The most frequent type in 'room_type' is...

In [6]:
# Same approach as the neighborhood question: tally each room type...
CountRoomType = airbnb.loc[:, 'room_type'].value_counts()
# ...and keep the label with the highest tally.
Max = CountRoomType.idxmax()
room_type_message = "The most common room_type is {}"
print(room_type_message.format(Max))

The most common room_type is Entire home/apt


## Q5 - How many houses accommodate only one person?

In [7]:
# Summing a boolean mask counts its True entries, i.e. the matching rows.
single_guest_mask = airbnb['accommodates'] == 1
N_housesFor1Person = int(single_guest_mask.sum())
print("The number of houses which can only accomodate 1 person is {}".format(N_housesFor1Person))

The number of houses which can only accomodate 1 person is 367


## Q6 - How many integer variables do we have in our dataset?

In [8]:
# Count the columns that hold integers (airbnb.info() lists the same dtypes
# if you prefer to count by hand).
# is_integer_dtype accepts every integer width (int32, uint8, ...) as well
# as pandas' nullable integer dtypes, whereas comparing against np.int64
# only recognises that single dtype.
n_ints = sum(pd.api.types.is_integer_dtype(dtype) for dtype in airbnb.dtypes)

print("We have {} integer columns.".format(n_ints))

We have 5 integer columns.


## Q7 - How many variables do not have any valid observations (all observations are missing)?

In [9]:
# count() reports the number of non-missing values per column, so a column
# with a count of zero has no valid observation at all.
counts = airbnb.count()
columnsWith0Rows = int((counts == 0).sum())

print("We have {} empty columns.".format(columnsWith0Rows))

We have 4 empty columns.


## Q8 - How many observations does the dataset airbnb have?

In [10]:
# The length of a DataFrame is its number of rows.
print("We have {} observations/rows.".format(len(airbnb)))

We have 18723 observations/rows.


## Q9 - Which variable has the highest correlation (Pearson) with the target (price)?

In [11]:
# Take the price column of the correlation matrix `cor` computed for the
# first question, and drop price's correlation with itself.
aux_priceCor = cor.loc[:, 'price']
priceCor = aux_priceCor.drop(labels=['price'])

# idxmax ranks by the signed value, so this is the strongest *positive*
# correlation; a strongly negative one would not be selected.
top_variable = priceCor.idxmax()
print("The most correlated variable is {} with a correlation of {:.2f}.".format(top_variable, priceCor[top_variable]))

The most correlated variable is accommodates with a correlation of 0.50.


## Q10 - Do we have missing values in the dataset airbnb?

In [12]:
NumberOfObservations = len(airbnb)

# Count the missing entries of every column in one pass, then report only
# the columns that have at least one, in their original column order.
missing_per_column = airbnb.isnull().sum()
for column, n_missing in missing_per_column[missing_per_column > 0].items():
    print("Column '{}' has {} missing values.".format(column, n_missing))

Column 'country' has 18723 missing values.
Column 'borough' has 18723 missing values.
Column 'bathrooms' has 18723 missing values.
Column 'minstay' has 18723 missing values.
Column 'name' has 52 missing values.


## Q11 - What is the mean price of a house that hosts 16 people?

In [13]:
# Select the price of the rows that accommodate exactly 16 people and average it.
sixteen_guest_mask = airbnb['accommodates'] == 16
M1 = airbnb.loc[sixteen_guest_mask, 'price'].mean()
print("The mean price for a house that accommodates 16 people is {:.2f}".format(M1))

The mean price for a house that accommodates 16 people is 745.70


## Q12 - How many houses can accommodate more than 15 people?

In [14]:
# Summing the boolean mask counts the rows with a capacity strictly above 15.
nhouses = int((airbnb['accommodates'] > 15).sum())

print("We have {} houses who can accommodate more that 15 people.".format(nhouses))

We have 21 houses who can accommodate more that 15 people.


## Q13 - What is the maximum number of bedrooms available?

In [15]:
# Largest value found in the bedrooms column (missing values are skipped).
bedroom_counts = airbnb.loc[:, 'bedrooms']
maxbedrooms = bedroom_counts.max()

print("The max number of bedrooms is {}.".format(maxbedrooms))

The max number of bedrooms is 10.0.


## Q14 - Half of the houses have a price lower than...

In [16]:
# The 50th percentile (median) of price: half of the listings cost less.
# Ask the price column for it directly instead of running describe() over
# every column of the frame and substring-matching the '50%' row label.
PriceQuartile50 = airbnb['price'].quantile(0.5)

print("Half of the houses have a value lower than {}.".format(PriceQuartile50))

Half of the houses have a value lower than 144.0.


## Q15 - What is the mean value of overall_satisfaction (rounded to one decimal place)?

In [17]:
# Average rating across all listings (missing ratings are skipped by mean()).
satisfaction = airbnb.loc[:, 'overall_satisfaction']
M1 = satisfaction.mean()
print("The mean overall_satisfaction is {:.1f}".format(M1))

The mean overall_satisfaction is 3.3
