In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from scipy import stats
import seaborn as sns
from scipy.linalg import sqrtm
from sklearn.metrics import mean_squared_error
from math import sqrt
import re

# Apply matplotlib's 'fivethirtyeight' style sheet to figures created from here on.
plt.style.use('fivethirtyeight')

In [None]:
# Remote location of the merged listings + polarity CSV that this notebook preprocesses.
DATA_URL = 'https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data_mmwd/merged_listings_and_polarity.csv'

In [None]:
# Load the raw CSV directly from DATA_URL (requires network access).
# NOTE(review): the file carries leftover index-like columns ('Unnamed: 0', 'index')
# which are dropped further down.
df = pd.read_csv(DATA_URL)

In [None]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(1981, 22)

In [None]:
# Preview the first rows to see which columns and raw value formats came in.
df.head()

Unnamed: 0.1,Unnamed: 0,index,id,listing_url,scrape_id,last_scraped,name,description,latitude,longitude,property_type,room_type,accommodates,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,polarity
0,0,0,36642,https://www.airbnb.com/rooms/36642,20201021220716,2020-10-22,City Conveniences A Country Setting,<b>The space</b><br />Are you planning a vacat...,45.40159,-75.74679,Private room in house,Private room,2,2 baths,1.0,1.0,"[""Essentials"", ""Hot water"", ""Dryer"", ""Extra pi...",$65.00,87,1,0,0.807
1,1,1,59258,https://www.airbnb.com/rooms/59258,20201021220716,2020-10-22,Lower level of my house to rent,"Lower 1/2 of bungalow, a separate entrance, 10...",45.359,-75.80314,Entire apartment,Entire home/apt,4,1 bath,1.0,1.0,"[""Wifi"", ""Washer"", ""Dryer"", ""Heating"", ""Free p...",$200.00,1,0,0,0.985
2,2,2,158824,https://www.airbnb.com/rooms/158824,20201021220716,2020-10-23,DOWNTOWN/GLEBE HOUSE WITH BACKYARD,Big 3 bedroom house with backyard in desirable...,45.40248,-75.69941,Entire house,Entire home/apt,3,1 bath,2.0,2.0,"[""Wifi"", ""Essentials"", ""Smoke alarm"", ""Washer""...",$140.00,7,0,0,0.743
3,3,3,261065,https://www.airbnb.com/rooms/261065,20201021220716,2020-10-23,Westboro Village Executive Suite,A tastefully decorated and well equipped upper...,45.38972,-75.75496,Entire apartment,Entire home/apt,4,1 bath,1.0,1.0,"[""Essentials"", ""Hot water"", ""Dryer"", ""Stove"", ...",$150.00,64,6,1,0.8304
4,4,4,336692,https://www.airbnb.com/rooms/336692,20201021220716,2020-10-23,Quiet Room with a view in the Ottawa Downtown,Renting only to females. Modern condo with pan...,45.41557,-75.70551,Private room in apartment,Private room,5,1 shared bath,1.0,1.0,"[""Gym"", ""Luggage dropoff allowed"", ""Essentials...",$69.00,9,0,0,0.9537


In [None]:
# Strip currency formatting ('$' prefix and ',' thousands separators) so the
# price strings can be converted to numbers afterwards.
# regex=False is passed explicitly: '$' is a regex metacharacter (end-of-string
# anchor) and the default value of `regex` differs between pandas versions, so
# relying on the default is ambiguous and can emit a FutureWarning.
df['price'] = (
    df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# Convert the cleaned price strings to floating-point numbers.
df = df.astype({'price': float})

In [None]:
# Summary statistics of price after conversion to float.
# NOTE(review): the recorded output shows a minimum of 0 and a maximum of 2000 —
# confirm whether such listings should be kept.
df['price'].describe()

count    1981.000000
mean      106.856911
std        98.878178
min         0.000000
25%        55.000000
50%        85.000000
75%       125.000000
max      2000.000000
Name: price, dtype: float64

In [None]:
# List the column names before deciding which ones to drop.
df.columns

Index(['Unnamed: 0', 'index', 'id', 'listing_url', 'scrape_id', 'last_scraped',
       'name', 'description', 'latitude', 'longitude', 'property_type',
       'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds',
       'amenities', 'price', 'number_of_reviews', 'number_of_reviews_ltm',
       'number_of_reviews_l30d', 'polarity'],
      dtype='object')

In [None]:
# Discard columns that are not used in the rest of the preprocessing:
# leftover index columns, scrape metadata, and a few listing attributes.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'index',
        'listing_url',
        'scrape_id',
        'last_scraped',
        'property_type',
        'amenities',
        'number_of_reviews_ltm',
        'number_of_reviews_l30d',
    ]
)

In [None]:
# Distinct guest capacities present in the data.
df['accommodates'].unique()

array([ 2,  4,  3,  5,  1,  7,  6,  8, 12, 10, 15,  9, 13, 14, 16, 11])

In [None]:
# Rename 'bathrooms_text' to 'bathrooms'; every other column keeps its name.
# An explicit rename is used instead of assigning a full positional list to
# df.columns: a positional list silently mislabels data if the column order
# or count ever changes, whereas rename only touches the named column and is
# safe to re-run.
df = df.rename(columns={'bathrooms_text': 'bathrooms'})

In [None]:
# Preview the frame after dropping and renaming columns.
df.head()

Unnamed: 0,id,name,description,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,polarity
0,36642,City Conveniences A Country Setting,<b>The space</b><br />Are you planning a vacat...,45.40159,-75.74679,Private room,2,2 baths,1.0,1.0,65.0,87,0.807
1,59258,Lower level of my house to rent,"Lower 1/2 of bungalow, a separate entrance, 10...",45.359,-75.80314,Entire home/apt,4,1 bath,1.0,1.0,200.0,1,0.985
2,158824,DOWNTOWN/GLEBE HOUSE WITH BACKYARD,Big 3 bedroom house with backyard in desirable...,45.40248,-75.69941,Entire home/apt,3,1 bath,2.0,2.0,140.0,7,0.743
3,261065,Westboro Village Executive Suite,A tastefully decorated and well equipped upper...,45.38972,-75.75496,Entire home/apt,4,1 bath,1.0,1.0,150.0,64,0.8304
4,336692,Quiet Room with a view in the Ottawa Downtown,Renting only to females. Modern condo with pan...,45.41557,-75.70551,Private room,5,1 shared bath,1.0,1.0,69.0,9,0.9537


In [None]:
# Distinct raw bathroom descriptions; these are free-text strings that still need parsing.
df['bathrooms'].unique()

array(['2 baths', '1 bath', '1 shared bath', '1 private bath',
       '1.5 baths', '2.5 shared baths', '2.5 baths', '3 baths',
       '2 shared baths', '1.5 shared baths', '3.5 baths',
       'Private half-bath', '0 baths', '0 shared baths', '4 baths',
       '6 baths', '4.5 baths', '3 shared baths', '5.5 baths',
       '4 shared baths'], dtype=object)

In [None]:
# Normalize a decimal comma to a decimal point inside the bathroom text.
# Series.replace(',', '.') only matches cells whose ENTIRE value is ',', so it
# never touched these strings; the substring replacement lives on the .str
# accessor. regex=False because both arguments are literal characters.
df['bathrooms'] = df['bathrooms'].str.replace(',', '.', regex=False)

In [None]:
# Keep only the digits of each bathroom description.
# regex=True is required: the pattern is a regular expression, and pandas >= 2.0
# treats the pattern as a literal string by default, which would strip nothing.
# Note that the decimal point is removed as well, so '1.5 baths' becomes '15',
# and a description without digits ('Private half-bath') becomes ''.
df['bathrooms'] = df['bathrooms'].str.replace(r'[^0-9]+', '', regex=True)

In [None]:
# Inspect what is left after stripping non-digits: two-character values such as
# '15' are former halves (1.5), and '' comes from text with no digits at all.
df['bathrooms'].unique()

array(['2', '1', '15', '25', '3', '35', '', '0', '4', '6', '45', '55'],
      dtype=object)

In [None]:
# Descriptions that contained no digits were reduced to an empty string;
# count those listings as having one bathroom.
df['bathrooms'] = df['bathrooms'].replace({'': '1'})

In [None]:
# Confirm that no empty strings remain in the column.
df['bathrooms'].unique()

array(['2', '1', '15', '25', '3', '35', '0', '4', '6', '45', '55'],
      dtype=object)

In [None]:
# Turn the digit-only strings into whole bathroom counts (still as strings),
# rounding half-baths up: a single character is already a whole number, while
# a longer value such as '15' stands for 1.5 and becomes first digit + 1.
# NOTE(review): this relies on every multi-character value being a former
# "x.5"; a genuine two-digit count like '10' would be turned into '2'.
bathrooms = df['bathrooms'].values
new_bathrooms = [
    digits[0] if len(digits) == 1 else f'{int(digits[0])+1}'
    for digits in bathrooms
]

In [None]:
# Overwrite the column with the rounded-up counts (still strings at this point).
df['bathrooms'] = new_bathrooms

In [None]:
# Convert the bathroom count strings to integers.
df = df.astype({'bathrooms': int})

In [None]:
# Bedrooms were read as floats (e.g. 1.0); store them as integers.
# The cast raises if the column contains missing values.
df = df.astype({'bedrooms': int})

In [None]:
# Beds were read as floats (e.g. 1.0); store them as integers.
# The cast raises if the column contains missing values.
df = df.astype({'beds': int})

In [None]:
# Distribution of the cleaned integer bathroom counts.
df['bathrooms'].value_counts()

1    1276
2     496
3     151
4      49
0       4
5       3
6       2
Name: bathrooms, dtype: int64

In [None]:
# Distribution of bedroom counts after the integer cast.
df['bedrooms'].value_counts()

1    1215
2     377
3     261
4      96
5      19
6      11
9       1
8       1
Name: bedrooms, dtype: int64

In [None]:
# Distribution of bed counts after the integer cast.
# NOTE(review): the recorded output includes listings with 0 beds — confirm
# whether those are valid or should be treated as missing.
df['beds'].value_counts()

1     956
2     445
3     275
4     152
0      52
5      43
6      25
7      14
8       7
9       6
12      2
10      2
16      1
13      1
Name: beds, dtype: int64

In [None]:
# Preview the fully cleaned frame before it is written out.
df.head()

Unnamed: 0,id,name,description,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,price,number_of_reviews,polarity
0,36642,City Conveniences A Country Setting,<b>The space</b><br />Are you planning a vacat...,45.40159,-75.74679,Private room,2,2,1,1,65.0,87,0.807
1,59258,Lower level of my house to rent,"Lower 1/2 of bungalow, a separate entrance, 10...",45.359,-75.80314,Entire home/apt,4,1,1,1,200.0,1,0.985
2,158824,DOWNTOWN/GLEBE HOUSE WITH BACKYARD,Big 3 bedroom house with backyard in desirable...,45.40248,-75.69941,Entire home/apt,3,1,2,2,140.0,7,0.743
3,261065,Westboro Village Executive Suite,A tastefully decorated and well equipped upper...,45.38972,-75.75496,Entire home/apt,4,1,1,1,150.0,64,0.8304
4,336692,Quiet Room with a view in the Ottawa Downtown,Renting only to females. Modern condo with pan...,45.41557,-75.70551,Private room,5,1,1,1,69.0,9,0.9537


In [None]:
# Final check of column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1981 entries, 0 to 1980
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1981 non-null   int64  
 1   name               1981 non-null   object 
 2   description        1981 non-null   object 
 3   latitude           1981 non-null   float64
 4   longitude          1981 non-null   float64
 5   room_type          1981 non-null   object 
 6   accommodates       1981 non-null   int64  
 7   bathrooms          1981 non-null   int64  
 8   bedrooms           1981 non-null   int64  
 9   beds               1981 non-null   int64  
 10  price              1981 non-null   float64
 11  number_of_reviews  1981 non-null   int64  
 12  polarity           1981 non-null   float64
dtypes: float64(4), int64(6), object(3)
memory usage: 201.3+ KB


In [None]:
# Persist the preprocessed listings to the working directory; the row index
# is omitted from the file.
df.to_csv(
    path_or_buf='listings_polarity_preprocess.csv',
    index=False,
)