In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
%matplotlib inline

In [2]:
listings_csv = 'Project/data_files/san_francisco/airbnb_union.csv'
df = pd.read_csv(listings_csv)

# Parse the collection date once, store it back as a datetime column,
# and derive the calendar year from the same parsed series.
collected_on = df['date_collected'].astype('datetime64[ns]')
df['date_collected'] = collected_on
df['year'] = collected_on.dt.year

In [6]:
# Column dtypes and non-null counts, to see which fields have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205532 entries, 0 to 205531
Data columns (total 16 columns):
room_id                 205532 non-null int64
host_id                 205526 non-null float64
room_type               205489 non-null object
borough                 0 non-null float64
neighborhood            205532 non-null object
reviews                 205483 non-null float64
overall_satisfaction    159178 non-null float64
accommodates            197111 non-null float64
bedrooms                194842 non-null float64
price                   205532 non-null int64
minstay                 134055 non-null float64
latitude                205532 non-null float64
longitude               205532 non-null float64
last_modified           205532 non-null object
date_collected          205532 non-null datetime64[ns]
year                    205532 non-null int64
dtypes: datetime64[ns](1), float64(9), int64(3), object(3)
memory usage: 25.1+ MB


In [28]:
# Distinct room_type labels; these are the values the per-type filters below match on
df['room_type'].unique()

array(['Private room', 'Entire home/apt', 'Shared room', nan], dtype=object)

In [95]:
# Distinct listings (room_id) per host and year, entire homes/apartments only.
# Result has one row per host_id and one column per year; a cell is missing
# when the host had no entire-home listing collected in that year.
is_entire_home = df['room_type'] == 'Entire home/apt'
ent_home_group = (
    df[is_entire_home]
    .groupby(['year', 'host_id'])['room_id']
    .nunique()
    .unstack(level=0)  # move 'year' from the row index into the columns
)
ent_home_group['room_type'] = 'Entire home/apt'

In [96]:
# Preview only the first rows instead of rendering the whole per-host table
ent_home_group.head(10)

year,2013,2014,2015,2016,2017,room_type
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46.0,,1.0,1.0,1.0,1.0,Entire home/apt
240.0,,1.0,1.0,1.0,,Entire home/apt
275.0,,1.0,1.0,,,Entire home/apt
316.0,,,1.0,1.0,1.0,Entire home/apt
426.0,,2.0,,,,Entire home/apt
470.0,,,1.0,1.0,,Entire home/apt
521.0,,,,1.0,,Entire home/apt
878.0,,1.0,1.0,,,Entire home/apt
1096.0,,1.0,1.0,1.0,1.0,Entire home/apt
1169.0,,1.0,1.0,1.0,1.0,Entire home/apt


In [97]:
# Distinct listings (room_id) per host and year, private rooms only.
# Result has one row per host_id and one column per year.
is_private_room = df['room_type'] == 'Private room'
private_room_group = (
    df[is_private_room]
    .groupby(['year', 'host_id'])['room_id']
    .nunique()
    .unstack(level=0)  # move 'year' from the row index into the columns
)
private_room_group['room_type'] = 'Private room'

In [98]:
# Distinct listings (room_id) per host and year, shared rooms only.
# Result has one row per host_id and one column per year.
is_shared_room = df['room_type'] == 'Shared room'
shared_room_group = (
    df[is_shared_room]
    .groupby(['year', 'host_id'])['room_id']
    .nunique()
    .unstack(level=0)  # move 'year' from the row index into the columns
)
shared_room_group['room_type'] = 'Shared room'

In [122]:
# Stack the three per-room-type tables on top of each other; the room_type
# column tells the rows of the same host apart.
room_type_tables = [ent_home_group, private_room_group, shared_room_group]
all_groups = pd.concat(room_type_tables)

# host_id was loaded as float, so cast the index labels to integers
int_host_ids = all_groups.index.astype(int)
all_groups.index = int_host_ids

In [123]:
# First five rows of the combined table
all_groups.head(n=5)

year,2013,2014,2015,2016,2017,room_type
host_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46,,1.0,1.0,1.0,1.0,Entire home/apt
240,,1.0,1.0,1.0,,Entire home/apt
275,,1.0,1.0,,,Entire home/apt
316,,,1.0,1.0,1.0,Entire home/apt
426,,2.0,,,,Entire home/apt


In [140]:
# Reshape from wide (one column per year) to long (one row per host, room type and year).
# The host id lives in the index, so copy it into a regular column to use it as an id variable.
all_groups['host_id'] = all_groups.index
all_groups_long = pd.melt(
    all_groups,
    id_vars=['host_id', 'room_type'],
    value_name='count',
)

In [141]:
# Persist the long-format unit counts.
# NOTE(review): no index=False is passed, so the frame's row index is presumably
# written as an extra leading column — confirm downstream readers expect that.
units_csv_path = 'number_of_units.csv'
all_groups_long.to_csv(units_csv_path)

In [144]:
# Last few rows where a host has more than one unit of a given room type in a year
has_multiple_units = all_groups_long['count'] > 1
all_groups_long.loc[has_multiple_units].tail()

Unnamed: 0,host_id,room_type,year,count
100541,104582772,Shared room,2017,4.0
100545,113608325,Shared room,2017,3.0
100547,117141107,Shared room,2017,3.0
100550,119355432,Shared room,2017,2.0
100554,123914250,Shared room,2017,3.0


In [148]:
# Spot-check: raw listing rows for host 470
is_host_470 = df['host_id'] == 470
df.loc[is_host_470]

Unnamed: 0,room_id,host_id,room_type,borough,neighborhood,reviews,overall_satisfaction,accommodates,bedrooms,price,minstay,latitude,longitude,last_modified,date_collected,year
14327,4751080,470.0,Private room,,Mission,0.0,,2.0,1.0,173,2.0,37.755592,-122.408418,2015-02-19 20:23:48.474,2015-02-19,2015
14328,4751090,470.0,Entire home/apt,,Mission,1.0,,4.0,2.0,249,2.0,37.755022,-122.40881,2015-02-19 20:39:22.013,2015-02-19,2015
124489,15344978,470.0,Entire home/apt,,Noe Valley,0.0,,3.0,1.0,75,2.0,37.750712,-122.425213,2016-10-19 20:38:05.917588,2016-10-19,2016


In [150]:
# Spot-check: per-year unit counts for host 329072
is_host_329072 = all_groups_long['host_id'] == 329072
all_groups_long.loc[is_host_329072]

Unnamed: 0,host_id,room_type,year,count
19460,329072,Shared room,2013,15.0
39572,329072,Shared room,2014,13.0
59684,329072,Shared room,2015,12.0
79796,329072,Shared room,2016,23.0
99908,329072,Shared room,2017,16.0
