In [129]:
import pandas as pd
import datetime
import numpy as np
pd.set_option("display.max_columns",999)
pd.set_option("display.max_rows",999)
import warnings
warnings.filterwarnings('ignore')

In [130]:
# Cleaned lease data.
orig = pd.read_csv('lease_clean_Nov22.csv', low_memory=False)

# Submarket rent estimates shared by Rob; the RentDenominator column is
# discarded right after loading.
submarket = (
    pd.read_csv('Rent Estimate by SubmarketBuildingrating.csv', low_memory=False)
      .drop(columns=['RentDenominator'])
)

# Bring each estimated rent forward to today: compound by a factor of 1.02 for
# every calendar year between the lease's year_from and the current year.
orig['current_estimated_rent'] = orig['estimated_rent'].mul(
    1.02 ** (datetime.datetime.now().year - orig['year_from'])
)

In [131]:
# Count rows per (cbsaid, building_rating_id) for Anaheim Hills in CBSA 11244.
# value_counts tallies rows of `orig`, not distinct property ids.
(
    orig.loc[
        orig['submarket_name'].eq('Anaheim Hills') & orig['cbsaid'].eq(11244),
        ['cbsaid', 'building_rating_id'],
    ]
    .value_counts()
    .reset_index()
)

Unnamed: 0,cbsaid,building_rating_id,0
0,11244.0,3,141
1,11244.0,2,77
2,11244.0,4,21


In [132]:
# Rent estimates available for the Anaheim Hills submarket, one row per rating.
submarket.loc[submarket['SubmarketName'].eq('Anaheim Hills')]

Unnamed: 0,CBSAID,SubmarketName,PrimarySubmarketID,BuildingRatingID,Rent
3966,11244.0,Anaheim Hills,49,3.0,25.300895
5145,11244.0,Anaheim Hills,49,4.0,29.538145
6350,11244.0,Anaheim Hills,49,1.0,25.533353
7109,11244.0,Anaheim Hills,49,2.0,26.007998


In [133]:
# Report (rows, columns) of both input tables.
print(f'submarket data: {submarket.shape}')
print(f'original clean lease data: {orig.shape}')

submarket data: (10236, 5)
original clean lease data: (1037215, 44)


# MISSING SUBMARKETS
There are 1000 submarkets that are present in the original lease data but missing from the new submarket data Rob shared with us.

Fill in these 1000 submarkets in the submarket data using the average current estimated rent from the original cleaned lease table.

In [134]:
# For every (submarket, CBSA, building rating) combination in the lease data:
# the mean current estimated rent and the number of distinct properties.
total_submarkets = (
    orig.groupby(['submarket_name', 'cbsaid', 'building_rating_id'])
        .agg({'current_estimated_rent': 'mean', 'property_id': 'nunique'})
        .reset_index()
)

# Left-join the shared submarket rents onto the lease-derived combinations so
# that combinations absent from the shared file are kept (with Rent = NaN).
submarket_all = pd.merge(
    total_submarkets[['submarket_name', 'cbsaid', 'building_rating_id',
                      'current_estimated_rent', 'property_id']],
    submarket[['SubmarketName', 'CBSAID', 'BuildingRatingID', 'Rent']],
    left_on=['submarket_name', 'cbsaid', 'building_rating_id'],
    right_on=['SubmarketName', 'CBSAID', 'BuildingRatingID'],
    how='left',
)

# Where the shared file has no rent, fall back to the lease-derived average.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that form is a chained in-place update, which is deprecated and
# does not modify the frame under pandas Copy-on-Write.
submarket_all['Rent'] = submarket_all['Rent'].fillna(submarket_all['current_estimated_rent'])

# Drop the helper column and the duplicated right-hand join keys.
submarket_all = submarket_all.drop(
    columns=['current_estimated_rent', 'SubmarketName', 'CBSAID', 'BuildingRatingID']
)

print('total_submarkets:', total_submarkets.shape)
print('submarket:', submarket.shape)
print('submarket_all:', submarket_all.shape)


total_submarkets: (10621, 5)
submarket: (10236, 5)
submarket_all: (10621, 5)


In [135]:
# Display the combined submarket table built in the previous cell.
submarket_all

Unnamed: 0,submarket_name,cbsaid,building_rating_id,property_id,Rent
0,10th & Page MF,16820.0,3,1,23.163706
1,190th Street Corridor,31084.0,2,19,27.617587
2,190th Street Corridor,31084.0,3,18,32.272375
3,190th Street Corridor,31084.0,4,10,37.052047
4,217 Corridor/Beaverton,38900.0,2,52,20.963637
...,...,...,...,...,...
10616,Zanesville,49780.0,2,21,15.284953
10617,Zanesville,49780.0,3,4,19.266068
10618,Zapata,49820.0,2,1,15.303233
10619,Zeeland,24340.0,1,2,14.744541


In [136]:
# Inspect the distinct-property counts of one submarket (Anaheim Hills, CBSA 11244).
total_submarkets.loc[
    total_submarkets['cbsaid'].eq(11244)
    & total_submarkets['submarket_name'].eq('Anaheim Hills')
]


Unnamed: 0,submarket_name,cbsaid,building_rating_id,current_estimated_rent,property_id
184,Anaheim Hills,11244.0,2,24.779025,18
185,Anaheim Hills,11244.0,3,24.466947,14
186,Anaheim Hills,11244.0,4,30.766967,2


### Create Quality bins

In [137]:
# Building-rating ids that make up each quality tier (low, medium, high).
Low_quality, medium_quality, High_quality = [1, 2], [3], [4, 5]

### Get high_quality_mean_rent

In [138]:
# Rating and rent of the submarket rows whose rating is in High_quality.
# Despite its name, this variable holds the rows themselves; the mean is read
# from the summary below.
high_quality_mean_rent = submarket.loc[
    submarket['BuildingRatingID'].isin(High_quality),
    ['BuildingRatingID', 'Rent'],
]
# Summary statistics, transposed so each column becomes a row.
high_quality_mean_rent.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BuildingRatingID,2297.0,4.141489,0.348601,4.0,4.0,4.0,4.0,5.0
Rent,2297.0,32.763395,12.837809,8.119948,24.617965,29.343341,36.447483,126.452152


### Get medium_quality_mean_rent

In [139]:
# Rating and rent of the submarket rows whose rating is in medium_quality.
# Despite its name, this variable holds the rows themselves; the mean is read
# from the summary below.
medium_quality_mean_rent = submarket.loc[
    submarket['BuildingRatingID'].isin(medium_quality),
    ['BuildingRatingID', 'Rent'],
]
# Summary statistics, transposed so each column becomes a row.
medium_quality_mean_rent.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BuildingRatingID,2698.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0
Rent,2698.0,24.186922,8.394341,9.839682,19.129742,21.992335,26.377082,110.472268


### Get low_quality_mean_rent

In [140]:
# Rating and rent of the submarket rows whose rating is in Low_quality.
# Despite its name, this variable holds the rows themselves; the mean is read
# from the summary below.
low_quality_mean_rent = submarket.loc[
    submarket['BuildingRatingID'].isin(Low_quality),
    ['BuildingRatingID', 'Rent'],
]
# Summary statistics, transposed so each column becomes a row.
low_quality_mean_rent.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BuildingRatingID,5210.0,1.52476,0.499434,1.0,1.0,2.0,2.0,2.0
Rent,5210.0,20.709588,7.652301,9.526241,16.008085,18.4164,22.706545,84.978143


###### High Quality Expensive: rent > 30 (Rating 4, 5)
###### High Quality Less Expensive: rent ≤ 30 (Rating 4, 5)
###### Medium Quality Expensive: rent > 22 (Rating 3)
###### Medium Quality Less Expensive: rent ≤ 22 (Rating 3)
###### Low Quality Expensive: rent > 18 (Rating 1, 2)
###### Low Quality Less Expensive: rent ≤ 18 (Rating 1, 2)

In [141]:
# Reference table of the six rent buckets: bucket code, the rent cutoff that
# separates "expensive" from "less expensive", and the quality tier it belongs to.
data = dict(
    bin=['HQE', 'HQLE', 'MQE', 'MQLE', 'LQE', 'LQLE'],
    rent_cutoff=[30, 30, 22, 22, 18, 18],
    rating_bin=['High', 'High', 'Medium', 'Medium', 'Low', 'Low'],
)
rent_bucket = pd.DataFrame.from_dict(data)
rent_bucket

Unnamed: 0,bin,rent_cutoff,rating_bin
0,HQE,30,High
1,HQLE,30,High
2,MQE,22,Medium
3,MQLE,22,Medium
4,LQE,18,Low
5,LQLE,18,Low


## Get rating using Revenue 
####     1. Calculate Revenue-- Density of building rating  * current estimated rent  (building rating & submarket)
####     2. Which rating bracket has highest revenue
#### Check rent bucket
####     1. Compare the current estimated rent for building rating with the bin

In [142]:
# property_id holds the distinct-property count per row (from the nunique
# aggregation), so expose it under the name Density.
submarket_all = submarket_all.rename(columns={'property_id': 'Density'})
submarket_all

Unnamed: 0,submarket_name,cbsaid,building_rating_id,Density,Rent
0,10th & Page MF,16820.0,3,1,23.163706
1,190th Street Corridor,31084.0,2,19,27.617587
2,190th Street Corridor,31084.0,3,18,32.272375
3,190th Street Corridor,31084.0,4,10,37.052047
4,217 Corridor/Beaverton,38900.0,2,52,20.963637
...,...,...,...,...,...
10616,Zanesville,49780.0,2,21,15.284953
10617,Zanesville,49780.0,3,4,19.266068
10618,Zapata,49820.0,2,1,15.303233
10619,Zeeland,24340.0,1,2,14.744541


In [143]:
# Spot-check the Density and Rent values for the Anaheim Hills submarket.
submarket_all.loc[submarket_all['submarket_name'].eq('Anaheim Hills')]

Unnamed: 0,submarket_name,cbsaid,building_rating_id,Density,Rent
184,Anaheim Hills,11244.0,2,18,26.007998
185,Anaheim Hills,11244.0,3,14,25.300895
186,Anaheim Hills,11244.0,4,2,29.538145


In [144]:
# Goal: within each submarket, find the building rating that brings in the most
# revenue, where revenue = rent x number of distinct properties (Density),
# and then label that rating with its quality tier.
density = submarket_all.copy()

# Revenue of every (submarket, CBSA, building rating) row.
density['Revenue'] = density['Rent'] * density['Density']

# Highest revenue reached by any rating inside each (submarket, CBSA).
density_max = density.groupby(['submarket_name', 'cbsaid']).agg({'Revenue': 'max'}).reset_index()

# Attach the group maximum to every row and keep only the rows that attain it.
# NOTE: ratings that tie for the maximum are all kept, so a submarket can
# appear more than once; rows with a NaN Revenue never compare equal and are
# dropped.
density = pd.merge(
    density,
    density_max,
    on=['submarket_name', 'cbsaid'],
    how='left',
    suffixes=('', '_max'),
)
density = density[density['Revenue'] == density['Revenue_max']].drop(columns='Revenue_max')

# Map the winning rating to its quality tier, reusing the tier lists defined
# in the "Create Quality bins" cell rather than repeating the ids here.
# Anything that is neither high nor medium falls through to 'Low'.
density['rating_bin'] = np.select(
    [
        density['building_rating_id'].isin(High_quality),
        density['building_rating_id'].isin(medium_quality),
    ],
    ['High', 'Medium'],
    default='Low',
)
density.head()

Unnamed: 0,submarket_name,cbsaid,building_rating_id,Density,Rent,Revenue,rating_bin
0,10th & Page MF,16820.0,3,1,23.163706,23.163706,Medium
2,190th Street Corridor,31084.0,3,18,32.272375,580.90275,Medium
4,217 Corridor/Beaverton,38900.0,2,52,20.963637,1090.109124,Low
9,385 Corridor,32820.0,3,48,21.026769,1009.284912,Medium
12,436 Corridor,36740.0,2,114,22.507988,2565.910632,Low


#### Check rent bucket
#### Compare the rent of the building rating id (Quality) for the submarket with the cutoff rent as below and determine bucket

In [145]:
revenue_bin = density.copy()

# Rows whose top-revenue rating is in the High tier. The explicit .copy() makes
# this an independent frame: a column is added below, and assigning onto a
# filtered slice raises SettingWithCopyWarning.
revenue_bin_high = revenue_bin[revenue_bin['rating_bin'] == 'High'].copy()

# Cutoff 30 is the High-tier value in rent_bucket; a rent of exactly 30 (or a
# missing rent) lands in the "Less Expensive" bucket.
revenue_bin_high['Rent_bin'] = np.where(
    revenue_bin_high['Rent'] > 30,
    'High Quality Expensive',
    'High Quality Less Expensive',
)
revenue_bin_high['Rent_bin'].value_counts()

High Quality Expensive         47
High Quality Less Expensive    16
Name: Rent_bin, dtype: int64

In [146]:
# Rows whose top-revenue rating is in the Medium tier. The explicit .copy()
# makes this an independent frame: a column is added below, and assigning onto
# a filtered slice raises SettingWithCopyWarning.
revenue_bin_med = revenue_bin[revenue_bin['rating_bin'] == 'Medium'].copy()

# Cutoff 22 is the Medium-tier value in rent_bucket; a rent of exactly 22 (or a
# missing rent) lands in the "Less Expensive" bucket.
revenue_bin_med['Rent_bin'] = np.where(
    revenue_bin_med['Rent'] > 22,
    'Medium Quality Expensive',
    'Medium Quality Less Expensive',
)
revenue_bin_med['Rent_bin'].value_counts()

Medium Quality Expensive         561
Medium Quality Less Expensive    354
Name: Rent_bin, dtype: int64

In [147]:
# Rows whose top-revenue rating is in the Low tier. The explicit .copy() makes
# this an independent frame: a column is added below, and assigning onto a
# filtered slice raises SettingWithCopyWarning.
revenue_bin_low = revenue_bin[revenue_bin['rating_bin'] == 'Low'].copy()

# Cutoff 18 is the Low-tier value in rent_bucket; a rent of exactly 18 (or a
# missing rent) lands in the "Less Expensive" bucket.
revenue_bin_low['Rent_bin'] = np.where(
    revenue_bin_low['Rent'] > 18,
    'Low Quality Expensive',
    'Low Quality Less Expensive',
)
revenue_bin_low['Rent_bin'].value_counts()

Low Quality Less Expensive    1474
Low Quality Expensive         1220
Name: Rent_bin, dtype: int64

### Merge all in one cluster file

In [148]:
# Stack the three tier tables (High, then Medium, then Low) into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append, it keeps each row's original index label.
cluster = pd.concat([revenue_bin_high, revenue_bin_med, revenue_bin_low])
cluster.head()

Unnamed: 0,submarket_name,cbsaid,building_rating_id,Density,Rent,Revenue,rating_bin,Rent_bin
406,Ballston,47900.0,4,20,41.50179,830.0358,High,High Quality Expensive
548,Bellevue CBD,42660.0,4,18,56.28976,1013.21568,High,High Quality Expensive
634,Birmingham-Hoover,13820.0,4,6,199.075451,1194.452707,High,High Quality Expensive
640,Bishop Ranch,36084.0,4,15,38.643143,579.647145,High,High Quality Expensive
814,Brickell,33124.0,4,13,51.369175,667.799275,High,High Quality Expensive


### Download the file as csv

In [128]:
# The export of the final cluster table is currently disabled (commented out).
# NOTE(review): to_csv writes the row index as an extra column unless
# index=False is passed — confirm which is wanted before re-enabling.
# cluster.to_csv('cluster_submarket_Nov22.csv')