## DATA CLEANING AND IMPUTATION

In [1]:
import pandas as pd
import datetime
import numpy as np
pd.set_option("display.max_columns",999)
pd.set_option("display.max_rows",999)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the September lease extract (same data as the RDS source).
# NOTE: `index` is a `DataFrame.to_csv` option, not a `read_csv` one; passing it to
# `read_csv` raises TypeError, so it is omitted here.
lease = pd.read_csv('lease_sept.csv', low_memory=False, keep_default_na=True)

# 'Unnamed: 0' is presumably the index column written when the CSV was saved;
# it carries no information, so drop it.
lease = lease.drop(columns='Unnamed: 0')

In [3]:
# Summary statistics of the numeric columns, transposed so each column is one row.
lease.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
leasedeal_id,1067141.0,91279850.0,40859300.0,10000020.0,70106940.0,111345700.0,113517100.0,187389000.0
property_id,1067141.0,1271110.0,2235648.0,1.0,178740.0,381673.0,798120.0,11605750.0
renewal,219610.0,0.7202814,0.4488619,0.0,0.0,1.0,1.0,1.0
sqft_min,589103.0,2848.921,5631.973,1.0,897.0,1514.0,2895.0,496984.0
sqft_max,589103.0,3150.56,6351.686,1.0,919.0,1600.0,3085.0,496984.0
days_on_market,589103.0,595.5989,680.8799,0.0,182.0,365.0,820.0,11506.0
actual_vacancy,589103.0,0.8752323,0.3304556,0.0,1.0,1.0,1.0,1.0
lease_term_inmonths,1067141.0,48.00054,39.82621,-395.0,24.0,36.0,60.0,1236.0
rate_actual,219743.0,22.38252,350.6006,0.0,12.0,16.2,22.0,84000.0
estimated_rent,928206.0,22.74745,10.93183,1.060077,16.26206,20.19501,25.84223,1222.861


### Drop rows with negative lease_term_inmonths or tenantimprovementallowancepersqft values

In [4]:
# Remove rows whose lease term is negative (rows with a missing term are kept,
# because a NaN comparison is False).
negative_term = lease['lease_term_inmonths'].lt(0)
lease = lease[~negative_term]

# Same rule for the tenant improvement allowance per square foot.
negative_allowance = lease['tenantimprovementallowancepersqft'].lt(0)
lease = lease[~negative_allowance]

In [5]:
# (rows, columns) remaining after the negative-value filters.
lease.shape

(1066721, 30)

### Drop rows with null cbsaid values

In [6]:
# Shape of the subset with a missing cbsaid; the first element is the number of such rows.
lease[lease['cbsaid'].isnull()].shape

(2110, 30)

In [7]:
# Keep only the rows that carry a cbsaid value.
has_cbsaid = lease['cbsaid'].notnull()
lease = lease[has_cbsaid]

In [8]:
# Print the (rows, columns) tuple, then display the first two rows as the cell output.
print(lease.shape)
lease.head(2)

(1064611, 30)


Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_inmonths,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenantimprovementallowancepersqft,free_months,construction_year,buildingrating_id,researchmarket_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip
0,110498312,157648,,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0
1,30028220,76048,,,,,,,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,11.0,5,30028220,,,,1977.0,3,,28140.0,Kansas City,MO-KS,,


In [9]:
# Re-run the numeric summary on the filtered frame (one row per column).
lease.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
leasedeal_id,1064611.0,91205490.0,40866740.0,10000020.0,70103580.0,111339200.0,113513800.0,187389000.0
property_id,1064611.0,1258406.0,2218551.0,1.0,178612.0,381525.0,790736.0,11589210.0
renewal,219185.0,0.7199078,0.4490451,0.0,0.0,1.0,1.0,1.0
sqft_min,587983.0,2851.286,5636.611,1.0,898.0,1516.0,2898.0,496984.0
sqft_max,587983.0,3153.297,6356.943,1.0,920.0,1600.0,3088.0,496984.0
days_on_market,587983.0,595.7025,681.1077,0.0,182.0,365.0,820.0,11506.0
actual_vacancy,587983.0,0.8752192,0.3304705,0.0,1.0,1.0,1.0,1.0
lease_term_inmonths,1064611.0,48.0473,39.80225,0.0,24.0,36.0,60.0,1236.0
rate_actual,218534.0,22.44167,351.5678,0.0,12.0,16.2,22.0,84000.0
estimated_rent,926196.0,22.76293,10.93667,1.060077,16.2752,20.21144,25.85666,1222.861


In [10]:
# List the current column names (used as the reference for the rename step).
lease.columns

Index(['leasedeal_id', 'property_id', 'renewal', 'sqft_min', 'sqft_max',
       'date_on_market', 'date_off_market', 'days_on_market', 'actual_vacancy',
       'from_date', 'to_date', 'lease_sign_date', 'lease_expiration_date',
       'lease_term_inmonths', 'rate_actual', 'estimated_rent',
       'service_type_id', 'property_type_id', 'location_occupancy_id', 'rba',
       'tenantimprovementallowancepersqft', 'free_months', 'construction_year',
       'buildingrating_id', 'researchmarket_name', 'cbsaid', 'cbsa_cities',
       'cbsa_states', 'submarket_name', 'zip'],
      dtype='object')

### Rename columns

In [11]:
# Rename only the columns whose names change, using an explicit old -> new mapping.
# Overwriting `lease.columns` with a positional list would silently attach the wrong
# label to the wrong data if the column order or count of the input ever changed.
# All other columns keep their names.
lease = lease.rename(columns={
    'lease_term_inmonths': 'lease_term_in_months',
    'tenantimprovementallowancepersqft': 'tenant_improvement_allowance_persqft',
    'buildingrating_id': 'building_rating_id',
    'researchmarket_name': 'research_market_name',
})

### Filling null submarket data using property id's

In [12]:
# Shape of the subset with a missing submarket_name; the first element is the row count.
lease[lease['submarket_name'].isnull()].shape

(164742, 30)

In [13]:
# Fill missing submarket names from other rows of the same property_id.
#
# Two problems of a merge + chained `fillna(inplace=True)` approach are avoided here:
#   * `lease[col].fillna(..., inplace=True)` acts on a temporary Series and leaves the
#     frame unchanged under pandas copy-on-write;
#   * a property carrying several distinct submarket names would appear several times
#     in the lookup, and a left merge would then duplicate its lease rows.
submarket_missing = lease['submarket_name'].isnull()
propertyid = lease.loc[submarket_missing, 'property_id'].unique()  # properties with a missing submarket

# Count how often each known submarket occurs per property and keep exactly one
# (the most frequent; ties broken by name) so the lookup is unique per property.
property_submarket = (
    lease.loc[lease['property_id'].isin(propertyid) & ~submarket_missing,
              ['property_id', 'submarket_name']]
    .groupby(['property_id', 'submarket_name']).size().reset_index(name='n_rows')
    .sort_values(['property_id', 'n_rows', 'submarket_name'], ascending=[True, False, True])
    .drop_duplicates(subset='property_id')
)
submarket_by_property = property_submarket.set_index('property_id')['submarket_name']

# A merge would have returned a fresh 0..n-1 index; keep that behaviour explicitly.
lease = lease.reset_index(drop=True)
lease['submarket_name'] = lease['submarket_name'].fillna(
    lease['property_id'].map(submarket_by_property))

lease.head()

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip
0,110498312,157648,,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0
1,30028220,76048,,,,,,,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,11.0,5,30028220,,,,1977.0,3,,28140.0,Kansas City,MO-KS,Ward Parkway,
2,30187227,559441,,,,,,,,1986-01-15,2000-03-14,1985-12-16,2000-03-14,170.0,,,,5,30187227,,,,1985.0,2,,28140.0,Kansas City,MO-KS,I-29 Corridor,
3,114096677,239837,,1388.0,1388.0,2013-12-31,2014-03-31,90.0,1.0,2014-02-14,2015-02-12,2014-01-15,2015-02-13,12.0,,18.914058,7.0,5,122849533,1388.0,,,1972.0,2,Houston,26420.0,Houston-The Woodlands-Sugar Land,TX,Riverway,77024.0
4,10587417,220914,,,,,,,,2001-06-30,,2001-05-01,2006-06-30,60.0,,21.278033,,5,10587417,7200.0,,,1906.0,2,Boston,14460.0,Boston-Cambridge-Newton,MA-NH,Route 3 South,2382.0


In [14]:
# Number of rows (first element of the shape) still missing a submarket_name after the fill.
lease[lease['submarket_name'].isnull()].shape

(50480, 30)

### Filling null zip data using property id's

In [15]:
# Shape of the subset with a missing zip; the first element is the row count.
lease[lease['zip'].isnull()].shape

(144950, 30)

In [16]:
# Fill missing zip codes from other rows of the same property_id.
# A unique per-property lookup plus `map` is used so that (a) lease rows can never be
# duplicated by a one-to-many merge and (b) the fill does not rely on a chained
# `fillna(inplace=True)`, which leaves the frame unchanged under pandas copy-on-write.
zip_missing = lease['zip'].isnull()
propertyid = lease.loc[zip_missing, 'property_id'].unique()  # properties with a missing zip

# One zip per property: the most frequent known value (ties broken by zip value).
property_zip = (
    lease.loc[lease['property_id'].isin(propertyid) & ~zip_missing, ['property_id', 'zip']]
    .groupby(['property_id', 'zip']).size().reset_index(name='n_rows')
    .sort_values(['property_id', 'n_rows', 'zip'], ascending=[True, False, True])
    .drop_duplicates(subset='property_id')
)
zip_by_property = property_zip.set_index('property_id')['zip']

# A merge would have returned a fresh 0..n-1 index; keep that behaviour explicitly.
lease = lease.reset_index(drop=True)
lease['zip'] = lease['zip'].fillna(lease['property_id'].map(zip_by_property))

# Rows still missing a zip after the fill (first element of the shape).
lease[lease['zip'].isnull()].shape

(30022, 30)

### Filling null research_market_name data using property ids

In [17]:
# Shape of the subset with a missing research_market_name; the first element is the row count.
lease[lease['research_market_name'].isnull()].shape

(144950, 30)

In [18]:
# Fill missing research_market_name values from other rows of the same property_id.
# A unique per-property lookup plus `map` is used so that (a) lease rows can never be
# duplicated by a one-to-many merge and (b) the fill does not rely on a chained
# `fillna(inplace=True)`, which leaves the frame unchanged under pandas copy-on-write.
research_missing = lease['research_market_name'].isnull()
propertyid = lease.loc[research_missing, 'property_id'].unique()  # properties with a missing market

# One market name per property: the most frequent known value (ties broken by name).
property_research = (
    lease.loc[lease['property_id'].isin(propertyid) & ~research_missing,
              ['property_id', 'research_market_name']]
    .groupby(['property_id', 'research_market_name']).size().reset_index(name='n_rows')
    .sort_values(['property_id', 'n_rows', 'research_market_name'],
                 ascending=[True, False, True])
    .drop_duplicates(subset='property_id')
)
research_by_property = property_research.set_index('property_id')['research_market_name']

# A merge would have returned a fresh 0..n-1 index; keep that behaviour explicitly.
lease = lease.reset_index(drop=True)
lease['research_market_name'] = lease['research_market_name'].fillna(
    lease['property_id'].map(research_by_property))

#Check how many null rows are left
lease[lease['research_market_name'].isnull()].shape

(30022, 30)

### Filling null construction_year data using property ids
No values could be filled: none of the properties with a missing construction_year has another row that carries a construction_year value.

In [19]:
lease['construction_year'].isnull().sum()  # number of rows with a missing construction_year

16819

In [20]:
# Properties that have at least one row without a construction year.
year_missing = lease['construction_year'].isnull()
propertyid = lease.loc[year_missing, 'property_id'].unique()

# For those properties, collect every (property_id, construction_year) pair found on
# rows where the year IS known, with the number of rows per pair in column 0.
rows_with_year = lease['property_id'].isin(propertyid) & lease['construction_year'].notnull()
property_const = (
    lease.loc[rows_with_year, ['property_id', 'construction_year']]
    .groupby(['property_id', 'construction_year'])
    .size()
    .reset_index()
)
property_const

Unnamed: 0,property_id,construction_year,0


### Function to change from object type to time stamp

In [21]:
def standard_ts_format(df, col, dt_format):
    """Parse one column of a frame into pandas datetimes.

    Parameters
    ----------
    df : DataFrame holding the column.
    col : name of the column to parse.
    dt_format : strftime-style format string handed to ``pd.to_datetime``.

    Returns
    -------
    The parsed column; ``df`` itself is left unchanged.
    """
    raw_values = df[col]
    parsed = pd.to_datetime(raw_values, format=dt_format)
    return parsed

In [22]:
# Convert every date column from text to timestamps with the shared helper.
for date_col in ['date_on_market', 'date_off_market',
                 'from_date', 'to_date', 'lease_sign_date', 'lease_expiration_date']:
    lease[date_col] = standard_ts_format(lease, date_col, '%Y-%m-%d')

# A lease with no to_date is flagged as currently occupied ('Y'); otherwise 'N'.
# Vectorised with np.where: same values as a row-wise apply, without calling a
# Python lambda once per row.
lease['currently_occupied_fl'] = np.where(lease['to_date'].notnull(), 'N', 'Y')
lease.head(2)

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl
0,110498312,157648,,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N
1,30028220,76048,,,,NaT,NaT,,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N


### Create year_on_market and year_off_market columns and recompute days_on_market

In [23]:
# Build the two datetime indexes once and derive all three columns from them.
on_market_idx = pd.DatetimeIndex(lease['date_on_market'])
off_market_idx = pd.DatetimeIndex(lease['date_off_market'])

# Calendar year of each date as a nullable integer (missing dates stay missing).
lease['year_on_market'] = on_market_idx.year.astype('Int64')
lease['year_off_market'] = off_market_idx.year.astype('Int64')

# Days between coming on and going off the market; overwrites the loaded days_on_market.
lease['days_on_market'] = (off_market_idx - on_market_idx).days


## Function to fill in missing values in from date, to date
### (no nulls were found in from_date, but the function handles them for scalability)

In [24]:
def check_alldates(fromdt, signdt, todt, expirationdt):
    """Resolve a lease's start/end dates and flag records that cannot be used.

    Missing inputs may be ``None``, ``NaN`` or ``pd.NaT`` (row-wise ``apply`` over
    datetime columns delivers ``NaT``, which an ``is None`` test never matches).

    Parameters
    ----------
    fromdt : lease start timestamp, or missing.
    signdt : lease signing timestamp; used as the start date when ``fromdt`` is missing.
    todt : lease end timestamp, or missing.
    expirationdt : lease expiration timestamp; only consulted to decide whether a
        record without ``todt`` is usable. It is NOT copied into the end date.

    Returns
    -------
    tuple ``(delete_rec, new_fromdt, new_todt)``
        delete_rec : ``'Y'`` when the record should be dropped, else ``'N'``.
            A record is dropped when both ``fromdt`` and ``signdt`` are missing,
            when both ``todt`` and ``expirationdt`` are missing, or when the
            resolved start date is after the end date.
        new_fromdt : ``datetime.date`` of the resolved start, or ``pd.NaT`` if unresolved.
        new_todt : ``todt`` unchanged, or ``pd.NaT`` when ``todt`` is missing.
    """
    delete_rec = 'N'
    # NaT (not '') marks "no value": it stays comparable and converts cleanly to datetime.
    new_fromdt, new_todt = pd.NaT, pd.NaT

    # Start date: prefer from date, fall back to the signing date.
    if pd.isnull(fromdt):
        if pd.isnull(signdt):
            delete_rec = 'Y'
        else:
            new_fromdt = signdt.date()
    else:
        new_fromdt = fromdt.date()

    # End date: only evaluated for records not already flagged for deletion.
    if delete_rec == 'N':
        if pd.isnull(todt):
            if pd.isnull(expirationdt):
                delete_rec = 'Y'
        else:
            new_todt = todt

    # Order check: only meaningful when an end date exists. Both sides are compared as
    # Timestamps because ordering a datetime.date against a Timestamp raises TypeError
    # on pandas 2.x.
    if delete_rec == 'N' and not pd.isnull(new_todt):
        if pd.Timestamp(new_fromdt) > pd.Timestamp(new_todt):
            delete_rec = 'Y'
    return delete_rec, new_fromdt, new_todt

In [25]:
# Work on a copy so the `lease` frame stays untouched.
lease_clean = lease.copy()

# Run check_alldates row by row; result_type='expand' spreads the returned 3-tuple
# over three columns, which are then stored under the names below.
date_inputs = ['from_date', 'lease_sign_date', 'to_date', 'lease_expiration_date']
date_checks = lease_clean[date_inputs].apply(
    lambda row: check_alldates(row.from_date, row.lease_sign_date,
                               row.to_date, row.lease_expiration_date),
    axis=1,
    result_type='expand',
)
lease_clean[['delete_rec', 'new_from_date', 'new_to_date']] = date_checks

# Calendar years of the resolved dates (nullable integers) and their difference.
from_years = pd.DatetimeIndex(lease_clean['new_from_date']).year.astype('Int64')
to_years = pd.DatetimeIndex(lease_clean['new_to_date']).year.astype('Int64')
lease_clean['year_from'] = from_years
lease_clean['year_to'] = to_years
lease_clean['years_occupied'] = lease_clean['year_to'] - lease_clean['year_from']


In [26]:
# Sanity check: list leases whose end year is earlier than their start year.
lease_clean[lease_clean['years_occupied'] < 0]

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,year_on_market,year_off_market,delete_rec,new_from_date,new_to_date,year_from,year_to,years_occupied


In [27]:
# Show the rows that check_alldates flagged for removal (delete_rec == 'Y').
lease_clean[lease_clean['delete_rec'] == 'Y']

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,year_on_market,year_off_market,delete_rec,new_from_date,new_to_date,year_from,year_to,years_occupied
11902,70086006,382083,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2006-04-30,60.0,,,7.0,5,70086006,1950.0,,,1972.0,3,,45300.0,Tampa-St. Petersburg-Clearwater,FL,,,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
14536,70172053,227809,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2006-04-30,36.0,,,7.0,5,70172053,1326.0,,,1976.0,3,,38060.0,Phoenix-Mesa-Scottsdale,AZ,,,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
14824,70084969,93423,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2010-04-30,60.0,,16.360279,7.0,5,70084969,1763.0,,,1930.0,3,Cleveland,17460.0,Cleveland-Elyria,OH,CBD,44113.0,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
22296,70192751,279348,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2008-04-30,120.0,,32.075497,7.0,5,70192751,2004.0,,,1989.0,4,Orange County (California),11244.0,Anaheim-Santa Ana-Irvine,CA,Newport Beach,92660.0,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
25409,70472278,225170,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2008-04-30,60.0,,17.023295,7.0,5,70472278,273.0,,,1968.0,2,Phoenix,38060.0,Phoenix-Mesa-Scottsdale,AZ,Tempe,85282.0,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
80429,70317868,674727,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2010-04-30,60.0,,24.418727,7.0,5,70317868,723.0,,,2002.0,3,Phoenix,38060.0,Phoenix-Mesa-Scottsdale,AZ,Scottsdale Airpark,85260.0,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
80915,70333238,743163,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2008-04-30,60.0,,28.778194,7.0,5,70333238,625.0,,,2004.0,4,Los Angeles,31084.0,Los Angeles-Long Beach-Glendale,CA,Santa Clarita Valley,91381.0,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
84180,70428048,389131,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2008-04-30,36.0,,,7.0,5,70428048,275.0,,,1972.0,2,,33124.0,Miami-Miami Beach-Kendall,FL,,,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
92890,70174923,47752,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2006-04-30,60.0,,18.834983,9.0,5,70174923,920.0,,,1980.0,2,Los Angeles,31084.0,Los Angeles-Long Beach-Glendale,CA,Western SFV,91306.0,N,,,Y,2005-04-13,2005-04-12,2005,2005,0
94326,70359704,507186,,,,NaT,NaT,,,2005-04-13,2005-04-12,2005-03-14,2008-04-30,60.0,,21.282539,7.0,5,70359704,2058.0,,,2003.0,4,Phoenix,38060.0,Phoenix-Mesa-Scottsdale,AZ,West I-10,85395.0,N,,,Y,2005-04-13,2005-04-12,2005,2005,0


### Update datatypes of construction_year, cbsaid and years_occupied

In [28]:
# Cast all three columns in one call: the two identifiers/labels become generic
# objects, years_occupied becomes a nullable integer.
lease_clean = lease_clean.astype({
    'construction_year': 'object',
    'cbsaid': 'object',
    'years_occupied': 'Int64',
})

In [29]:
# (rows, columns) before the flagged rows are removed.
lease_clean.shape

(1064611, 39)

In [30]:
#selecting rows that are to be kept and clean
# .copy() makes the filtered frame an independent object: later cells add and
# overwrite columns on lease_clean, and doing that on a filtered selection triggers
# SettingWithCopyWarning (hidden here because warnings are globally suppressed).
lease_clean = lease_clean[lease_clean['delete_rec'] == 'N'].copy()
lease_clean.head(10)

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,year_on_market,year_off_market,delete_rec,new_from_date,new_to_date,year_from,year_to,years_occupied
0,110498312,157648,,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006.0,2006.0,N,2006-10-05,2014-04-29,2006,2014.0,8.0
1,30028220,76048,,,,NaT,NaT,,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N,,,N,1982-01-15,2001-04-15,1982,2001.0,19.0
2,30187227,559441,,,,NaT,NaT,,,1986-01-15,2000-03-14,1985-12-16,2000-03-14,170.0,,,,5,30187227,,,,1985.0,2,Kansas City,28140.0,Kansas City,MO-KS,I-29 Corridor,64152.0,N,,,N,1986-01-15,2000-03-14,1986,2000.0,14.0
3,114096677,239837,,1388.0,1388.0,2013-12-31,2014-03-31,90.0,1.0,2014-02-14,2015-02-12,2014-01-15,2015-02-13,12.0,,18.914058,7.0,5,122849533,1388.0,,,1972.0,2,Houston,26420.0,Houston-The Woodlands-Sugar Land,TX,Riverway,77024.0,N,2013.0,2014.0,N,2014-02-14,2015-02-12,2014,2015.0,1.0
4,10587417,220914,,,,NaT,NaT,,,2001-06-30,NaT,2001-05-01,2006-06-30,60.0,,21.278033,,5,10587417,7200.0,,,1906.0,2,Boston,14460.0,Boston-Cambridge-Newton,MA-NH,Route 3 South,2382.0,Y,,,N,2001-06-30,NaT,2001,,
5,112542119,158280,,1748.0,1748.0,2011-09-30,2011-12-31,92.0,1.0,2011-12-01,2016-12-31,2011-11-01,2016-11-30,60.0,,43.79246,4.0,5,119795017,1748.0,,,1890.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Chelsea,10001.0,N,2011.0,2011.0,N,2011-12-01,2016-12-31,2011,2016.0,5.0
6,110870155,5432828,,4340.0,4340.0,2006-12-31,2007-06-30,181.0,1.0,2007-05-07,2015-07-18,2007-04-09,2012-05-01,60.0,,15.285945,13.0,5,112965808,4341.0,,1.0,1977.0,2,Other Market Areas,47940.0,Waterloo-Cedar Falls,IA,,50677.0,N,2006.0,2007.0,N,2007-05-07,2015-07-18,2007,2015.0,8.0
7,176410481,6123667,,,,NaT,NaT,,,2018-08-01,2019-07-31,NaT,2019-07-31,12.0,,,,5,276168081,1200.0,,,1972.0,2,Wichita,48620.0,Wichita,KS,Downtown,67214.0,N,,,N,2018-08-01,2019-07-31,2018,2019.0,1.0
8,112426408,1277351,,2262.0,2262.0,2009-03-31,2011-09-30,913.0,1.0,2008-09-14,2008-09-15,2011-08-22,2014-03-20,30.0,9.5,14.511821,13.0,5,119404081,2262.0,,,1981.0,2,Milwaukee/Madison,31540.0,Madison,WI,Northeast Madison,53704.0,N,2009.0,2011.0,N,2008-09-14,2008-09-15,2008,2008.0,0.0
9,70269244,455643,,,,NaT,NaT,,,2004-09-30,2005-04-12,2004-08-31,2007-10-31,60.0,,20.963555,4.0,5,70269244,1048.0,,,1985.0,3,Detroit,19820.0,Detroit-Warren-Dearborn,MI,Southfield,48034.0,N,,,N,2004-09-30,2005-04-12,2004,2005.0,1.0


In [31]:
# Compare the raw date columns with the resolved dates and flags side by side.
lease_clean[['from_date', 'lease_sign_date', 'to_date', 'lease_expiration_date', 'delete_rec', 'new_from_date', 'new_to_date',
            'currently_occupied_fl']]

Unnamed: 0,from_date,lease_sign_date,to_date,lease_expiration_date,delete_rec,new_from_date,new_to_date,currently_occupied_fl
0,2006-10-05,2006-09-05,2014-04-29,2011-10-05,N,2006-10-05,2014-04-29,N
1,1982-01-15,1981-12-16,2001-04-15,2000-04-14,N,1982-01-15,2001-04-15,N
2,1986-01-15,1985-12-16,2000-03-14,2000-03-14,N,1986-01-15,2000-03-14,N
3,2014-02-14,2014-01-15,2015-02-12,2015-02-13,N,2014-02-14,2015-02-12,N
4,2001-06-30,2001-05-01,NaT,2006-06-30,N,2001-06-30,NaT,Y
...,...,...,...,...,...,...,...,...
1064606,2009-01-01,2008-11-18,2016-11-30,2014-01-01,N,2009-01-01,2016-11-30,N
1064607,1988-06-15,1988-05-16,1988-06-15,1997-12-14,N,1988-06-15,1988-06-15,N
1064608,2012-05-15,2012-04-15,NaT,2015-05-14,N,2012-05-15,NaT,Y
1064609,2002-02-01,2002-01-01,2002-03-04,2003-01-31,N,2002-02-01,2002-03-04,N


### Filling the remaining null submarket_name values with cbsa_cities values

In [32]:
# Use the CBSA city name wherever the submarket is still missing.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form acts on a temporary Series and leaves the frame
# unchanged under pandas copy-on-write.
lease_clean['submarket_name'] = lease_clean['submarket_name'].fillna(lease_clean['cbsa_cities'])

In [33]:
# Largest days_on_market value; Series.max ignores missing values.
lease_clean['days_on_market'].max()

11506.0

In [34]:
# (rows, columns) after the rows flagged with delete_rec == 'Y' were removed.
lease_clean.shape

(1064496, 39)

In [35]:
# Number of distinct property ids (a missing id, if any, counts as one value).
lease_clean['property_id'].nunique(dropna=False)

175352

In [36]:
# Column inventory of the cleaned frame.
lease_clean.columns

Index(['leasedeal_id', 'property_id', 'renewal', 'sqft_min', 'sqft_max',
       'date_on_market', 'date_off_market', 'days_on_market', 'actual_vacancy',
       'from_date', 'to_date', 'lease_sign_date', 'lease_expiration_date',
       'lease_term_in_months', 'rate_actual', 'estimated_rent',
       'service_type_id', 'property_type_id', 'location_occupancy_id', 'rba',
       'tenant_improvement_allowance_persqft', 'free_months',
       'construction_year', 'building_rating_id', 'research_market_name',
       'cbsaid', 'cbsa_cities', 'cbsa_states', 'submarket_name', 'zip',
       'currently_occupied_fl', 'year_on_market', 'year_off_market',
       'delete_rec', 'new_from_date', 'new_to_date', 'year_from', 'year_to',
       'years_occupied'],
      dtype='object')

### Calculate actual_estimated_rent_ratio

In [37]:
# Actual rate divided by estimated rent; missing wherever either input is missing.
lease_clean['actual_esti_rent_ratio'] = lease_clean['rate_actual'].div(lease_clean['estimated_rent'])

In [38]:
# Preview the first two rows including the new actual_esti_rent_ratio column.
lease_clean.head(2)

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,year_on_market,year_off_market,delete_rec,new_from_date,new_to_date,year_from,year_to,years_occupied,actual_esti_rent_ratio
0,110498312,157648,,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006.0,2006.0,N,2006-10-05,2014-04-29,2006,2014,8,
1,30028220,76048,,,,NaT,NaT,,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N,,,N,1982-01-15,2001-04-15,1982,2001,19,


### Function to change into timestamp for calculations

In [39]:
# `standard_ts_format` is already defined earlier in the notebook; it is not redefined
# here so there is a single definition to maintain. date_on_market / date_off_market
# were already converted to timestamps before `lease_clean` was copied from `lease`,
# so they need no second conversion.

# Convert the RESOLVED dates produced by check_alldates into timestamps so the `.dt`
# accessor works on them. Sourcing these columns from the raw from_date / to_date
# instead would discard any fallback that check_alldates applied.
lease_clean['new_from_date'] = pd.to_datetime(lease_clean['new_from_date'])
lease_clean['new_to_date'] = pd.to_datetime(lease_clean['new_to_date'])


### Creating variables occupied_months and vacant_months

In [40]:
# Month spans are counted on calendar months only: (year difference * 12) plus the
# month difference. The day of the month is ignored.

# Occupied span: resolved lease start -> resolved lease end.
occupied_start, occupied_end = lease_clean['new_from_date'], lease_clean['new_to_date']
occupied_year_gap = occupied_end.dt.year - occupied_start.dt.year
occupied_month_gap = occupied_end.dt.month - occupied_start.dt.month
lease_clean['occupied_months'] = occupied_year_gap * 12 + occupied_month_gap

# Vacant span: date on market -> date off market.
vacant_start, vacant_end = lease_clean['date_on_market'], lease_clean['date_off_market']
vacant_year_gap = vacant_end.dt.year - vacant_start.dt.year
vacant_month_gap = vacant_end.dt.month - vacant_start.dt.month
lease_clean['vacant_months'] = vacant_year_gap * 12 + vacant_month_gap


In [41]:
# Column inventory after adding occupied_months and vacant_months.
lease_clean.columns

Index(['leasedeal_id', 'property_id', 'renewal', 'sqft_min', 'sqft_max',
       'date_on_market', 'date_off_market', 'days_on_market', 'actual_vacancy',
       'from_date', 'to_date', 'lease_sign_date', 'lease_expiration_date',
       'lease_term_in_months', 'rate_actual', 'estimated_rent',
       'service_type_id', 'property_type_id', 'location_occupancy_id', 'rba',
       'tenant_improvement_allowance_persqft', 'free_months',
       'construction_year', 'building_rating_id', 'research_market_name',
       'cbsaid', 'cbsa_cities', 'cbsa_states', 'submarket_name', 'zip',
       'currently_occupied_fl', 'year_on_market', 'year_off_market',
       'delete_rec', 'new_from_date', 'new_to_date', 'year_from', 'year_to',
       'years_occupied', 'actual_esti_rent_ratio', 'occupied_months',
       'vacant_months'],
      dtype='object')

In [42]:
# Preview the first rows with the new month-span columns.
lease_clean.head()

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,year_on_market,year_off_market,delete_rec,new_from_date,new_to_date,year_from,year_to,years_occupied,actual_esti_rent_ratio,occupied_months,vacant_months
0,110498312,157648,,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006.0,2006.0,N,2006-10-05,2014-04-29,2006,2014.0,8.0,,90.0,9.0
1,30028220,76048,,,,NaT,NaT,,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N,,,N,1982-01-15,2001-04-15,1982,2001.0,19.0,,231.0,
2,30187227,559441,,,,NaT,NaT,,,1986-01-15,2000-03-14,1985-12-16,2000-03-14,170.0,,,,5,30187227,,,,1985.0,2,Kansas City,28140.0,Kansas City,MO-KS,I-29 Corridor,64152.0,N,,,N,1986-01-15,2000-03-14,1986,2000.0,14.0,,170.0,
3,114096677,239837,,1388.0,1388.0,2013-12-31,2014-03-31,90.0,1.0,2014-02-14,2015-02-12,2014-01-15,2015-02-13,12.0,,18.914058,7.0,5,122849533,1388.0,,,1972.0,2,Houston,26420.0,Houston-The Woodlands-Sugar Land,TX,Riverway,77024.0,N,2013.0,2014.0,N,2014-02-14,2015-02-12,2014,2015.0,1.0,,12.0,3.0
4,10587417,220914,,,,NaT,NaT,,,2001-06-30,NaT,2001-05-01,2006-06-30,60.0,,21.278033,,5,10587417,7200.0,,,1906.0,2,Boston,14460.0,Boston-Cambridge-Newton,MA-NH,Route 3 South,2382.0,Y,,,N,2001-06-30,NaT,2001,,,,,


### Lease without downtime data

In [43]:
# Columns kept for the dataset without downtime (market/vacancy columns are left out).
without_downtime_cols = [
    'leasedeal_id', 'property_id', 'renewal',
    'from_date', 'to_date', 'lease_sign_date', 'lease_expiration_date',
    'lease_term_in_months', 'rate_actual', 'estimated_rent', 'actual_esti_rent_ratio',
    'service_type_id', 'property_type_id', 'location_occupancy_id', 'rba',
    'tenant_improvement_allowance_persqft', 'free_months', 'construction_year',
    'building_rating_id', 'research_market_name', 'cbsaid', 'cbsa_cities',
    'cbsa_states', 'submarket_name', 'zip', 'currently_occupied_fl',
    'new_from_date', 'new_to_date', 'year_from', 'year_to',
    'years_occupied', 'occupied_months',
]

# Select the columns and collapse rows that are identical across all of them.
lease_without_downtime = lease_clean[without_downtime_cols].drop_duplicates()
print(lease_without_downtime.shape)
lease_without_downtime.head(2)


(1054868, 32)


Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014,8,90.0
1,30028220,76048,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N,1982-01-15,2001-04-15,1982,2001,19,231.0


In [44]:
# Rows per leasedeal_id (counted via property_id), largest first; show the top 100.
rows_per_deal = (
    lease_without_downtime
    .groupby('leasedeal_id')
    .agg({'property_id': 'count'})
    .reset_index()
    .sort_values(by='property_id', ascending=False)
)
rows_per_deal[:100]


Unnamed: 0,leasedeal_id,property_id
0,10000015,1
703203,112409075,1
703237,112409275,1
703238,112409284,1
703239,112409291,1
703240,112409293,1
703241,112409295,1
703242,112409296,1
703243,112409297,1
703244,112409304,1


### Lease with downtime

In [45]:
# Column inventory used to pick the columns for the downtime dataset.
lease_clean.columns

Index(['leasedeal_id', 'property_id', 'renewal', 'sqft_min', 'sqft_max',
       'date_on_market', 'date_off_market', 'days_on_market', 'actual_vacancy',
       'from_date', 'to_date', 'lease_sign_date', 'lease_expiration_date',
       'lease_term_in_months', 'rate_actual', 'estimated_rent',
       'service_type_id', 'property_type_id', 'location_occupancy_id', 'rba',
       'tenant_improvement_allowance_persqft', 'free_months',
       'construction_year', 'building_rating_id', 'research_market_name',
       'cbsaid', 'cbsa_cities', 'cbsa_states', 'submarket_name', 'zip',
       'currently_occupied_fl', 'year_on_market', 'year_off_market',
       'delete_rec', 'new_from_date', 'new_to_date', 'year_from', 'year_to',
       'years_occupied', 'actual_esti_rent_ratio', 'occupied_months',
       'vacant_months'],
      dtype='object')

In [46]:
# Columns kept for the dataset with downtime (market dates and vacancy measures
# included, lease from/to dates left out).
with_downtime_cols = [
    'leasedeal_id', 'property_id', 'renewal', 'sqft_min', 'sqft_max',
    'date_on_market', 'date_off_market', 'days_on_market', 'actual_vacancy',
    'lease_term_in_months', 'rate_actual', 'estimated_rent', 'actual_esti_rent_ratio',
    'service_type_id', 'property_type_id', 'location_occupancy_id', 'rba',
    'tenant_improvement_allowance_persqft', 'free_months', 'construction_year',
    'building_rating_id', 'research_market_name', 'cbsaid', 'cbsa_cities',
    'cbsa_states', 'submarket_name', 'zip', 'currently_occupied_fl',
    'year_on_market', 'year_off_market',
    'vacant_months',
]

# Select the columns and collapse rows that are identical across all of them.
lease_with_downtime = lease_clean[with_downtime_cols].drop_duplicates()

In [47]:
# (rows, columns) of the downtime dataset after removing fully identical rows.
lease_with_downtime.shape

(1064386, 31)

In [48]:
# Keep only the rows that have a date_off_market value (rows, not columns, are dropped here).
has_off_market_date = lease_with_downtime['date_off_market'].notnull()
lease_with_downtime = lease_with_downtime[has_off_market_date]

In [49]:
# Count the rows that are still missing date_on_market.
lease_with_downtime['date_on_market'].isna().sum()

0

In [50]:
# Row/column count once rows without date_off_market have been removed.
lease_with_downtime.shape

(587873, 31)

In [51]:
# Number of rows per leasedeal_id, largest first; anything above 1 is a repeated deal.
(
    lease_with_downtime
    .groupby('leasedeal_id')['property_id']
    .count()
    .reset_index()
    .sort_values(by='property_id', ascending=False)
    .head(900)
)

Unnamed: 0,leasedeal_id,property_id
324160,112668627,19
379992,113666086,11
267456,112234470,10
441386,114433640,10
479832,118058441,9
479825,118057811,9
27060,110198517,9
394802,113817180,8
382004,113686242,8
571075,165745721,8


In [52]:
# Confirm which columns the downtime table carries.
lease_with_downtime.columns

Index(['leasedeal_id', 'property_id', 'renewal', 'sqft_min', 'sqft_max',
       'date_on_market', 'date_off_market', 'days_on_market', 'actual_vacancy',
       'lease_term_in_months', 'rate_actual', 'estimated_rent',
       'actual_esti_rent_ratio', 'service_type_id', 'property_type_id',
       'location_occupancy_id', 'rba', 'tenant_improvement_allowance_persqft',
       'free_months', 'construction_year', 'building_rating_id',
       'research_market_name', 'cbsaid', 'cbsa_cities', 'cbsa_states',
       'submarket_name', 'zip', 'currently_occupied_fl', 'year_on_market',
       'year_off_market', 'vacant_months'],
      dtype='object')

In [53]:
#Dropping duplicate leasedealid's
# keep=False discards *every* row of a leasedeal_id that appears more than once
# (no copy is retained), so each remaining leasedeal_id is unique.
# The result is rebound instead of using inplace=True: lease_with_downtime was
# produced by boolean filtering, and in-place edits on such a frame rely on
# copy/view behaviour that pandas is phasing out.
lease_with_downtime = lease_with_downtime.drop_duplicates(subset='leasedeal_id', keep=False)
lease_with_downtime

Unnamed: 0,leasedeal_id,property_id,renewal,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,year_on_market,year_off_market,vacant_months
0,110498312,157648,,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006,2006,9.0
3,114096677,239837,,1388.0,1388.0,2013-12-31,2014-03-31,90.0,1.0,12.0,,18.914058,,7.0,5,122849533,1388.0,,,1972.0,2,Houston,26420.0,Houston-The Woodlands-Sugar Land,TX,Riverway,77024.0,N,2013,2014,3.0
5,112542119,158280,,1748.0,1748.0,2011-09-30,2011-12-31,92.0,1.0,60.0,,43.792460,,4.0,5,119795017,1748.0,,,1890.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Chelsea,10001.0,N,2011,2011,3.0
6,110870155,5432828,,4340.0,4340.0,2006-12-31,2007-06-30,181.0,1.0,60.0,,15.285945,,13.0,5,112965808,4341.0,,1.0,1977.0,2,Other Market Areas,47940.0,Waterloo-Cedar Falls,IA,Waterloo-Cedar Falls,50677.0,N,2006,2007,6.0
8,112426408,1277351,,2262.0,2262.0,2009-03-31,2011-09-30,913.0,1.0,30.0,9.5,14.511821,0.654639,13.0,5,119404081,2262.0,,,1981.0,2,Milwaukee/Madison,31540.0,Madison,WI,Northeast Madison,53704.0,N,2009,2011,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064601,110794275,248465,,3708.0,3708.0,2007-06-30,2007-09-30,92.0,1.0,36.0,,19.301398,,7.0,5,112788389,3720.0,,1.0,1958.0,3,Los Angeles,31084.0,Los Angeles-Long Beach-Glendale,CA,Park Mile,90010.0,N,2007,2007,3.0
1064605,119754101,7314056,,2032.0,2032.0,2009-12-31,2015-12-31,2191.0,1.0,12.0,,14.170486,,13.0,5,134605381,2032.0,,,2000.0,1,Other Market Areas,27420.0,Jamestown,ND,Jamestown,58401.0,N,2009,2015,72.0
1064606,111384471,300227,,1750.0,1750.0,2008-06-30,2009-03-31,274.0,1.0,60.0,,21.513206,,4.0,5,114180680,1750.0,,,1965.0,3,Philadelphia,37980.0,Philadelphia-Camden-Wilmington,PA-NJ-DE-MD,Market Street West,19102.0,N,2008,2009,9.0
1064608,113201416,4157322,,2500.0,2500.0,2011-06-30,2012-06-30,366.0,1.0,36.0,15.0,15.071205,0.995275,6.0,5,120804094,2500.0,,,1923.0,2,Chicago,16980.0,Chicago-Naperville-Elgin,IL-IN-WI,Fulton Market/Near West Side,60607.0,Y,2011,2012,12.0


In [54]:
# Size of the downtime table after removing repeated leasedeal_ids.
print(lease_with_downtime.shape )

(570371, 31)


In [55]:
# Only the vacancy/downtime fields are taken from the downtime table;
# leasedeal_id is kept as the join key.
lease_with_downtime_v1 = lease_with_downtime.loc[:, [
    'leasedeal_id',
    'sqft_min', 'sqft_max',
    'date_on_market', 'date_off_market', 'days_on_market',
    'actual_vacancy',
    'year_on_market', 'year_off_market',
    'vacant_months',
]]
# Outer join: a lease found in only one of the two tables is kept, with the
# other table's columns left empty (NaN / NaT).
lease_clean_oct27 = lease_without_downtime.merge(
    lease_with_downtime_v1,
    how='outer',
    on='leasedeal_id',
)
lease_clean_oct27


Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014,8,90.0,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006,2006,9.0
1,30028220,76048,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N,1982-01-15,2001-04-15,1982,2001,19,231.0,,,NaT,NaT,,,,,
2,30187227,559441,,1986-01-15,2000-03-14,1985-12-16,2000-03-14,170.0,,,,,5,30187227,,,,1985.0,2,Kansas City,28140.0,Kansas City,MO-KS,I-29 Corridor,64152.0,N,1986-01-15,2000-03-14,1986,2000,14,170.0,,,NaT,NaT,,,,,
3,114096677,239837,,2014-02-14,2015-02-12,2014-01-15,2015-02-13,12.0,,18.914058,,7.0,5,122849533,1388.0,,,1972.0,2,Houston,26420.0,Houston-The Woodlands-Sugar Land,TX,Riverway,77024.0,N,2014-02-14,2015-02-12,2014,2015,1,12.0,1388.0,1388.0,2013-12-31,2014-03-31,90.0,1.0,2013,2014,3.0
4,10587417,220914,,2001-06-30,NaT,2001-05-01,2006-06-30,60.0,,21.278033,,,5,10587417,7200.0,,,1906.0,2,Boston,14460.0,Boston-Cambridge-Newton,MA-NH,Route 3 South,2382.0,Y,2001-06-30,NaT,2001,,,,,,NaT,NaT,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1054863,111384471,300227,,2009-01-01,2016-11-30,2008-11-18,2014-01-01,60.0,,21.513206,,4.0,5,114180680,1750.0,,,1965.0,3,Philadelphia,37980.0,Philadelphia-Camden-Wilmington,PA-NJ-DE-MD,Market Street West,19102.0,N,2009-01-01,2016-11-30,2009,2016,7,94.0,1750.0,1750.0,2008-06-30,2009-03-31,274.0,1.0,2008,2009,9.0
1054864,30011451,61383,,1988-06-15,1988-06-15,1988-05-16,1997-12-14,114.0,,,,11.0,5,30011451,,,,1902.0,4,Pittsburgh,38300.0,Pittsburgh,PA,CBD,15219.0,N,1988-06-15,1988-06-15,1988,1988,0,0.0,,,NaT,NaT,,,,,
1054865,113201416,4157322,,2012-05-15,NaT,2012-04-15,2015-05-14,36.0,15.0,15.071205,0.995275,6.0,5,120804094,2500.0,,,1923.0,2,Chicago,16980.0,Chicago-Naperville-Elgin,IL-IN-WI,Fulton Market/Near West Side,60607.0,Y,2012-05-15,NaT,2012,,,,2500.0,2500.0,2011-06-30,2012-06-30,366.0,1.0,2011,2012,12.0
1054866,10541376,303542,,2002-02-01,2002-03-04,2002-01-01,2003-01-31,12.0,,32.398496,,13.0,5,10541376,2395.0,,,1964.0,2,South Bay/San Jose,41940.0,San Jose-Sunnyvale-Santa Clara,CA,Los Gatos/Saratoga,95030.0,N,2002-02-01,2002-03-04,2002,2002,0,1.0,,,NaT,NaT,,,,,


In [56]:
# Show merged rows without a cbsaid; an empty result means none are missing.
missing_cbsaid = lease_clean_oct27['cbsaid'].isna()
lease_clean_oct27[missing_cbsaid]

Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months


### Create a new column cbsa_state_new that contains a single state per row

In [57]:
# Start from the CBSA state label (which can list several states, e.g. "NY-NJ"),
# then override the rows whose zip code identifies the state.
lease_clean_oct27['cbsa_state_new'] = lease_clean_oct27['cbsa_states']

# Zip codes that are assigned to Washington. One isin() mask replaces the
# eleven separate per-zip assignments and keeps the list easy to review.
wa_zips = [98607, 98660, 98661, 98662, 98663, 98664, 98665,
           98682, 98683, 98684, 98687]
lease_clean_oct27.loc[lease_clean_oct27['zip'].isin(wa_zips), 'cbsa_state_new'] = "WA"

In [58]:
#NY-NJ change the cbsa_state of the Jersey City zip codes to NJ
#had to drop the leading "0"s from the zipcodes
# (zip is stored as a number, so e.g. "07030" appears as 7030)

# One isin() mask replaces the eleven separate per-zip assignments.
nj_zips = [7030, 7032, 7087, 7094, 7302, 7304, 7305, 7306, 7307, 7310, 7311]
lease_clean_oct27.loc[lease_clean_oct27['zip'].isin(nj_zips), 'cbsa_state_new'] = "NJ"

In [59]:
# Preview the first rows, including the new cbsa_state_new column.
lease_clean_oct27.head()

Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months,cbsa_state_new
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014.0,8.0,90.0,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006.0,2006.0,9.0,NY-NJ
1,30028220,76048,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N,1982-01-15,2001-04-15,1982,2001.0,19.0,231.0,,,NaT,NaT,,,,,,MO-KS
2,30187227,559441,,1986-01-15,2000-03-14,1985-12-16,2000-03-14,170.0,,,,,5,30187227,,,,1985.0,2,Kansas City,28140.0,Kansas City,MO-KS,I-29 Corridor,64152.0,N,1986-01-15,2000-03-14,1986,2000.0,14.0,170.0,,,NaT,NaT,,,,,,MO-KS
3,114096677,239837,,2014-02-14,2015-02-12,2014-01-15,2015-02-13,12.0,,18.914058,,7.0,5,122849533,1388.0,,,1972.0,2,Houston,26420.0,Houston-The Woodlands-Sugar Land,TX,Riverway,77024.0,N,2014-02-14,2015-02-12,2014,2015.0,1.0,12.0,1388.0,1388.0,2013-12-31,2014-03-31,90.0,1.0,2013.0,2014.0,3.0,TX
4,10587417,220914,,2001-06-30,NaT,2001-05-01,2006-06-30,60.0,,21.278033,,,5,10587417,7200.0,,,1906.0,2,Boston,14460.0,Boston-Cambridge-Newton,MA-NH,Route 3 South,2382.0,Y,2001-06-30,NaT,2001,,,,,,NaT,NaT,,,,,,MA-NH


In [60]:
# Strip leading whitespace from cbsa_state_new (cbsa_states itself is left as is).
lease_clean_oct27['cbsa_state_new'] = lease_clean_oct27['cbsa_state_new'].str.lstrip()

In [61]:
# Reduce multi-state labels such as "NY-NJ" to their first two characters ("NY").
lease_clean_oct27['cbsa_state_new'] = lease_clean_oct27['cbsa_state_new'].str.slice(0, 2)

In [62]:
# Row/column count after adding cbsa_state_new.
lease_clean_oct27.shape

(1054868, 42)

In [63]:
# Spot-check that cbsa_state_new now holds a single two-letter state.
lease_clean_oct27.head(2)

Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months,cbsa_state_new
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014,8,90.0,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006.0,2006.0,9.0,NY
1,30028220,76048,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,Ward Parkway,64114.0,N,1982-01-15,2001-04-15,1982,2001,19,231.0,,,NaT,NaT,,,,,,MO


In [64]:
#Rearrange the columns
# cbsa_state_new is moved next to cbsa_states; the other columns keep their order.
lease_columns = [
    'leasedeal_id', 'property_id', 'renewal', 'from_date', 'to_date',
    'lease_sign_date', 'lease_expiration_date', 'lease_term_in_months',
    'rate_actual', 'estimated_rent', 'actual_esti_rent_ratio',
    'service_type_id', 'property_type_id', 'location_occupancy_id', 'rba',
    'tenant_improvement_allowance_persqft', 'free_months',
    'construction_year', 'building_rating_id',
]
location_columns = [
    'research_market_name', 'cbsaid', 'cbsa_cities', 'cbsa_states',
    'cbsa_state_new', 'submarket_name', 'zip',
]
occupancy_columns = [
    'currently_occupied_fl', 'new_from_date', 'new_to_date', 'year_from',
    'year_to', 'years_occupied', 'occupied_months',
]
downtime_columns = [
    'sqft_min', 'sqft_max', 'date_on_market', 'date_off_market',
    'days_on_market', 'actual_vacancy', 'year_on_market', 'year_off_market',
    'vacant_months',
]
lease_clean_oct27 = lease_clean_oct27[
    lease_columns + location_columns + occupancy_columns + downtime_columns
]

In [65]:
# Confirm the new column order on a single row.
lease_clean_oct27.head(1)

Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,cbsa_state_new,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,NY,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014,8,90.0,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006,2006,9.0


In [66]:
# Summary statistics of the numeric columns, transposed so each column is one row.
lease_clean_oct27.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
leasedeal_id,1054868.0,90975230.0,40965940.0,10000020.0,70090450.0,111316400.0,113500600.0,187389000.0
property_id,1054868.0,1256598.0,2217581.0,1.0,178507.0,381455.5,787691.0,11589210.0
renewal,214775.0,0.7200512,0.4489748,0.0,0.0,1.0,1.0,1.0
lease_term_in_months,1054868.0,48.1061,39.89575,0.0,24.0,36.0,60.0,1236.0
rate_actual,215738.0,22.49681,353.8347,0.0,12.0,16.2,22.0,84000.0
estimated_rent,916616.0,22.76925,10.94788,1.060077,16.27733,20.21465,25.86487,1222.861
actual_esti_rent_ratio,213635.0,1.098423,18.48203,0.0,0.7551587,0.8980032,1.009387,3344.603
service_type_id,821251.0,8.260764,2.825951,1.0,7.0,7.0,10.0,19.0
property_type_id,1054868.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0
location_occupancy_id,1054868.0,99886170.0,53049820.0,10000020.0,70093350.0,114025200.0,121598100.0,284204600.0


In [67]:
#Check how many rows each leasedealid has
# Sorted largest first, so a top value of 1 means every leasedeal_id is unique.
(
    lease_clean_oct27
    .groupby('leasedeal_id')['property_id']
    .count()
    .reset_index()
    .sort_values(by='property_id', ascending=False)
    .head(900)
)

Unnamed: 0,leasedeal_id,property_id
0,10000015,1
703203,112409075,1
703237,112409275,1
703238,112409284,1
703239,112409291,1
703240,112409293,1
703241,112409295,1
703242,112409296,1
703243,112409297,1
703244,112409304,1


In [68]:
# Row/column count of the merged table before further filtering.
lease_clean_oct27.shape

(1054868, 42)

In [69]:
# Inspect the dtype of every column.
lease_clean_oct27.dtypes

leasedeal_id                                     int64
property_id                                      int64
renewal                                        float64
from_date                               datetime64[ns]
to_date                                 datetime64[ns]
lease_sign_date                         datetime64[ns]
lease_expiration_date                   datetime64[ns]
lease_term_in_months                           float64
rate_actual                                    float64
estimated_rent                                 float64
actual_esti_rent_ratio                         float64
service_type_id                                float64
property_type_id                                 int64
location_occupancy_id                            int64
rba                                            float64
tenant_improvement_allowance_persqft           float64
free_months                                    float64
construction_year                               object
building_r

### Exclude rows where lease_term_in_months = 0

In [70]:
# Keep every row whose lease term is not exactly 0 months.
nonzero_term = lease_clean_oct27['lease_term_in_months'] != 0
lease_clean_oct27 = lease_clean_oct27[nonzero_term]

In [71]:
# Row/column count after removing zero-month leases.
lease_clean_oct27.shape

(1038594, 42)

### Impute estimated_rent null values in 4 steps
Total null values: 128080

First, use rate_actual to fill estimated_rent

Second, by submarket_name and building_rating_id (group mean)

Third, by submarket_name (group mean)

Fourth, by cbsaid (group mean)

In [72]:
# Missing estimated_rent values before any imputation.
lease_clean_oct27['estimated_rent'].isna().sum()

128080

In [73]:
# Step 1: where estimated_rent is missing, fall back to the lease's own rate_actual.
# The filled column is written back through assign() instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# modifies a temporary Series and leaves the DataFrame unchanged when pandas
# Copy-on-Write is active, and the accompanying warning would be hidden by the
# notebook's warnings filter.
lease_clean_oct27 = lease_clean_oct27.assign(
    estimated_rent=lease_clean_oct27['estimated_rent'].fillna(lease_clean_oct27['rate_actual'])
)

In [74]:
# Missing estimated_rent values left after the rate_actual fallback.
lease_clean_oct27['estimated_rent'].isna().sum()

125979

In [75]:
#Impute estimated_rent using submarket_name and building_rating_id
# Mean estimated_rent of every (submarket_name, building_rating_id) pair.
mean_estimated_rent = lease_clean_oct27.groupby(['submarket_name', 'building_rating_id']).\
            agg({'estimated_rent': 'mean'}).reset_index()

# Rows that still lack estimated_rent, with the keys needed to look up a group mean.
estimated_rent_group = lease_clean_oct27[(lease_clean_oct27['estimated_rent'].isnull())][
    ['leasedeal_id', 'submarket_name', 'cbsaid', 'building_rating_id', 'estimated_rent']]
# Replace each missing value with the mean of its group; it stays NaN when the
# group has no mean to offer.
estimated_rent_merge = pd.merge(estimated_rent_group, mean_estimated_rent, \
                on=['submarket_name', 'building_rating_id'], how='left').drop(columns=['estimated_rent_x']).\
                rename(columns={'estimated_rent_y': 'estimated_rent'})

# Bring the looked-up means back onto the full table by leasedeal_id
# (estimated_rent_x = current value, estimated_rent_y = group mean).
lease_clean_oct27 = pd.merge(lease_clean_oct27, estimated_rent_merge[['leasedeal_id', 'estimated_rent']], on='leasedeal_id',
                                                       how='left')
# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# selected column is chained in-place assignment, which leaves the DataFrame
# untouched when pandas Copy-on-Write is active.
lease_clean_oct27['estimated_rent_x'] = lease_clean_oct27['estimated_rent_x'].fillna(
    lease_clean_oct27['estimated_rent_y'])
lease_clean_oct27 = lease_clean_oct27.drop(columns=['estimated_rent_y']).\
        rename(columns = {'estimated_rent_x': 'estimated_rent'})
lease_clean_oct27.head(2)


Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,cbsa_state_new,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,NY,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014,8,90.0,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006.0,2006.0,9.0
1,30028220,76048,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,16.819759,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,MO,Ward Parkway,64114.0,N,1982-01-15,2001-04-15,1982,2001,19,231.0,,,NaT,NaT,,,,,


In [76]:
# Missing estimated_rent values left after the submarket + building-rating step.
lease_clean_oct27['estimated_rent'].isna().sum()

905

In [77]:
#Impute the remaining 905 estimated rent null values using submarket_name
# Mean estimated_rent of every submarket_name.
mean_estimated_rent = lease_clean_oct27.groupby(['submarket_name']).\
            agg({'estimated_rent': 'mean'}).reset_index()

# Rows that still lack estimated_rent, with the key needed to look up a submarket mean.
estimated_rent_group = lease_clean_oct27[(lease_clean_oct27['estimated_rent'].isnull())][
    ['leasedeal_id', 'submarket_name', 'cbsaid', 'estimated_rent']]
# Replace each missing value with its submarket mean; it stays NaN when the
# submarket has no mean to offer.
estimated_rent_merge = pd.merge(estimated_rent_group, mean_estimated_rent, \
                on=['submarket_name'], how='left').drop(columns=['estimated_rent_x']).\
                rename(columns={'estimated_rent_y': 'estimated_rent'})

# Bring the looked-up means back onto the full table by leasedeal_id
# (estimated_rent_x = current value, estimated_rent_y = submarket mean).
lease_clean_oct27 = pd.merge(lease_clean_oct27, estimated_rent_merge[['leasedeal_id', 'estimated_rent']], on='leasedeal_id',
                                                       how='left')
# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# selected column is chained in-place assignment, which leaves the DataFrame
# untouched when pandas Copy-on-Write is active.
lease_clean_oct27['estimated_rent_x'] = lease_clean_oct27['estimated_rent_x'].fillna(
    lease_clean_oct27['estimated_rent_y'])
lease_clean_oct27 = lease_clean_oct27.drop(columns=['estimated_rent_y']).\
        rename(columns = {'estimated_rent_x': 'estimated_rent'})
lease_clean_oct27.head(2)


Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,cbsa_state_new,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,NY,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014,8,90.0,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006.0,2006.0,9.0
1,30028220,76048,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,16.819759,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,MO,Ward Parkway,64114.0,N,1982-01-15,2001-04-15,1982,2001,19,231.0,,,NaT,NaT,,,,,


In [78]:
# Shape of the rows still missing estimated_rent after the submarket step.
rent_missing = lease_clean_oct27['estimated_rent'].isna()
lease_clean_oct27[rent_missing].shape

(116, 42)

In [79]:
#Finally impute the remaining 116 null estimated rent using cbsaid. We are left with 7 null rows after this which is ok
# Mean estimated_rent of every cbsaid.
mean_estimated_rent = lease_clean_oct27.groupby(['cbsaid']).\
            agg({'estimated_rent': 'mean'}).reset_index()

# Rows that still lack estimated_rent, with the key needed to look up a CBSA mean.
estimated_rent_group = lease_clean_oct27[(lease_clean_oct27['estimated_rent'].isnull())][
    ['leasedeal_id', 'cbsaid', 'estimated_rent']]
# Replace each missing value with its CBSA mean; it stays NaN when the CBSA
# has no mean to offer.
estimated_rent_merge = pd.merge(estimated_rent_group, mean_estimated_rent, \
                on=['cbsaid'], how='left').drop(columns=['estimated_rent_x']).\
                rename(columns={'estimated_rent_y': 'estimated_rent'})

# Bring the looked-up means back onto the full table by leasedeal_id
# (estimated_rent_x = current value, estimated_rent_y = CBSA mean).
lease_clean_oct27 = pd.merge(lease_clean_oct27, estimated_rent_merge[['leasedeal_id', 'estimated_rent']], on='leasedeal_id',
                                                       how='left')
# Assign the filled column back explicitly. Calling fillna(inplace=True) on the
# selected column is chained in-place assignment, which leaves the DataFrame
# untouched when pandas Copy-on-Write is active.
lease_clean_oct27['estimated_rent_x'] = lease_clean_oct27['estimated_rent_x'].fillna(
    lease_clean_oct27['estimated_rent_y'])
lease_clean_oct27 = lease_clean_oct27.drop(columns=['estimated_rent_y']).\
        rename(columns = {'estimated_rent_x': 'estimated_rent'})
lease_clean_oct27.head(2)


Unnamed: 0,leasedeal_id,property_id,renewal,from_date,to_date,lease_sign_date,lease_expiration_date,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,rba,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,cbsa_state_new,submarket_name,zip,currently_occupied_fl,new_from_date,new_to_date,year_from,year_to,years_occupied,occupied_months,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months
0,110498312,157648,,2006-10-05,2014-04-29,2006-09-05,2011-10-05,60.0,,39.202936,,4.0,5,111764312,3650.0,,,1928.0,3,New York City,35614.0,New York-Jersey City-White Plains,NY-NJ,NY,Murray Hill,10016.0,N,2006-10-05,2014-04-29,2006,2014,8,90.0,3650.0,3650.0,2006-03-31,2006-12-31,275.0,1.0,2006.0,2006.0,9.0
1,30028220,76048,,1982-01-15,2001-04-15,1981-12-16,2000-04-14,219.0,,16.819759,,11.0,5,30028220,,,,1977.0,3,Kansas City,28140.0,Kansas City,MO-KS,MO,Ward Parkway,64114.0,N,1982-01-15,2001-04-15,1982,2001,19,231.0,,,NaT,NaT,,,,,


In [80]:
# Shape of the rows still missing estimated_rent after the cbsaid step.
rent_missing = lease_clean_oct27['estimated_rent'].isna()
lease_clean_oct27[rent_missing].shape

(7, 42)

In [106]:
# Round both rent figures to two decimal places.
for rent_column in ('estimated_rent', 'actual_esti_rent_ratio'):
    lease_clean_oct27[rent_column] = lease_clean_oct27[rent_column].round(2)

In [107]:
# Sanity check: average estimated_rent for cbsaid 35614
# (shown as "New York-Jersey City-White Plains" in the rows displayed above).
in_cbsa_35614 = lease_clean_oct27['cbsaid'] == 35614.0
lease_clean_oct27.loc[in_cbsa_35614, 'estimated_rent'].mean()

40.52641706072185

In [108]:
# Confirm that imputation and rounding left the row/column count unchanged.
lease_clean_oct27.shape

(1038594, 42)

In [109]:
# Inspect the dtype of every column after imputation.
lease_clean_oct27.dtypes

leasedeal_id                                     int64
property_id                                      int64
renewal                                        float64
from_date                               datetime64[ns]
to_date                                 datetime64[ns]
lease_sign_date                         datetime64[ns]
lease_expiration_date                   datetime64[ns]
lease_term_in_months                           float64
rate_actual                                    float64
estimated_rent                                 float64
actual_esti_rent_ratio                         float64
service_type_id                                float64
property_type_id                                 int64
location_occupancy_id                            int64
rba                                            float64
tenant_improvement_allowance_persqft           float64
free_months                                    float64
construction_year                               object
building_r