# Fitting Distributions 

In [1]:
#libraries

import pandas as pd
import datetime
import psycopg2
import numpy as np
import os
import seaborn as sns
pd.set_option("display.max_columns",999)
import seaborn as sns
from fitter import Fitter, get_common_distributions, get_distributions

import warnings
warnings.filterwarnings('ignore')

pd.options.display.float_format = '{:.2f}'.format
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')

In [2]:
# paths

LOCAL = os.environ['LOCAL_REPOSITORY_LOCATION']
folder_dist = 'DAPT\cost\distributions'
path_dist = os.path.join(LOCAL, folder_dist)
fn = 'lease_clean_oct29.csv'

In [169]:
# read in data from RDS

def read_data():
    # reads in Oct 29 rds lease data
    
    conn = psycopg2.connect(
        host = 'lease-data.cnzawwknyviz.us-east-1.rds.amazonaws.com',
        port = 5432,
        user = 'costar',
        password = 'Costar12',
        database='costar'
        )
    cursor = conn.cursor() 

    try:
        df = pd.read_csv('downtime_lease_nov8.csv')
        
    except:
        q = '''
            SELECT
                *
            FROM downtime_lease_nov8
            '''
        df = pd.read_sql(q, 
                         con = conn)

    return df

dfm = read_data()

dfm.shape

(557607, 29)

In [35]:
# resources
'''
https://towardsdatascience.com/finding-the-best-
distribution-that-fits-your-data-using-pythons-
fitter-library-319a5a0972e9
'''

'\nhttps://towardsdatascience.com/finding-the-best-distribution-that-fits-your-data-using-pythons-fitter-library-319a5a0972e9\n\n\n'

#### Functions

In [5]:
def desc_stat_col(column):
    '''quick function that produces 
        descriptive stats on a chosen column'''

    pcts = [.01, .05, .10, .25, .5, .75, .90, .95, .99]

    df = dfm[column]
    
    print('DESCRIPTIVE STATS WITHOUT MODIFICATIONS:')
    print('\r')
    print(df.describe(percentiles = pcts).T)

In [6]:
def make_hist(column, bins, lower_pct, upper_pct, color):
    '''Produces descr stats, 
        removes lower & upper bounds (if desired),
        produces histogram'''
    
    df = dfm
    
    min_down = df.days_on_market.min()
    
    range_min = df[column].quantile(lower_pct)
    range_max = df[column].quantile(upper_pct)
    
    print('\r')
    print('\r')
    print(f'HISTOGRAM WITH LOWER & UPPER BOUNDS REMOVED, lower: {lower_pct*100}%, upper: {upper_pct*100}%')
    df[column].hist(bins = bins, range = (range_min, range_max), color = color)

In [7]:
def make_violin(column, color):

    sns.set_theme(style="whitegrid")
    ax = sns.violinplot(data = dfm, x=dfm[column], color = color, width = 2)   
    
# make_violin('days_on_market', 'red')
# make_violin('new_days_on_market', 'green')

In [8]:
def create_tag_column(column, lower_pct, upper_pct):
    
    new_column = 'new_' + column
    min_range = dfm[column].quantile(lower_pct)
    max_range = dfm[column].quantile(upper_pct)

    dfm[new_column] = dfm[column][(dfm[column] >= min_range) & (dfm[column] <= max_range)]
    print('\r')
    print(f'NEW COLUMN CREATED WITH UPPER & LOWER BOUNDS: {dfm.columns[-1]}')

In [33]:
'''DOWNTIME: 
    Rows above 95 percentile were eliminated
    for downtime per Rob's instruction; landlords
    often have buildings off market for major rennovations.'''

# parameters: column
desc_stat_col('vacant_months')

# parameters: column, bins, lower_pct, upper_pct, color
make_hist('vacant_months', 60, 0, .95, 'aqua')

# parameters: column, lower, upper 
create_tag_column('vacant_months', .95, 1)

# USE DOWNTIME W/O REMOVING OUTLIERS

In [54]:
# import fitter
# f = fitter.Fitter(dt)

# # just a trick to use only 10 distributions instead of 80 to speed up the fitting
# f.distributions = f.distributions[0:10] + ['lognorm']

# # fit and plot
# f.fit()
# f.summary()

### Downtime Geographical Heirarchy

In [254]:
df = dfm

In [261]:
df.shape

(557607, 29)

In [262]:
#strip whitespace
df.cbsa_state_new = df.cbsa_state_new.str.strip()

#replace '-' w/ '_'
df.cbsa_state_new = df.cbsa_state_new.replace('-', '_')

In [293]:
# https://stackoverflow.com/questions/30215677/how-to-create-a-data-frame-for-each-group-in-the-pandas-groupby-function
grouped = df.groupby('cbsa_state_new')

d = {}

for name, group in grouped:
    d[str(name)] = group
    
    

In [297]:
d['WY']

Unnamed: 0,leasedeal_id,property_id,lease_term_in_months,rate_actual,estimated_rent,actual_esti_rent_ratio,service_type_id,property_type_id,location_occupancy_id,tenant_improvement_allowance_persqft,free_months,construction_year,building_rating_id,research_market_name,cbsaid,cbsa_cities,cbsa_states,cbsa_state_new,submarket_name,zip,sqft_min,sqft_max,date_on_market,date_off_market,days_on_market,actual_vacancy,year_on_market,year_off_market,vacant_months
1072,111120848,1376017,12.00,,17.74,,13.00,5,113572927,,,1911.0,3,Other Market Areas,16940.0,Cheyenne,WY,WY,Cheyenne,82001.00,1900.00,1900.00,2007-06-30,2008-06-30,366.00,1.00,2007,2008,12.00
11761,122498431,5526863,12.00,,21.34,,13.00,5,139753571,,,1927.0,3,Other Market Areas,16940.0,Cheyenne,WY,WY,Cheyenne,82001.00,2270.00,2270.00,2014-12-31,2016-03-31,456.00,1.00,2014,2016,15.00
15950,112376098,6535958,60.00,9.00,9.30,0.97,10.00,5,119299479,,,1920.0,2,Other Market Areas,16220.0,Casper,WY,WY,Casper,82601.00,2150.00,2150.00,2009-06-30,2011-03-31,639.00,1.00,2009,2011,21.00
19538,113511878,6540953,36.00,,14.46,,7.00,5,121601478,,,1984.0,3,Other Market Areas,16940.0,Cheyenne,WY,WY,Cheyenne,82009.00,1500.00,1500.00,2008-06-30,2012-12-31,1645.00,1.00,2008,2012,54.00
21004,112447792,803443,36.00,,15.82,,7.00,5,119446572,,2.00,1971.0,3,Other Market Areas,16940.0,Cheyenne,WY,WY,Cheyenne,82001.00,4000.00,11000.00,2007-03-31,2011-03-31,1461.00,1.00,2007,2011,48.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542460,112376108,6535958,12.00,9.00,9.30,0.97,7.00,5,119299492,,,1920.0,2,Other Market Areas,16220.0,Casper,WY,WY,Casper,82601.00,2750.00,2750.00,2011-03-31,2011-06-30,91.00,1.00,2011,2011,3.00
547068,111120870,5526878,36.00,,19.31,,13.00,5,113572953,,,1979.0,3,Other Market Areas,16940.0,Cheyenne,WY,WY,Cheyenne,82001.00,2107.00,2107.00,2007-03-31,2008-06-30,457.00,1.00,2007,2008,15.00
548818,113513902,1376017,36.00,14.00,14.60,0.96,7.00,5,121605108,,,1911.0,3,Other Market Areas,16940.0,Cheyenne,WY,WY,Cheyenne,82001.00,350.00,450.00,2012-09-30,2013-03-31,182.00,1.00,2012,2013,6.00
550523,112678514,803443,36.00,15.00,15.38,0.98,7.00,5,120036255,,1.00,1971.0,3,Other Market Areas,16940.0,Cheyenne,WY,WY,Cheyenne,82001.00,4000.00,6300.00,2007-03-31,2012-03-31,1827.00,1.00,2007,2012,60.00


In [300]:
# get_distributions()

for s in dfm.cbsa_state_new:
    f = Fitter(d['WY'],
               distributions = [
                                'cauchy',
                                'lognorm',
                               ])

f.fit()
f.summary()

TypeError: '<=' not supported between instances of 'float' and 'str'

In [50]:
d = f.get_best(method = 'aic')

dfd = pd.DataFrame(d)

dfd

Unnamed: 0,lognorm
loc,-0.0
s,8.85
scale,3.85


In [44]:
list(get_common_distributions)

TypeError: 'function' object is not iterable

In [42]:
df

<function fitter.fitter.get_common_distributions()>