In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import warnings

In [None]:
# Read the eviction dataset and keep only the two columns used in this analysis.
data = pd.read_csv('EvictionMemphis2020.csv')
variables = ['EvicTotal','HH_Median']
data_selected = data.loc[:, variables]

# Keep rows where both columns are strictly positive. Comparisons against NaN
# are False, so rows with missing values are dropped as well. Positivity is
# required because the natural log of these columns is taken later on.
positive_evictions = data_selected['EvicTotal'].gt(0)
positive_hh_median = data_selected['HH_Median'].gt(0)
data_cleaned = data_selected.loc[positive_evictions & positive_hh_median]

# Report how many rows the filter removed.
print('# of observations (original):', len(data))
print('# of observations (cleaned):', len(data_cleaned))

In [None]:
# Summary statistics of the cleaned (raw-scale) columns, printed under a heading.
descriptive_stats = data_cleaned.describe()
print("Descriptive statistics for the selected variables:", descriptive_stats, sep = "\n")

In [None]:
# Per-column sample skewness of the cleaned (raw-scale) data.
skewness_values = data_cleaned.skew()
print("Skewness of the selected variables:", skewness_values, sep = "\n")

In [None]:
def corrfunc(x, y, **kwargs):
    """Write the Pearson correlation of ``x`` and ``y`` onto the current axes.

    The coefficient is rounded to two decimals and drawn in red at axes
    coordinates (0.6, 0.1). Extra keyword arguments are accepted, so the
    function can be passed to a grid ``map_*`` method, and are ignored.
    """
    # Only the correlation coefficient is shown; the p-value is not used.
    r_value, _p_value = pearsonr(x, y)
    label = 'r = {}'.format(np.round(r_value, 2))
    axes = plt.gca()
    axes.annotate(label, xy = (0.6, 0.1), xycoords = axes.transAxes,
                  fontsize = 12, color = 'red')
# Pair grid of the cleaned variables:
#   upper triangle - scatter with fitted regression line, annotated with r
#   diagonal       - histograms (no KDE overlay)
#   lower triangle - bivariate KDE contours
g = sns.PairGrid(data_cleaned, diag_sharey = False)
(
    g.map_upper(sns.regplot, scatter_kws = {'alpha': 0.3})
     .map_upper(corrfunc)   # drawn after regplot so the label sits on top
     .map_diag(sns.histplot, kde = False)
     .map_lower(sns.kdeplot, cmap = 'Spectral_r')
)
# y > 1 places the figure title above the top row of axes.
plt.suptitle('Scatterplot for the variables', y = 1.1)
plt.show()

In [None]:
# Natural-log transform of both columns (all values are > 0 after cleaning),
# with the columns renamed so the log scale is explicit downstream.
data_log = np.log(data_cleaned).rename(
    columns = {'EvicTotal': 'Log_EvicTotal', 'HH_Median': 'Log_HH_Median'}
)

In [None]:
# Summary statistics of the log-transformed columns.
# NOTE: this rebinds `descriptive_stats`, replacing the raw-scale summary.
descriptive_stats = data_log.describe()
print("Descriptive statistics for log variables:", descriptive_stats, sep = "\n")

In [None]:
# Per-column sample skewness after the log transform.
# NOTE: this rebinds `skewness_values`, replacing the raw-scale result.
skewness_values = data_log.skew()
print("Skewness of the log variables:", skewness_values, sep = "\n")

In [None]:
# NOTE(review): a function with this same name and body is also defined in an
# earlier cell of this notebook; this definition shadows it.
def corrfunc(x, y, **kwargs):
    """Write the Pearson correlation of ``x`` and ``y`` onto the current axes.

    The coefficient is rounded to two decimals and drawn in red at axes
    coordinates (0.6, 0.1). Extra keyword arguments are accepted, so the
    function can be passed to a grid ``map_*`` method, and are ignored.
    """
    # Only the correlation coefficient is shown; the p-value is not used.
    r_value, _p_value = pearsonr(x, y)
    label = 'r = {}'.format(np.round(r_value, 2))
    axes = plt.gca()
    axes.annotate(label, xy = (0.6, 0.1), xycoords = axes.transAxes,
                  fontsize = 12, color = 'red')
# Pair grid of the log-transformed variables:
#   upper triangle - scatter with fitted regression line, annotated with r
#   diagonal       - histograms (no KDE overlay)
#   lower triangle - bivariate KDE contours
g = sns.PairGrid(data_log, diag_sharey = False)
(
    g.map_upper(sns.regplot, scatter_kws = {'alpha': 0.3})
     .map_upper(corrfunc)   # drawn after regplot so the label sits on top
     .map_diag(sns.histplot, kde = False)
     .map_lower(sns.kdeplot, cmap = 'Spectral_r')
)
# y > 1 places the figure title above the top row of axes.
plt.suptitle('Scatterplot for the log variables', y = 1.1)
plt.show()

In [None]:
import statsmodels.api as regression

In [None]:
# Simple OLS on the raw scale: EvicTotal ~ const + HH_Median.
y = data_cleaned['EvicTotal']
# add_constant adds the intercept column; OLS does not include one by default.
X = regression.add_constant(data_cleaned['HH_Median'])
model = regression.OLS(endog = y, exog = X).fit()
print(model.summary())

In [None]:
# Scatter of EvicTotal against HH_Median with seaborn's fitted regression line.
fig, ax = plt.subplots(figsize = (8, 6))
sns.regplot(x = 'HH_Median', y = 'EvicTotal', data = data_cleaned,
            scatter_kws = {'alpha': 0.5}, line_kws = {'color': 'blue'},
            ax = ax)
ax.set_title('Linear Regression: EvicTotal vs HH_Median')
plt.show()

In [None]:
# Residual diagnostics for the raw-scale model: residuals are scaled by their
# own standard deviation (np.std, i.e. ddof = 0) and plotted against the fit.
y_pred = model.predict(X)
residuals = y.sub(y_pred)
standardized_residuals = residuals.div(np.std(residuals))

fig, ax = plt.subplots(figsize = (8, 6))
ax.scatter(y_pred, standardized_residuals, alpha = 0.5)
# Reference lines: zero (red) and the +/-2 band (green).
for level, line_color, line_width in ((0, 'r', 2), (2, 'g', 1), (-2, 'g', 1)):
    ax.axhline(y = level, color = line_color, linestyle = '--',
               linewidth = line_width)
ax.set_title('Standardized vs. Fitted values')
ax.set_xlabel('Fitted Values (Predicted EvicTotal)')
ax.set_ylabel('Standardized residuals')
plt.show()

In [None]:
# Log-log OLS: Log_EvicTotal ~ const + Log_HH_Median.
y_log = data_log['Log_EvicTotal']
# The design matrix must be built from the LOG predictor. Adding the constant
# to `X` (the raw-scale design matrix of the linear model) would silently fit
# Log_EvicTotal on untransformed HH_Median instead.
X_log = regression.add_constant(data_log['Log_HH_Median'])
# NOTE: `model` is rebound here; the residual cell for the log model relies on
# `model` being this fit together with `X_log`.
model = regression.OLS(y_log, X_log).fit()
print(model.summary())

In [None]:
# Scatter of Log_EvicTotal against Log_HH_Median with seaborn's fitted line.
fig, ax = plt.subplots(figsize = (8, 6))
sns.regplot(x = 'Log_HH_Median', y = 'Log_EvicTotal', data = data_log,
            scatter_kws = {'alpha': 0.5}, line_kws = {'color': 'blue'},
            ax = ax)
ax.set_title('Linear Regression: Log_EvicTotal vs Log_HH_Median')
plt.show()

In [None]:
# Residual diagnostics for the log model: residuals are scaled by their own
# standard deviation (np.std, i.e. ddof = 0) and plotted against the fit.
y_pred = model.predict(X_log)
residuals = y_log - y_pred
standardized_residuals = residuals / np.std(residuals)
plt.figure(figsize = (8, 6))
plt.scatter(y_pred, standardized_residuals, alpha = 0.5)
# Reference lines: zero (red) and the +/-2 band (green).
plt.axhline(y = 0, color='r', linestyle = '--', linewidth = 2)
plt.axhline(y = 2, color='g', linestyle = '--', linewidth = 1)
plt.axhline(y = -2, color='g', linestyle = '--', linewidth = 1)
plt.title('Standardized vs. Fitted values')
# The response of this model is Log_EvicTotal, so the fitted values are on the
# log scale; the axis label names that variable rather than raw EvicTotal.
plt.xlabel('Fitted Values (Predicted Log_EvicTotal)')
plt.ylabel('Standardized residuals')
plt.show()