In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import warnings
import statsmodels.api as regression

In [None]:
# Read the eviction dataset and keep only the two variables under study.
data = pd.read_csv('EvictionMemphis2020.csv')
variables = ['EvicTotal', 'higherEduc']
data_selected = data[variables]

# Retain only the rows in which both variables are strictly positive.
both_positive = data_selected['EvicTotal'].gt(0) & data_selected['higherEduc'].gt(0)
data_cleaned = data_selected.loc[both_positive]

# Report how many observations survive the filter.
print('# of observations (original):', len(data))
print('# of observations (cleaned):', len(data_cleaned))

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the untransformed
# variables, printed under a one-line heading.
descriptive_stats = data_cleaned.describe()
print("Descriptive statistics for the selected variables:",
      descriptive_stats,
      sep="\n")

In [None]:
# Per-column skewness of the untransformed variables, printed under a
# one-line heading.
skewness_values = data_cleaned.skew()
print("Skewness of the selected variables:",
      skewness_values,
      sep="\n")

In [None]:
def corrfunc(x, y, **kwargs):
    """Write the Pearson correlation of ``x`` and ``y`` onto the current axes.

    The coefficient is rounded to two decimals and drawn in red at
    (0.6, 0.1) in the axes' own coordinate system. Extra keyword arguments
    are accepted so the function can be used as a plotting callback, but
    they are not used.
    """
    r_value, _ = pearsonr(x, y)
    label = f'r = {np.round(r_value, 2)}'
    current_ax = plt.gca()
    current_ax.annotate(label, xy=(0.6, 0.1), xycoords=current_ax.transAxes,
                        fontsize=12, color='red')
# Pair grid of the untransformed variables:
#   upper triangle - regression scatter plus the Pearson r from corrfunc
#   diagonal       - histograms
#   lower triangle - kernel density plots
upper_scatter_style = {'alpha': 0.3}
g = sns.PairGrid(data_cleaned, diag_sharey=False)
g.map_upper(sns.regplot, scatter_kws=upper_scatter_style)
g.map_upper(corrfunc)
g.map_diag(sns.histplot, kde=False)
g.map_lower(sns.kdeplot, cmap='Spectral_r')
plt.suptitle('Scatterplot for the variables', y=1.1)
plt.show()

---

In [None]:
# Natural-log transform of both variables; the columns are renamed so the
# transformed scale is visible in every later table and plot.
log_column_names = {
    'EvicTotal': 'Log_EvicTotal',
    'higherEduc': 'Log_higherEduc',
}
data_log = np.log(data_cleaned).rename(columns=log_column_names)

In [None]:
# Summary statistics of the log-transformed variables, printed under a
# one-line heading.
descriptive_stats = data_log.describe()
print("Descriptive statistics for log variables:",
      descriptive_stats,
      sep="\n")

In [None]:
# Per-column skewness of the log-transformed variables, printed under a
# one-line heading.
skewness_values = data_log.skew()
print("Skewness of the log variables:",
      skewness_values,
      sep="\n")

In [None]:
# Pair grid of the log-transformed variables, laid out like the one drawn
# for the raw variables:
#   upper triangle - regression scatter plus the Pearson r from corrfunc
#   diagonal       - histograms
#   lower triangle - kernel density plots
# corrfunc is reused from the cell where it is first defined; this cell used
# to carry an identical second copy that only shadowed the first.
g = sns.PairGrid(data_log, diag_sharey=False)
g.map_upper(sns.regplot, scatter_kws={'alpha': 0.3})
g.map_upper(corrfunc)
g.map_diag(sns.histplot, kde=False)
g.map_lower(sns.kdeplot, cmap='Spectral_r')
plt.suptitle('Scatterplot for the log variables', y=1.1)
plt.show()

---

In [None]:
from IPython.display import HTML

# Red-highlighted note stating which form of the variables the remaining
# cells use. The HTML object is the cell's final expression, so it is
# rendered as the cell output.
transform_note = '<span style="color:red;">From here on out I use both variables in their log-transformed state since the skewness was closer to zero for both.</span>'
HTML(transform_note)



In [None]:
# OLS regression of log evictions (response) on log higher education
# (predictor).
# statsmodels.api is imported in this notebook under the alias `regression`,
# so the intercept column has to be added through that alias; calling
# `sm.add_constant` raised NameError because no name `sm` is ever defined.
X_log = data_log['Log_higherEduc']
y_log = data_log['Log_EvicTotal']
X_log_with_const = regression.add_constant(X_log)  # adds the intercept term
model = regression.OLS(y_log, X_log_with_const).fit()
print(model.summary())

In [None]:
# Scatter of the log-transformed variables with a fitted regression line.
point_style = {'alpha': 0.5}
fit_line_style = {'color': 'blue'}
plt.figure(figsize=(8, 6))
sns.regplot(data=data_log, x='Log_higherEduc', y='Log_EvicTotal',
            scatter_kws=point_style, line_kws=fit_line_style)
plt.title('Linear Regression: Log_EvicTotal vs Log_higherEduc')
plt.show()

# Residual diagnostics for the log-log regression: residuals scaled by their
# standard deviation, plotted against the fitted values.
# The model is refit here so this part does not rely on variables left over
# from another cell. statsmodels.api is imported under the alias
# `regression`, so the intercept column is added through that alias; the
# previous `sm.add_constant` call raised NameError because `sm` is undefined.
X_log = data_log['Log_higherEduc']
y_log = data_log['Log_EvicTotal']
X_log_with_const = regression.add_constant(X_log)  # adds the intercept term
model = regression.OLS(y_log, X_log_with_const).fit()
y_pred = model.predict(X_log_with_const)
residuals = y_log - y_pred
# Simple rescaling: each residual divided by the residuals' overall np.std.
standardized_residuals = residuals / np.std(residuals)

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, standardized_residuals, alpha=0.5)
# Red reference line at zero; green guide lines at +/-2 scaled units.
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.axhline(y=2, color='g', linestyle='--', linewidth=1)
plt.axhline(y=-2, color='g', linestyle='--', linewidth=1)
plt.title('Standardized vs. Fitted values')
plt.xlabel('Fitted Values (Predicted Log_EvicTotal)')
plt.ylabel('Standardized residuals')
plt.show()