In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the census CSV into a DataFrame and show which columns it provides.
data = pd.read_csv('50statescensus2020.csv')
print(data.columns)

# Columns summarised and plotted by the descriptive cells that follow.
variables = [
    'HH_MEDIAN',
    'BUYINGPOWER10_6',
    'POPULATION',
    'WHITE',
    'HISPANIC',
]
print(variables)

In [None]:
# Summary statistics for the selected columns, rounded to two decimals.
descriptive_stats = data[variables].describe()
print(descriptive_stats.round(2))

# One histogram with a KDE overlay per column, each on its own figure.
for col in variables:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[col], bins=10, kde=True, edgecolor='black')
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    # Plain tick labels on the x-axis instead of scientific notation.
    plt.ticklabel_format(style='plain', axis='x')

In [None]:
# One vertical box plot per column, each on its own narrow figure.
for col in variables:
    plt.figure(figsize=(3, 6))
    sns.boxplot(data=data, y=col)
    plt.title(f'Box Plot of {col}')
    # The values run along the y-axis (y=col), so the column name labels that
    # axis; the x-axis carries no variable in a single-box plot.
    plt.ylabel(col)
    # Plain tick labels on the value axis instead of scientific notation.
    plt.ticklabel_format(style='plain', axis='y')

In [None]:
from scipy.stats import pearsonr

In [None]:
# Pairwise correlation matrix (DataFrame.corr defaults) for three columns.
selected_vars = ['HH_MEDIAN', 'BUYINGPOWER10_6', 'POPULATION']
corr_mat = data.loc[:, selected_vars].corr()
print('\nCorrelation Matrix:', corr_mat, sep='\n')

In [None]:
def corrfunc(x, y, **kwargs):
    """Annotate the current axes with the Pearson correlation of ``x`` and ``y``.

    Intended as a plotting callback (it is passed to ``PairGrid.map_upper`` in
    this notebook): it draws on whatever axes ``plt.gca()`` returns and writes
    ``r = <value>`` in green near the lower-right corner.

    Only pairs where both values are finite enter the correlation, so the
    annotation matches the pairwise-complete behaviour of ``DataFrame.corr()``
    rather than collapsing to NaN when a single observation is missing.

    Parameters
    ----------
    x, y : array-like of numbers
        Paired observations of equal length.
    **kwargs
        Ignored; accepted so a grid mapper can forward extra keyword
        arguments without raising ``TypeError``.

    Returns
    -------
    None
    """
    x_vals = np.asarray(x, dtype=float)
    y_vals = np.asarray(y, dtype=float)

    # Keep only observations that are usable in both series.
    valid = np.isfinite(x_vals) & np.isfinite(y_vals)

    if valid.sum() >= 2:
        stat, _ = pearsonr(x_vals[valid], y_vals[valid])
    else:
        # pearsonr raises for fewer than two points; report "no value" instead.
        stat = np.nan

    ax = plt.gca()
    # Axes-fraction coordinates keep the label in place whatever the data range.
    ax.annotate('r = {}'.format(np.round(stat, 2)), xy=(0.6, 0.1),
                xycoords=ax.transAxes, fontsize=12, color='Green')
# Pair grid over the selected columns:
#   above the diagonal - regression scatter plus the r annotation from corrfunc
#   on the diagonal    - plain histograms (each with its own y scale)
#   below the diagonal - KDE contours
g = sns.PairGrid(data[selected_vars], diag_sharey=False)
g.map_upper(sns.regplot, scatter_kws={'alpha': 0.3})
g.map_upper(corrfunc)
g.map_diag(sns.histplot, kde=False)
g.map_lower(sns.kdeplot, cmap='Spectral_r')

# Title sits slightly above the grid (y > 1); save a tightly cropped copy.
plt.suptitle('Scatterplots with Correlation Coefficients', y=1.02)
plt.savefig('CorrMat.png', dpi=300, bbox_inches='tight')

In [None]:
from scipy import stats


In [None]:
# Normal Q-Q plot of the untransformed POPULATION column.
plt.figure(figsize=(10, 10))
stats.probplot(data['POPULATION'], dist='norm', plot=plt)
plt.title('Q-Q plot of original POPULATION')

In [None]:
# Add the natural log of POPULATION as a new column, then check it against
# a normal distribution with a Q-Q plot.
data['Log_POPULATION'] = np.log(data['POPULATION'])

plt.figure(figsize=(10, 10))
stats.probplot(data['Log_POPULATION'], dist='norm', plot=plt)
plt.title('Q-Q plot of log-transformed POPULATION')

In [None]:
# Side-by-side Q-Q plots: original POPULATION (left) vs. its log (right).
plt.figure(figsize=(20, 10))

plt.subplot(121)
stats.probplot(data['POPULATION'], dist='norm', plot=plt)
plt.title('Q-Q plot of original POPULATION')

plt.subplot(122)
stats.probplot(data['Log_POPULATION'], dist='norm', plot=plt)
plt.title('Q-Q plot of log-transformed POPULATION')

# Save a tightly cropped copy of the two-panel figure.
plt.savefig('QQ.png', dpi=300, bbox_inches='tight')

In [None]:
# Pair grid comparing HH_MEDIAN with POPULATION before and after the log.
# NOTE: this rebinds `variables`, the list the histogram and box-plot loops
# iterate over; re-running those cells after this one plots these columns.
variables = ['HH_MEDIAN', 'POPULATION', 'Log_POPULATION']
g = sns.PairGrid(data[variables], diag_sharey=False)
g.map_upper(sns.regplot, scatter_kws={'alpha': 0.3})  # scatter + fitted line
g.map_upper(corrfunc)                                 # r annotation
g.map_diag(sns.histplot, kde=False)                   # histograms
g.map_lower(sns.kdeplot, cmap='Spectral_r')           # KDE contours
plt.suptitle('Scatterplots with Correlation Coefficients', y=1.02)

In [None]:
# Add the natural log of HH_MEDIAN as a new column, then check it against
# a normal distribution with a Q-Q plot.
data['Log_HH_MEDIAN'] = np.log(data['HH_MEDIAN'])

plt.figure(figsize=(10, 10))
stats.probplot(data['Log_HH_MEDIAN'], dist='norm', plot=plt)
plt.title('Q-Q plot of log-transformed HH_MEDIAN')

In [None]:
# Pair grid of the two log-transformed columns against each other.
# NOTE: this rebinds `variables`, the list the histogram and box-plot loops
# iterate over; re-running those cells after this one plots these columns.
variables = ['Log_HH_MEDIAN', 'Log_POPULATION']
g = sns.PairGrid(data[variables], diag_sharey=False)
g.map_upper(sns.regplot, scatter_kws={'alpha': 0.3})  # scatter + fitted line
g.map_upper(corrfunc)                                 # r annotation
g.map_diag(sns.histplot, kde=False)                   # histograms
g.map_lower(sns.kdeplot, cmap='Spectral_r')           # KDE contours
plt.suptitle('Scatterplots with Correlation Coefficients', y=1.02)