In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import FactorAnalysis
import factor_analyzer as FA
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity

In [2]:
# Read the dataset from disk, using the first CSV column as the row index,
# then show the frame as the cell's output.
csv_path = "Data/data.csv"
data = pd.read_csv(csv_path, index_col=[0])
data

Unnamed: 0,state,county,area_sqmi,epl_pov,epl_unemp,epl_pci,epl_nohsdp,spl_theme1,rpl_theme1,epl_age65,...,f_minrty,f_limeng,f_theme3,f_munit,f_mobile,f_crowd,f_noveh,f_groupq,f_theme4,f_total
0,ALABAMA,Autauga,594.443459,0.5401,0.2745,0.2860,0.4397,1.5403,0.3631,0.1850,...,0,0,0,0,0,0,0,0,0,0
1,ALABAMA,Baldwin,1589.793007,0.2239,0.3121,0.2057,0.3209,1.0626,0.2232,0.6428,...,0,0,0,1,0,0,0,0,1,1
2,ALABAMA,Barbour,885.001636,0.9631,0.9217,0.9481,0.9701,3.8030,0.9780,0.4893,...,0,0,0,0,1,0,0,1,2,8
3,ALABAMA,Bibb,622.461089,0.4430,0.7895,0.8987,0.7351,2.8663,0.7694,0.3200,...,0,0,0,0,1,0,0,1,2,2
4,ALABAMA,Blount,644.830460,0.4723,0.2611,0.7561,0.8405,2.3300,0.6143,0.4715,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,WYOMING,Sweetwater,10426.975725,0.3127,0.4490,0.1497,0.2668,1.1782,0.2576,0.0293,...,0,0,0,0,0,0,0,0,0,1
3138,WYOMING,Teton,3996.844622,0.0522,0.0264,0.0051,0.0548,0.1385,0.0025,0.1334,...,0,1,1,0,0,1,0,0,1,2
3139,WYOMING,Uinta,2081.719807,0.3430,0.6032,0.4443,0.1347,1.5252,0.3583,0.0675,...,0,0,0,0,0,0,0,0,0,1
3140,WYOMING,Washakie,2238.672972,0.3360,0.6207,0.4035,0.3792,1.7394,0.4293,0.7380,...,0,0,0,0,0,0,0,0,0,0


## Bartlett's Test of Sphericity

One aspect of the data that we need to check is whether or not the data is suitable for a technique like factor analysis or principal component analysis (PCA). Thus we perform Bartlett's Test of Sphericity. This test compares the correlation matrix of our data to an identity matrix. If our variables are orthogonal to each other, i.e. uncorrelated, then factor analysis and PCA will be of no benefit to this data, since the data cannot be reduced further.

Our null hypothesis is that these variables are orthogonal (the correlation matrix is an identity matrix), while the alternative is that the variables are not orthogonal. If our p-value is less than a chosen $\alpha$, in this case 0.05, we can reject the null and conclude that dimension reduction techniques are an effective method for the data.

In [7]:
# Run Bartlett's test of sphericity on every column from position 2 onward
# (the displayed frame shows state and county in positions 0 and 1).
test_columns = data.iloc[:, 2:]
bartlett_result = calculate_bartlett_sphericity(test_columns)

# The helper returns the chi-squared statistic followed by its p-value.
chi2, p = bartlett_result
print("Our chi squared statistic is ", chi2)
print("Our p-value from our chi statistic is ", p)

Our chi squared statistic is  1164605.4395881242
Our p-value from our Chi statistic is  0.0


Since our p-value is far below $\alpha = 0.05$, we can conclude that this data is suitable for factor analysis. 

In [26]:
# Scree plot: fit an unrotated factor model and plot the eigenvalues of the
# correlation matrix against the factor number.

# Analyse only the columns from position 2 onward; the displayed frame shows
# state and county (identifiers) in positions 0 and 1.
features = data.iloc[:, 2:]

# Request one factor per analysed variable. Sizing this from data.shape[1]
# would also count the two identifier columns and ask for more factors than
# there are variables.
fa = FA.FactorAnalyzer(rotation=None, impute="drop", n_factors=features.shape[1])
fa.fit(features)

# get_eigenvalues() returns (original, common-factor) eigenvalues; the scree
# plot uses the original ones. Indexing avoids binding `_`, which IPython
# reserves for the previous cell output.
ev = fa.get_eigenvalues()[0]
print(ev.shape)

# Number the factors from the eigenvalue count itself so the x-axis always
# matches `ev`, whatever the column layout of `data` is.
px.scatter(
    x=range(1, len(ev) + 1),
    y=ev,
    labels={"x": "Factors", "y": "Eigenvalues"},
    title="Scree Plot for Factor Analysis",
    color_discrete_sequence=["#e377c2"],
)

(46,)
