<a href="https://colab.research.google.com/github/sandipanpaul21/Dimensionality-Reduction-in-Python/blob/master/Factor_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Factor Analysis

# Factor Analysis is a method which works in an unsupervised setup 
# It forms groups of features by computing the relationship between the features. 
# It is commonly used to reduce features 
# Factor analysis can only be used to reduce continuous variables of the dataset
# Therefore, we will be removing categorical variables. 
# Again, like principal component analysis, this is also an unsupervised learning algorithm 
# and hence we will be removing the dependent variable from our dataset.

In [21]:
# Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets
#!pip install factor_analyzer
from factor_analyzer import FactorAnalyzer

In [13]:
# Load the Datasets 

# Boston Dataset
boston = datasets.load_boston()
boston_pd = pd.DataFrame(boston.data)
boston_pd.columns = boston.feature_names
boston_pd["MEDV"] = boston.target
boston_pd.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [14]:
# Removing Response Variable

Factor1 = boston_pd[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 
                    'TAX','PTRATIO', 'B', 'LSTAT']]
Factor1.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [15]:
# Creating Correlation Matrix for the above dataset

# To perform factor analysis we first create a correlation matrix using the above dataset.
# We can also manually analyse this matrix as this will give us an idea of the variables 
# that are highly correlated to each other.
corrm = Factor1.corr()
corrm

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
CRIM,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621
ZN,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995
INDUS,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038
CHAS,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929
NOX,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879
RM,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808
AGE,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339
DIS,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996
RAD,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676
TAX,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993


In [16]:
# Finding Eigen Values

# We will now find the eigenvalues to decide the number of factors 
# that will correctly group the features on the level of their similarity 
# allowing us to manually select features from each of these groups.
eigen_values = np.linalg.eigvals(corrm)
eigen_values_cumvar = (eigen_values/12).cumsum()
eigen_values_cumvar

# Inference : 
# Clearly, the four factors explain approximately 80% of the variance. 
# Therefore, the number of factors will be equal to 4 in our case.

array([0.51057074, 0.63001033, 0.73356172, 0.80502631, 0.87459431,
       0.92937824, 0.97399124, 1.00699935, 1.01229179, 1.0353704 ,
       1.04947898, 1.06498018, 1.08333333])

In [27]:
# Using Factor Analyzer to perform factor analysis

# Compute the factor loadings to group the variables based on their correlation values
Factor_Analysis = FactorAnalyzer(n_factors=4,rotation='varimax',method='ml')
Factor_Analysis.fit(Factor1)

# Here, we have used rotation equal to varimax to get maximum variance 
# and the method deployed for factor analysis is maximum likelihood.
# R and Python use methods – maximum likelihood or minres. 

FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
               method='ml', n_factors=4, rotation='varimax', rotation_kwargs={},
               use_smc=True)

In [36]:
# Compute Factor Loading

loadings = Factor_Analysis.loadings_
Variable_Names =['crim','zn','indus','chas','nox','rm','age','dis','rad','tax','ptratio','black','lstat'] 
Matrix = pd.DataFrame(loadings,index = Variable_Names,
                      columns = ['Factor_1','Factor_2','Factor_3','Factor_4'])
Matrix

# Select variables for each of these groups which will help us decreasing the features 
# and decrease the chances of multicollinearity.

Unnamed: 0,Factor_1,Factor_2,Factor_3,Factor_4
crim,0.585458,0.14689,0.14951,0.12124
zn,-0.069552,-0.750364,-0.143407,-0.352882
indus,0.552113,0.573599,0.302909,0.067669
chas,-0.01689,0.150731,-0.114095,-0.151607
nox,0.567399,0.659297,0.235815,-0.166511
rm,-0.1208,-0.123509,-0.758943,-0.23194
age,0.351947,0.715909,0.223779,-0.031676
dis,-0.366249,-0.833285,-0.077686,0.003268
rad,0.897665,0.196578,0.031521,0.243988
tax,0.90812,0.231359,0.139312,0.185353


In [40]:
# We can also compute the proportional variance and cumulative variance of the 4-factor solution.

pd.DataFrame(Factor_Analysis.get_factor_variance(),
             index = ['SS Loading','Population Variance','Cummulative Variance'],
             columns = ['Factor_1','Factor_2','Factor_3','Factor_4'])

Unnamed: 0,Factor_1,Factor_2,Factor_3,Factor_4
SS Loading,3.32616,2.856879,1.33928,0.80753
Population Variance,0.255858,0.21976,0.103022,0.062118
Cummulative Variance,0.255858,0.475618,0.57864,0.640758
