In [1]:
import random
import pandas as pd
import numpy as np

# seed Python's RNG so the two shuffles below are reproducible
random.seed(99)

# two synthetic latent factors, built as ordered runs of constant values:
#   factor_1 -> 50 ones followed by 50 zeros
#   factor_2 -> 30 twos, 30 fours, then 40 sevens
factor_1 = np.repeat([1.0, 0.0], 50)
factor_2 = np.repeat([2.0, 4.0, 7.0], [30, 30, 40])

# shuffle each factor in place; factor_1 goes first so the RNG stream
# is consumed in a fixed order
for latent in (factor_1, factor_2):
    random.shuffle(latent)

# gather both factors as the columns of a single dataframe
factors = pd.DataFrame({'factor_1': factor_1, 'factor_2': factor_2})

factors

Unnamed: 0,factor_1,factor_2
0,0.0,7.0
1,1.0,4.0
2,0.0,2.0
3,1.0,2.0
4,1.0,7.0
...,...,...
95,1.0,7.0
96,0.0,2.0
97,1.0,2.0
98,1.0,4.0


In [2]:
# fix NumPy's global seed so the noise drawn below is reproducible
np.random.seed(99)

# every observed variable is a deterministic function of the two latent factors
variable_1 = 2 + factor_1
variable_2 = factor_1 * 4 + factor_2
variable_3 = factor_2 ** 2 / 10
variable_4 = 2 * factor_1 + factor_2 ** 3 / 50
variable_5 = (factor_2 - factor_1) ** 2 / 5

# stack the noiseless signals, one row per variable
column_names = ['variable_1', 'variable_2', 'variable_3', 'variable_4', 'variable_5']
signal = np.vstack((variable_1, variable_2, variable_3, variable_4, variable_5))

# Gaussian noise: one independent draw of 100 samples per variable
mu, sigma = 0, 1
noise = np.vstack([np.random.normal(mu, sigma, 100) for _ in range(5)])

# noisy observations as a dataframe, one column per variable
variables = pd.DataFrame((signal + noise).T, columns=column_names)

variables

Unnamed: 0,variable_1,variable_2,variable_3,variable_4,variable_5
0,1.857641,5.913115,6.265634,7.577789,8.798739
1,5.057222,8.821163,0.619181,1.685170,0.780196
2,2.283262,1.212051,-0.176520,-1.223520,0.948517
3,4.329812,7.593812,0.585510,2.301480,-1.205349
4,2.845378,12.405904,4.504327,9.571892,7.343937
...,...,...,...,...,...
95,3.933959,10.050342,6.088059,9.612718,8.159540
96,3.552634,2.517564,-0.606062,-0.167863,0.399166
97,3.093623,7.293981,-0.877345,2.696739,1.670456
98,3.294826,8.760001,2.620386,2.702047,2.736106


In [3]:
from sklearn.decomposition import FactorAnalysis

# fit a two-component factor-analysis model on the observed variables
# and obtain the transformed data in a single step
fa_model = FactorAnalysis(n_components=2, random_state=0)
factor_scores = fa_model.fit_transform(variables)

# label the two score columns by wrapping the array in a dataframe
extracted_factors = pd.DataFrame(factor_scores, columns=['extracted_1', 'extracted_2'])

extracted_factors

Unnamed: 0,extracted_1,extracted_2
0,0.811316,-1.399923
1,-0.384761,1.674665
2,-1.508278,-0.751931
3,-0.682483,1.761205
4,1.474493,0.982128
...,...,...
95,1.374104,0.116531
96,-1.349820,-0.130894
97,-0.472519,0.971207
98,-0.024983,1.019687


In [4]:
# Join the original factors with the extracted ones column-wise so they can be
# compared side by side.
# This cell rebinds `factors`, so any extracted columns already present are
# dropped first; without that, executing the cell a second time would append
# duplicate extracted_* columns. On a first run nothing is dropped and the
# result is the plain concatenation.
factors = pd.concat(
    [factors.drop(columns=extracted_factors.columns, errors='ignore'), extracted_factors],
    axis=1,
)

factors

Unnamed: 0,factor_1,factor_2,extracted_1,extracted_2
0,0.0,7.0,0.811316,-1.399923
1,1.0,4.0,-0.384761,1.674665
2,0.0,2.0,-1.508278,-0.751931
3,1.0,2.0,-0.682483,1.761205
4,1.0,7.0,1.474493,0.982128
...,...,...,...,...
95,1.0,7.0,1.374104,0.116531
96,0.0,2.0,-1.349820,-0.130894
97,1.0,2.0,-0.472519,0.971207
98,1.0,4.0,-0.024983,1.019687


In [5]:
# pairwise Pearson correlations between every column of `factors`
# (original and extracted factors alike)
factor_correlations = factors.corr(method='pearson')

factor_correlations

Unnamed: 0,factor_1,factor_2,extracted_1,extracted_2
factor_1,1.0,0.123391,0.3024857,0.8514946
factor_2,0.123391,1.0,0.9593489,-0.1728431
extracted_1,0.302486,0.959349,1.0,1.439787e-14
extracted_2,0.851495,-0.172843,1.439787e-14,1.0
