In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os

# os.chdir('Q:/data')
pd.set_option('display.max_columns', None)

### 对通信消费数据profile_telecom进行因子分析

In [2]:
profile = pd.read_csv("profile_telecom.csv")
profile.head()

Unnamed: 0,ID,cnt_call,cnt_msg,cnt_wei,cnt_web
0,1964627,46,90,36,31
1,3107769,53,2,0,2
2,3686296,28,24,5,8
3,3961002,9,2,0,4
4,4174839,145,2,0,1


In [3]:
data = profile.loc[ :, 'cnt_call':]
data.corr(method='pearson')

Unnamed: 0,cnt_call,cnt_msg,cnt_wei,cnt_web
cnt_call,1.0,0.052096,0.117832,0.11419
cnt_msg,0.052096,1.0,0.510686,0.739506
cnt_wei,0.117832,0.510686,1.0,0.950492
cnt_web,0.11419,0.739506,0.950492,1.0


对数据进行标准化

http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing

In [4]:
from sklearn.preprocessing import scale

data_scaled = scale(data)

telecom_pca = PCA(n_components=2, whiten=True).fit(data_scaled)
telecom_pca.explained_variance_ratio_

array([ 0.62510442,  0.24620209])

In [5]:
telecom_pca.components_

array([[ 0.11085805,  0.50974123,  0.57909319,  0.62651852],
       [ 0.99020127, -0.12736724, -0.01900236, -0.05401806]])

In [6]:
telecom_pca.transform(data_scaled)

array([[ 2.30183615, -0.89716788],
       [-0.74694076, -0.04035188],
       [-0.11502116, -0.59578592],
       ..., 
       [ 2.19333782,  0.04953167],
       [-0.07833419, -0.10944885],
       [-0.6621524 ,  1.16641367]])

### 因子分析

In [7]:
from fa_kit import FactorAnalysis
from fa_kit import plotting as fa_plotting

fa = FactorAnalysis.load_data_samples(
        data_scaled,
        preproc_demean=True,
        preproc_scale=True
        )
fa.extract_components()

In [8]:
fa.find_comps_to_retain(method='top_n',num_keep=2) # 保留2个因子
fa.rotate_components(method='varimax') # 采用最大方差旋转
# fa_plotting.graph_summary(fa) # 因子载荷绘图

fa_comps = pd.DataFrame(fa.comps["rot"])
fa_comps

Unnamed: 0,0,1
0,0.002284,0.996385
1,0.520586,-0.071056
2,0.577715,0.044221
3,0.628674,0.014582


In [9]:
fa_scores = fa.get_component_scores(data_scaled)

pd.DataFrame(fa_scores, columns=['phone_call', 'value_added'])

Unnamed: 0,phone_call,value_added
0,3.718276,-0.488759
1,-1.170693,-0.168666
2,-0.116459,-0.608050
3,-1.053884,-0.829247
4,-1.226677,1.216917
5,-1.102273,1.833129
6,1.228434,-0.496427
7,-1.143891,-0.051681
8,-0.229023,0.332303
9,-1.168964,0.585114
