## 1.1 数据探查 样例代码

### 导入需要的库

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import kurtosis, skew

### 1.1.1 单变量探查

In [3]:
# 样例数据
data = {'value': [10, 20, 30, 1000, 40, 50]}
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,value
0,10
1,20
2,30
3,1000
4,40


#### 极值

In [5]:
min_value = df['value'].min()
max_value = df['value'].max()

print(f"最小值: {min_value}, 最大值: {max_value}")

最小值: 10, 最大值: 1000


#### 分位数

In [11]:
quantiles = df['value'].quantile([0.25, 0.5, 0.75])
print(quantiles)

0.25    22.5
0.50    35.0
0.75    47.5
Name: value, dtype: float64


#### 峰度偏度

In [10]:
data_kurtosis = kurtosis(df['value'])
data_skewness = skew(df['value'])
print(f"峰度: {data_kurtosis}, 偏度: {data_skewness}")

峰度: 1.190837176139265, 偏度: 1.7837298139192148


### 1.1.2 多变量探查

### 有监督分析

#### logit回归

In [33]:
import statsmodels.api as sm

In [34]:
# 假设样本数据包含一个二分类的因变量和两个自变量
data = pd.DataFrame({
    'y': [0, 1, 0, 1, 1, 0, 1],
    'x1': [1, 2, 3, 2, 3, 1, 4],
    'x2': [5, 3, 6, 4, 7, 2, 8]
})

# 定义因变量和自变量
y = data['y']
X = data[['x1', 'x2']]
X = sm.add_constant(X) # 添加常数项

# 构建Logit模型并拟合数据
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# 输出结果
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.483716
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                    7
Model:                          Logit   Df Residuals:                        4
Method:                           MLE   Df Model:                            2
Date:                Mon, 25 Dec 2023   Pseudo R-squ.:                  0.2917
Time:                        12:31:25   Log-Likelihood:                -3.3860
converged:                       True   LL-Null:                       -4.7804
Covariance Type:            nonrobust   LLR p-value:                    0.2480
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.1173      2.687     -0.788      0.431      -7.383       3.149
x1             2.8202      2.

#### Logistic 回归

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
# 使用相同的样本数据

# 构建Logistic回归模型并拟合数据
logistic_model = LogisticRegression()
logistic_model.fit(X, y)

# 输出模型系数
print(logistic_model.coef_)

[[-1.38289836e-05  6.65549604e-01  3.08989913e-02]]


#### 方差分析

In [37]:
from scipy import stats

In [38]:
# 假设样本数据包含三组不同的数据
group1 = [20, 21, 19, 20, 22]
group2 = [28, 30, 29, 29, 27]
group3 = [18, 19, 17, 20, 21]

# 进行ANOVA分析
f_value, p_value = stats.f_oneway(group1, group2, group3)
print(f'F-value: {f_value}, P-value: {p_value}')

F-value: 79.09803921568614, P-value: 1.228542491020107e-07


### 无监督分析

#### k-means

In [13]:
from sklearn.cluster import KMeans

In [17]:
# 示例数据
X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])

# 应用 K-Means 算法
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

# 输出聚类结果
print("聚类结果：", kmeans.labels_)
# 输出中心点
print("聚类中心点：",kmeans.cluster_centers_)

聚类结果： [1 1 1 0 0 0]
聚类中心点： [[10.  2.]
 [ 1.  2.]]


  super()._check_params_vs_input(X, default_n_init=10)


#### DBSCAN

In [18]:
from sklearn.cluster import DBSCAN

In [20]:
# 示例数据
X = np.array([[1, 2], [2, 2], [2, 3],
              [8, 7], [8, 8], [25, 80]])

# 应用 DBSCAN 算法
dbscan = DBSCAN(eps=3, min_samples=2).fit(X)

# 输出聚类结果
print("聚类结果：",dbscan.labels_)
# 其中-1代表异常值

聚类结果： [ 0  0  0  1  1 -1]


#### 谱聚类

In [21]:
from sklearn.cluster import SpectralClustering

In [32]:
# 示例数据
X = np.array([[1, 1], [2, 1], [1, 0],
              [4, 7], [3, 5], [3, 6]])

# 应用谱聚类算法
clustering = SpectralClustering(n_clusters=2,
        assign_labels='discretize',
        random_state=0).fit(X)

# 输出结果
print(clustering.labels_)

[1 1 1 0 0 0]


#### 因子分析

In [27]:
from sklearn.decomposition import FactorAnalysis

In [28]:
# 示例数据
X = np.array([[0.70, 0.1], [0.3, 0.2], [0.3, 0.7],
              [0.7, 0.4], [0.3, 0.8], [0.9, 0.5]])

# 应用因子分析
factor = FactorAnalysis(n_components=1).fit(X)

# 输出因子载荷
print(factor.components_)

[[-0.14273821  0.14704951]]
