forked from JanetMatsen/bacteriopop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_selection_utils.py
64 lines (60 loc) · 2.29 KB
/
feature_selection_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr
from sklearn.decomposition import PCA
def calculate_features_target_correlation(data, features, target, method):
    """
    Compute the correlation of each feature column with the target column,
    show the coefficients as a bar plot, and return them.

    :type data: pandas dataframe
    :param data: data set to be analysed
    :type features: list of strings
    :param features: names of the numeric columns of data that are
        correlated with the target
    :type target: string
    :param target: name of the column of data that is treated as the
        output for prediction
    :type method: string
    :param method: how the coefficients are computed; either "Pearson"
        (scipy.stats.pearsonr) or "Cross-cor" (numpy.correlate)
    :return cor: a list with one scalar correlation coefficient per entry
        of features, in the same order
    :raises ValueError: if method is neither "Pearson" nor "Cross-cor"
    """
    # Reject unknown methods up front instead of silently plotting and
    # returning an empty result.
    if method not in ('Pearson', 'Cross-cor'):
        raise ValueError(
            'method must be "Pearson" or "Cross-cor", got %r' % (method,))

    # target values are assigned to y
    y = data[target]

    if method == 'Pearson':
        # pearsonr returns (coefficient, p-value); keep the coefficient only
        cor = [pearsonr(data[f], y)[0] for f in features]
    else:
        # np.correlate in its default mode returns a one-element array for
        # inputs of equal length; unwrap it so cor holds plain scalars just
        # like the Pearson branch.
        cor = [np.correlate(data[f], y)[0] for f in features]

    # Bar plot of correlation coefficients
    ind = np.arange(len(cor))
    width = 0.6  # the width of the bars
    _, ax = plt.subplots()
    # Centre the bars on their x positions explicitly so that the tick
    # labels below line up with them regardless of the library default.
    ax.bar(ind, cor, width, color='y', align='center')
    ax.set_ylabel('Correlation coefficients')
    ax.set_title('correlation coefficient for each feature vector')
    ax.set_xticks(ind)
    ax.set_xticklabels(features, rotation='vertical')
    plt.show()
    return cor
def pca_bacteria(data, n_components):
    """
    Fit a Principal Components Analysis to the data in a pandas dataframe
    and return the fitted model.

    :type data: pandas dataframe
    :param data: data set to be analysed
    :type n_components: int
    :param n_components: the number of desired principal components; both
        built-in and numpy integers are accepted
    :return pca: the PCA object after it has been fit to data
    :raises AssertionError: if n_components is not an integer
    """
    # numpy integer scalars (e.g. np.int64 produced by numpy arithmetic) are
    # generally not instances of the built-in int, so accept them explicitly.
    assert isinstance(n_components, (int, np.integer)), (
        'n_components must be an integer, got %r' % (n_components,))
    pca = PCA(n_components)
    pca.fit(data)
    return pca