In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

- Feature 0: entero
- Feature 1:  water_temp
- Feature 2: do
- Feature 3: ph
- Feature 4: chlorophyll
- Feature 5: density
- Feature 6: fecal
- Feature 7:  air_temp
- Feature 8:  humidity
- Feature 9: windspeed 
- Feature 10: cloud_cover
- Feature 11: solar_radiation

# Pearson Neural Net Coefficients

In [5]:
# Pearson coefficient of every feature (array position = feature number)
pearson = np.array([0.235, 0.401, 0.189, 0.194, 0.192, -0.390, 0.363, 0.733, 0.113, 0.376, 0.030, 0.380])
# feature indices ordered from the smallest coefficient to the largest
sorted_pearson = sorted(range(len(pearson)), key=pearson.__getitem__)
# rank of feature i = its 1-based position inside sorted_pearson; the argsort
# of a permutation is its inverse, which gives exactly that position
pearson_ranks = (np.argsort(sorted_pearson) + 1).tolist()
# print info
print("Salinity pearson array:", pearson, "Rankings:", pearson_ranks, sep="\n")

Salinity pearson array:
[ 0.235  0.401  0.189  0.194  0.192 -0.39   0.363  0.733  0.113  0.376
  0.03   0.38 ]
Rankings:
[7, 11, 4, 6, 5, 1, 8, 12, 3, 9, 2, 10]


# Pearson Pair-wise Features Coefficients

In [6]:
# read the raw dataset from disk
data_df = pd.read_csv('water_quality_and_weather.csv')
# keep only the feature columns (drop identifiers, timestamps and SALINITY)
data_df = data_df.drop(columns=['station', 'Date', 'Time', 'SALINITY'])
# one row/column per feature; the columns are addressed by position below, so
# num_vars must not exceed the number of columns left after the drop
num_vars = 12
corr_coef = np.zeros((num_vars, num_vars))
p_vals = np.zeros_like(corr_coef)
# pair-wise Pearson coefficient and p-value for every (i, j) column pair
for i, j in np.ndindex(num_vars, num_vars):
    corr_coef[i, j], p_vals[i, j] = pearsonr(data_df.iloc[:, i], data_df.iloc[:, j])
# a feature's correlation with itself carries no information: zero the diagonal
np.fill_diagonal(corr_coef, 0)
# print info
np.set_printoptions(precision=2, suppress=True)
print("Correlation Coefficient Matrix ")
print(corr_coef)
print("P-values Matrix")
print(p_vals)

Correlation Coefficient Matrix 
[[ 0.    0.03  0.11  0.09  0.13 -0.07  0.78 -0.05  0.04  0.03  0.05 -0.  ]
 [ 0.03  0.    0.51  0.57  0.05 -0.98  0.01  0.36  0.18  0.15  0.07  0.06]
 [ 0.11  0.51  0.    0.69  0.23 -0.54  0.09  0.06  0.12  0.17  0.1   0.06]
 [ 0.09  0.57  0.69  0.    0.25 -0.53  0.07  0.12  0.07  0.25  0.04  0.22]
 [ 0.13  0.05  0.23  0.25  0.   -0.02  0.1   0.08  0.09  0.07  0.03  0.08]
 [-0.07 -0.98 -0.54 -0.53 -0.02  0.   -0.06 -0.34 -0.18 -0.13 -0.05 -0.01]
 [ 0.78  0.01  0.09  0.07  0.1  -0.06  0.   -0.06  0.04  0.05  0.04  0.02]
 [-0.05  0.36  0.06  0.12  0.08 -0.34 -0.06  0.   -0.34  0.41 -0.24  0.51]
 [ 0.04  0.18  0.12  0.07  0.09 -0.18  0.04 -0.34  0.    0.08  0.48 -0.34]
 [ 0.03  0.15  0.17  0.25  0.07 -0.13  0.05  0.41  0.08  0.    0.03  0.5 ]
 [ 0.05  0.07  0.1   0.04  0.03 -0.05  0.04 -0.24  0.48  0.03  0.   -0.48]
 [-0.    0.06  0.06  0.22  0.08 -0.01  0.02  0.51 -0.34  0.5  -0.48  0.  ]]
P-values Matrix
[[0.   0.24 0.   0.   0.   0.   0.   0.04 0.08 0.21

In [7]:
# one score per feature: multiply each row of the pair-wise correlation matrix
# element-wise by the salinity rankings and sum the products
weighted_feaures = np.array([np.sum(row * pearson_ranks) for row in corr_coef])
# normalize so the scores sum to 1
normalized_featureWeights = weighted_feaures / weighted_feaures.sum()
# feature indices ordered from the smallest normalized score to the largest
sorted_featureWeights = sorted(
    range(len(normalized_featureWeights)),
    key=normalized_featureWeights.__getitem__,
)
# rank of feature i = its 1-based position inside sorted_featureWeights; the
# argsort of a permutation is its inverse, which gives exactly that position
featureWeights_ranks = (np.argsort(sorted_featureWeights) + 1).tolist()
# print info
np.set_printoptions(precision=2, suppress=True)
print("Weighted features array:", normalized_featureWeights, "Rankings:", featureWeights_ranks, sep="\n")

Weighted features array:
[ 0.11  0.16  0.2   0.23  0.1  -0.31  0.09  0.15 -0.03  0.2  -0.05  0.15]
Rankings:
[6, 9, 11, 12, 5, 1, 4, 8, 3, 10, 2, 7]


In [8]:
# Combine the salinity Pearson coefficients with the weighted pair-wise feature
# scores. alpha and beta set the split between the two (currently 50/50).
alpha = 0.5
beta = 0.5
# NOTE(review): weighted_feaures is the un-normalized score array (sums of
# correlation x rank), so it appears to sit on a larger scale than pearson and
# to dominate this sum -- confirm whether normalized_featureWeights was intended.
combined_weights = alpha * pearson + beta * weighted_feaures

# normalize so the combined weights sum to 1
normalize_combined = combined_weights / np.sum(combined_weights)
# Rank the *combined* weights (1 = smallest). The previous version ranked
# normalized_featureWeights here, so it only repeated the earlier rankings.
sorted_combined = sorted(range(len(normalize_combined)), key=lambda i: normalize_combined[i])
# rank of feature i = its 1-based position inside sorted_combined; the argsort
# of a permutation is its inverse, which gives exactly that position
combined_ranks = (np.argsort(sorted_combined) + 1).tolist()
# print info
np.set_printoptions(precision=2, suppress=True)
print("Combined weights before normalization: ")
print(combined_weights)
print("Combined weights after normalization: ")
print(normalize_combined)
print("Ranks:")
print(combined_ranks)

Combined weights before normalization: 
[  4.14   6.2    7.73   8.69   3.79 -11.75   3.72   6.11  -1.01   7.6
  -1.97   5.91]
Combined weights after normalization: 
[ 0.11  0.16  0.2   0.22  0.1  -0.3   0.09  0.16 -0.03  0.19 -0.05  0.15]
Ranks:
[6, 9, 11, 12, 5, 1, 4, 8, 3, 10, 2, 7]
