In [2]:
# RAVEN: A quadratic alternative to PCA

import pandas as pd
import numpy as np
data = pd.read_csv('train1000.csv').drop(columns=['sample_id', 'Unnamed: 0'])
x = data.iloc[:, :556] 
y = data.iloc[:, 556]

In [3]:
#generate pairs of attributes

from itertools import combinations
attributes = x.columns.tolist()
pairs = list(combinations(attributes,2))

In [4]:
sample_size = 100
x_sample_np = x.loc[np.random.choice(x.index, sample_size, False)].to_numpy()
r_squared = {}

for first, second in pairs:
    first_i = x.columns.get_loc(first)
    second_i = x.columns.get_loc(second)
    cov_mat = np.cov(x_sample_np[:, first_i], x_sample_np[:, second_i])
    r_squared[first + " " + second] = cov_mat[0, 1]**2 / cov_mat[0, 0] / cov_mat[1, 1] if all(cov_mat[i, i] != 0 for i in range(2)) else 0

In [61]:
# # calculate r squared for each pair based on a sample

# sample_size = 100
# import numpy as np

# sample_indices = np.random.choice(x.index, size=sample_size, replace=False)
# x_sample_np = x.loc[sample_indices].to_numpy()
# r_squared = {}

# for i, (feature1, feature2) in enumerate(pairs):
#     feature1_index = x.columns.get_loc(feature1)
#     feature2_index = x.columns.get_loc(feature2)
#     cov_matrix = np.cov(x_sample_np[:, feature1_index], x_sample_np[:, feature2_index])
#     cov = cov_matrix[0, 1]
#     r_squared[feature1 + " " + feature2] = (cov**2) / cov_matrix[0, 0] / cov_matrix[1, 1] if cov_matrix[0, 0] != 0 and cov_matrix[1, 1] != 0 else 0

# # for pair in pairs:
#     # try:
#     #     xreg = pair[0]
#     #     yreg = pair[1]

#     #     reg = stats.linregress(x_sample[xreg],x_sample[yreg])
#     #     r_squared[xreg + " " + yreg] = reg.rvalue**2
#     # except:
#         # continue
    



In [6]:
# create a graph with the r squared values, and normalize weights

import networkx as nx

def transform_weight(weight, min_old, max_old, min_new, max_new):
    return (weight - min_old) / (max_old - min_old) * (max_new - min_new) + min_new

def make_graph(edges, threshold=0.95):
    G = nx.Graph()
    for key, value in edges.items():
        if value > threshold:
            u, v = key.split()
            G.add_nodes_from([u,v])
            G.add_edge(u, v, weight=value)

    for u, v, w in G.edges(data=True):
        w['weight'] = transform_weight(w['weight'], 1, 1/threshold, 0.5, 1)

    return G

G = make_graph(r_squared)

# list all articulation poitns in the graph

# articulation_points = list(nx.articulation_points(G))

# print(articulation_points)

In [7]:
%%time
# For each connected component, calculate the closeness centre, visualise

# from matplotlib import pyplot as plt
import math

connected_components = list(nx.connected_components(G))
essential_attrs = []

# num_components = len(connected_components)
# numcols = 3 
# numrows = math.ceil(num_components / numcols)

# fig, axes = plt.subplots(numrows, numcols, figsize=(numcols * 4, numrows * 4))
# fig.tight_layout(pad=4.0)

# axes = axes.flatten()
for i, component in enumerate(connected_components):
    subgraph = G.subgraph(component)
    max_degree_node, _ = max(subgraph.degree(), key = lambda item: item[1])
    essential_attrs.append(max_degree_node)

    # colors = ['red' if node == closeness_center and node == max_degree_node else 
    #           'green' if node == closeness_center else 
    #           'pink' if node == max_degree_node else 
    #           'blue' for node in subgraph.nodes()]
#     nx.draw(subgraph, ax=axes[i], with_labels=True, node_color=colors, font_size=6, node_size=15)
#     axes[i].set_title(f"Component {i+1} \n closeness center: {closeness_center} \n max degree node: {max_degree_node}")

# for j in range(i + 1, len(axes)):
#     axes[j].axis('off')

# plt.show()


CPU times: total: 0 ns
Wall time: 8.8 ms


In [8]:
# train model without removing any attributes

train_size = int(len(x) * 0.8)

x_train = x.iloc[:train_size, :]
y_train = y.iloc[:train_size]
x_test = x.iloc[train_size:, :]
y_test = y.iloc[train_size:]

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse_without_reduction = mean_squared_error(y_test, y_pred)
r2_without_reduction = r2_score(y_test, y_pred)
print("Mean Squared Error without reduction: ", mse_without_reduction)
print("R2 Score without reduction: ", r2_without_reduction)

Mean Squared Error without reduction:  1.712628771132408e-11
R2 Score without reduction:  0.9795469803323228


In [10]:
# train model after removing redundant attributes

essential = essential_attrs
# essential = articulation_points
redundant = [node for node in G.nodes() if node not in essential]
print(f"{len(redundant)/len(x.columns) * 100}% Reduction")

x1 = x.drop(columns=redundant)

64.38848920863309% Reduction


In [11]:
# x1.to_csv('../KANS/train1000_raven.csv')
# x1.shape

In [12]:
x_train = x1.iloc[:train_size, :]
x_test = x1.iloc[train_size:, :]

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

model = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse_graph = mean_squared_error(y_test, y_pred)
r2_graph = r2_score(y_test, y_pred)
print("Mean Squared Error: ", mse_graph)
print("R2 Score: ", r2_graph)

Mean Squared Error:  1.6598681311190304e-11
R2 Score:  0.9801770727528533
