In [None]:
# RAVEN: A quadratic alternative to PCA

import pandas as pd
# Read the training table, then drop the 'sample_id' and 'Unnamed: 0' columns.
data = pd.read_csv('train10000.csv')
data = data.drop(columns=['sample_id', 'Unnamed: 0'])

# The first 556 remaining columns are the attributes; the column right after
# them (position 556) is the value the regressors below are fitted to predict.
n_features = 556
x = data.iloc[:, :n_features]
y = data.iloc[:, n_features]

In [None]:
#generate pairs of attributes

from itertools import combinations
# Every unordered pair of attribute names, in column order.
attributes = x.columns.tolist()
pairs = [*combinations(attributes, 2)]

In [None]:
# calculate r squared for each pair based on a sample

from scipy import stats

sample_size = 100

# Seed the row sample so the r^2 estimates (and the graph built from them) are
# identical on every Restart & Run All; cap the size so a short table does not
# make DataFrame.sample raise.
x_sample = x.sample(min(sample_size, len(x)), random_state=42)

# Maps "<attr_a> <attr_b>" -> r^2 of a simple linear regression of b on a,
# estimated on the sampled rows only.
# NOTE(review): the key is later split on whitespace by make_graph, so this
# format assumes attribute names contain no spaces.
r_squared = {}

# Pairs whose regression could not be computed, kept for inspection.
skipped_pairs = []

for xreg, yreg in pairs:
    try:
        reg = stats.linregress(x_sample[xreg], x_sample[yreg])
    except Exception:
        # Best effort: a pair that cannot be regressed is left out of
        # r_squared. Catching Exception (not a bare except) lets
        # KeyboardInterrupt stop this long loop.
        skipped_pairs.append((xreg, yreg))
        continue
    r_squared[xreg + " " + yreg] = reg.rvalue**2

if skipped_pairs:
    print(f"Skipped {len(skipped_pairs)} of {len(pairs)} pairs (regression failed)")

In [None]:
# create a graph with the r squared values, and normalize weights

import networkx as nx

def transform_weight(weight, min_old, max_old, min_new, max_new):
    """Linearly rescale ``weight`` from one interval onto another.

    ``min_old`` maps to ``min_new`` and ``max_old`` maps to ``max_new``;
    values outside ``[min_old, max_old]`` are extrapolated, not clipped.

    Raises:
        ZeroDivisionError: if ``max_old == min_old``.
    """
    old_span = max_old - min_old
    new_span = max_new - min_new
    # Relative position of `weight` inside the old interval (0 at min_old, 1 at max_old).
    fraction = (weight - min_old) / old_span
    return fraction * new_span + min_new

def make_graph(edges, threshold=0.95):
    """Build an undirected graph linking attribute pairs with r^2 above ``threshold``.

    Args:
        edges: mapping of ``"<attr_a> <attr_b>"`` (two names separated by
            whitespace) to the pair's r^2 value.
        threshold: only pairs with ``value > threshold`` become edges.
            Expected to be below 1, since the kept values are rescaled from
            ``[threshold, 1]``.

    Returns:
        A ``networkx.Graph`` whose edges carry two attributes:
        ``r_squared`` (the raw value) and ``weight`` (the value rescaled
        linearly from ``[threshold, 1]`` onto ``[0.5, 1]``). A larger weight
        means a stronger correlation, i.e. ``weight`` is a similarity and not
        a distance.

    Raises:
        ValueError: if a key does not split into exactly two names.
    """
    G = nx.Graph()
    for key, value in edges.items():
        # NaN values fail this comparison and are therefore dropped as well.
        if value > threshold:
            u, v = key.split()
            # Kept values lie in (threshold, 1], so that is the source range
            # to rescale from. The previous source range [1, 1/threshold] did
            # not contain them, which pushed the weights below the intended
            # [0.5, 1] interval.
            weight = transform_weight(value, threshold, 1, 0.5, 1)
            # add_edge creates missing endpoints, so nodes need no separate insert.
            G.add_edge(u, v, weight=weight, r_squared=value)

    return G

G = make_graph(r_squared)

In [None]:
# For each connected component, calculate the closeness centre, visualise

from matplotlib import pyplot as plt
import math
import networkx as nx

# One representative ("closeness centre") is chosen per connected component of G.
connected_components = list(nx.connected_components(G))
essential_attrs = []

num_components = len(connected_components)
numcols = 3
# Keep at least one row so plt.subplots stays valid when G has no components.
numrows = max(1, math.ceil(num_components / numcols))

# squeeze=False always returns a 2-D array of axes, so flatten() is safe for any grid shape.
fig, axes = plt.subplots(numrows, numcols, figsize=(numcols * 4, numrows * 4), squeeze=False)
axes = axes.flatten()

for i, component in enumerate(connected_components):
    subgraph = G.subgraph(component)
    # NOTE(review): no `distance=` argument is passed, so centrality is based on
    # hop counts only and the edge weights stored on G are not used here.
    closeness_centrality = nx.closeness_centrality(subgraph)
    # Iterate candidates in sorted order so ties go to the alphabetically first
    # node instead of depending on the subgraph's node iteration order.
    closeness_center = max(sorted(closeness_centrality), key=closeness_centrality.get)
    essential_attrs.append(closeness_center)

    colors = ['red' if node == closeness_center else 'blue' for node in subgraph.nodes()]
    nx.draw(subgraph, ax=axes[i], with_labels=True, node_color=colors, font_size=6, node_size=15)
    axes[i].set_title(f"Component {i+1} \n closeness center: {closeness_center}")

# Hide the grid cells that did not receive a component.
for unused_ax in axes[num_components:]:
    unused_ax.axis('off')

# Lay out after drawing so the subplot titles are taken into account.
fig.tight_layout(pad=4.0)

plt.show()


In [None]:
# train model without removing any attributes

# Ordered 80/20 split with no shuffling: leading rows train, trailing rows test.
train_size = int(0.8 * len(x))

x_train, x_test = x.iloc[:train_size], x.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: a seeded random forest fitted on the current x_train / y_train.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
).fit(x_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(x_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error without reduction: ", mse)
print("R2 Score without reduction: ", r2)

In [None]:
# train model after removing redundant attributes

# Every node of G that was not chosen as a component centre is treated as redundant.
essential = essential_attrs
essential_lookup = set(essential)
redundant = [node for node in G.nodes if node not in essential_lookup]

# Share of the original attribute columns that is about to be dropped.
reduction_pct = len(redundant)/len(x.columns) * 100
print(f"{reduction_pct}% Reduction")

x1 = x.drop(columns=redundant)

In [None]:
# Same row boundary as before, now on the reduced table.
# This rebinds x_train / x_test; y_train / y_test are left untouched.
x_train, x_test = x1.iloc[:train_size], x1.iloc[train_size:]

In [318]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Same seeded random forest as the baseline, fitted on the current x_train / y_train.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
).fit(x_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(x_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("R2 Score: ", r2)

Mean Squared Error:  1.0242232975316345e-11
R2 Score:  0.9870473418114519
