In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import math
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from scipy import stats
import itertools as it
import sklearn.metrics as metrics
import sklearn.model_selection as selection
import os
import json
import pickle as pic
import importlib
import sys
# Extend the module search path with the project's helper directories.
# The entries are relative, so they are resolved against the current working
# directory of the kernel.
sys.path.append("./../utils")
sys.path.append("./../sklearn")
# Project-local modules.
# NOTE(review): `ColumnTransformer` is imported as a top-level module here, not
# as the sklearn.compose class of the same name - presumably it lives in
# ../sklearn; confirm.
import util
import constants as const
import ColumnTransformer
# Reload the local modules so that edits made to them on disk are picked up
# when this cell is re-run, without restarting the kernel.
importlib.reload(util)
importlib.reload(const)
importlib.reload(ColumnTransformer)
# Notebook display settings: never truncate columns, rows or cell contents,
# and render floats with 6 decimal places.
# The option names are fully qualified on purpose: pandas matches short names
# as regular expressions, and the bare forms ('max_columns', 'max_rows',
# 'precision') become ambiguous - raising OptionError - on pandas versions
# that also define `styler.render.max_columns`, `styler.render.max_rows` and
# `styler.format.precision`.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.precision', 6)



# 1. Load Data

In [None]:
# Benchmark graphs used for training and evaluation
# (real-world, R-MAT and Barabasi-Albert).
combined_graphs_encoded = pd.read_csv("../data/combined.csv")

# Wiki graphs that are used to enrich the training data.
graphs_for_enrichment = pd.read_csv("../data/enrichment.csv")

# Distinct values of the `partitioner` column, in order of first appearance.
partitioners = list(combined_graphs_encoded["partitioner"].unique())
print(partitioners)


# 2. Train

In [None]:
# Short model tag; passed to util.train and used as the prefix of the result
# files written to ../models.
MODEL_NAME = "RFR"

# Prediction targets; the training loop below fits one model per entry.
TARGETS = [
    "vertex_balance",
    "destination_balance",
    "source_balance",
    "edge_balance",
    "replication_factor",
]
# Train one random-forest model per target and per feature set, then store the
# validation/test scores of every trained combination under ../models.
for TARGET in TARGETS:
    # Columns shared by both feature sets: the partitioner columns plus
    # cheap-to-compute graph properties.
    base_mode = [
        *partitioners,
        "num_partitions",
        "pearson_mode_degrees_in",
        "pearson_mode_degrees_out",
        "mean_degree",
        "density",
    ]
    # Each feature set is wrapped in an outer list before it is handed to
    # util.get_rfr.
    # NOTE(review): presumably a list of candidate column selections - confirm
    # against util.get_rfr.
    easy_mode = [base_mode]
    hard_mode = [[*base_mode, "mean_triangles", "average_lcc"]]

    all_feature_sets = [easy_mode, hard_mode]
    all_feature_sets_description = ["Easy", "Hard"]

    # Score frames of this target, keyed by "<feature set>-validation" / "-test".
    RESULT = {}

    for feature_index, (feature_set, feature_set_name) in enumerate(
        zip(all_feature_sets, all_feature_sets_description)
    ):
        print("#######################################################################################################################")
        print("We are starting with features:", feature_set)
        print("#######################################################################################################################")

        # The "Hard" feature set is only trained for the replication factor.
        if TARGET != "replication_factor" and feature_set_name == "Hard":
            print("We do not train")
            continue

        model = util.get_rfr(
            features=feature_set,
            estimators=[100, 300, 500],
            depths=[10, 20, 30],
        )

        rmat_types = ["rmat-medium", "rmat-small"]

        # NOTE(review): 'realworld-cummunication' is spelled this way on
        # purpose here - it has to match the graph_type values in the data;
        # confirm before "fixing" it.
        real_world_types = [
            "realworld-web",
            "realworld-internet",
            "realworld-interaction",
            "realworld-soc",
            "realworld-product_network",
            "realworld-collaboration",
            "realworld-citation",
            "realworld-cummunication",
            "realworld-affiliation",
            "realworld-wiki",
        ]

        data_train_validate_test = []  # [(X_train, X_val, X_test, y_train, y_val, y_test)]
        data_description = []
        enrich_by = []

        # Enrichment levels to evaluate: 0.0 and 1.0 once, every intermediate
        # level three times. For a quick run, restrict this list to [0.0].
        enrichment_levels = (
            [0.0]
            + [level for level in (0.2, 0.4, 0.6, 0.8) for _repeat in range(3)]
            + [1.0]
        )
        for _enrich_by in enrichment_levels:
            # Split configuration handed to util.get_train_validate_test:
            # R-MAT graphs get training/validation sizes 0.8/0.2, real-world
            # graphs get test size 1, plus the current enrichment level.
            rmat_on_real = [
                {
                    "graph_type": rmat_types,
                    "training_size": 0.8,
                    "validation_size": 0.2,
                    "test_size": 0,
                },
                {
                    "graph_type": real_world_types,
                    "training_size": 0,
                    "validation_size": 0,
                    "test_size": 1,
                },
                {"enrich_by": _enrich_by},
            ]
            split = util.get_train_validate_test(
                combined_graphs_encoded,
                graphs_for_enrichment,
                target=TARGET,
                configurations=rmat_on_real,
            )
            data_train_validate_test.append(split)
            data_description.append("rmat-enriched-with-real")
            enrich_by.append(_enrich_by)

        validation_results, test_results = util.train(
            model=model,
            data_train_validate_test=data_train_validate_test,
            data_description=data_description,
            enrich_by=enrich_by,
            used_feature_set=feature_set_name,
            target=TARGET,
            model_name=MODEL_NAME,
        )

        RESULT.update(
            {
                feature_set_name + "-validation": validation_results,
                feature_set_name + "-test": test_results,
            }
        )

    # Persist every score frame collected for this target.
    for scores, score_frame in RESULT.items():
        print("store", scores)
        score_frame.to_csv("../models/{}_{}_{}".format(MODEL_NAME, scores, TARGET))