In [1]:
import pandas as pd
import numpy as np
import json
import ast
import pickle

#  Functions

In [2]:
def _create_sparse_df(df):
    # # ***** create sparse one-hot encoded dataset *******
    X = pd.DataFrame(0, index=range(df.shape[0]), columns=['col_{}'.format(i) for i in range(70)])

    def set_values(row):
        A = row['A']
        B = row['B']
        C = row['C']
        X.loc[row.name, 'col_{}'.format(A)] = 1
        X.loc[row.name, 'col_{}'.format(B)] = 1
        X.loc[row.name, 'col_{}'.format(C)] = 1

    df.apply(set_values, axis=1)

    X = pd.get_dummies(df[['A', 'B', 'C']], columns=['A', 'B', 'C'], prefix='col')
    X = X.groupby(level=0, axis=1).max()

    X['feasibility'] = df['2'].copy()
    X['optimality'] = df['5'].copy()
    return X

def _read_df(num):
    FILENAME = 'temp_files/temp_df_pop_{}.csv'.format(num)
    
    df = pd.read_csv(FILENAME)
    df['1'] = df['1'].apply(lambda x: json.loads(x))
    df['3'] = df['3'].apply(lambda x: ast.literal_eval(x))
    df['3'] = [list(t) for t in df['3']]
    df[['A', 'B', 'C']] = df['1'].apply(lambda x: pd.Series(x))
    df.drop(columns=['3','4'], inplace=True)
    return df

def _read_rank_list(num):
    FILENAME = 'temp_files/rank_list_{}.pickle'.format(num)
    
    with open(FILENAME, 'rb') as f:
        rank_list = pickle.load(f)
        
    rank_list = pd.DataFrame(rank_list)
    return rank_list

# Read data

In [3]:
df_1 = _read_df(1)
rank_list_1 = _read_rank_list(1)

df_2 = _read_df(2)
rank_list_2 = _read_rank_list(2)

In [4]:
rank_list_elm  = rank_list_1[0].values
idx = df_1[df_1['0'].isin(rank_list_elm)].index 
df_1['selected'] = 0
# Set a value in the 'A' column of the selected rows
df_1.loc[idx, 'selected'] = 1

rank_list_elm  = rank_list_2[0].values
idx = df_2[df_2['0'].isin(rank_list_elm)].index 
df_2['selected'] = 0
# Set a value in the 'A' column of the selected rows
df_2.loc[idx, 'selected'] = 1

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X_train = df_1[['A','B','C','2']].copy()
y_train = df_1['selected'].copy()

X_test = df_2[['A','B','C','2']].copy()
y_test = df_2['selected'].copy()

print(X_train.shape, X_test.shape)

(5007, 4) (3789, 4)


# Over/under sampling techniques

In [10]:
from imblearn.under_sampling import RandomUnderSampler

# Create an instance of the RandomUnderSampler class
rus = RandomUnderSampler()

# Perform undersampling on the feature and target data
X_train, y_train = rus.fit_resample(X_train, y_train)

In [15]:
# Define the SVM classifier
svc = SVC()

# Define the hyperparameters to tune and their possible values
params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Create an instance of the GridSearchCV class
grid_search = GridSearchCV(svc, params, cv=3, 
                           scoring='matthews_corrcoef', verbose=2)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

model = grid_search.best_estimator_
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

score = matthews_corrcoef(y_test, y_pred)
print(score)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END ................C=0.001, gamma=0.001, kernel=linear; total time=   0.3s
[CV] END ................C=0.001, gamma=0.001, kernel=linear; total time=   0.4s
[CV] END ................C=0.001, gamma=0.001, kernel=linear; total time=   0.3s
[CV] END ...................C=0.001, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ...................C=0.001, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END ...................C=0.001, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END .................C=0.001, gamma=0.01, kernel=linear; total time=   0.3s
[CV] END .................C=0.001, gamma=0.01, kernel=linear; total time=   0.4s
[CV] END .................C=0.001, gamma=0.01, kernel=linear; total time=   0.3s
[CV] END ....................C=0.001, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ....................C=0.001, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ....................C=0.001, gamma=0.0

[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   0.4s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   0.6s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   0.5s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   0.5s
[CV] END .........................C=1, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   0.4s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   0.6s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   0.5s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   2.8s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   2.8s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=   2.7s
[CV] END ...................

In [16]:
print(score)

0.0


# Selected Cuts

In [102]:
df = pd.read_csv('Results/DataFrame_clustering.csv', index_col=0)
df.drop(['curr_pt', 'Xarr_inds', 'dim_act'], axis=1, inplace=True)
df['set_inds'] = df['set_inds'].apply(lambda x: json.loads(x))
df[['A', 'B', 'C']] = df['set_inds'].apply(lambda x: pd.Series(x))
df

Unnamed: 0,agg_idx,set_inds,-eigeval,A,B,C
0,1858,"[7, 22, 63]",0.724745,7,22,63
1,2480,"[10, 29, 54]",0.724745,10,29,54
2,1905,"[7, 30, 52]",0.724745,7,30,52
3,4947,"[27, 29, 55]",0.724745,27,29,55
4,1854,"[7, 22, 52]",0.724745,7,22,52
...,...,...,...,...,...,...
95,3206,"[14, 34, 35]",0.018608,14,34,35
96,1561,"[6, 18, 40]",0.018545,6,18,40
97,698,"[2, 38, 60]",0.018490,2,38,60
98,3528,"[16, 42, 45]",0.018445,16,42,45


In [103]:
X = _create_sparse_df(df)

In [104]:
df_100 = df[0:100].copy()
df_200 = df[100:200].copy()

X_100 = _create_sparse_df(df_100)
X_200 = _create_sparse_df(df_200)

df_1800 = df[1800:1900].copy()
df_1900 = df[1900:2000].copy()

X_1800 = _create_sparse_df(df_1800)
X_1900 = _create_sparse_df(df_1900)