In [None]:
# Reference: 
# --- https://github.com/atif-hassan/FRUFS
# --- https://github.com/Vevesta/VevestaX
!pip install FRUFS vevestaX

In [None]:
#Import libraries
import math
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold
import time
from sklearn.cluster import KMeans
from matplotlib.pyplot import figure
from sklearn.metrics.cluster import normalized_mutual_info_score

In [None]:
# Bring in the FRUFS feature selector and the vevestaX module,
# then instantiate the vevestaX experiment object used by the later cells
from FRUFS import FRUFS
from vevestaX import vevesta as v

V = v.Experiment()

In [None]:
# Read the dataset into a dataframe (the approach is ML agnostic, so any dataset works)
# Point this URL at your own dataset to reuse the notebook
wine_csv_url = "https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv"
df = pd.read_csv(wine_csv_url)

# Preview the first few rows
df.head()

In [None]:
# Separate the input features from the target variable
# ('Wine' is the target column here; replace it with your own target)
data = df.drop(columns=['Wine'])
Y = df['Wine'].values

# Sanity-check the shapes of the features and of the target
print("Data shape: ", data.shape, "Target Variable shape: ", Y.shape)

In [None]:
# Hand the input features to the vevestaX experiment through its `ds` attribute
# (per the original notes, this is how the feature names get extracted)
V.ds = data
# Show what the experiment now holds in `ds`; as the cell's last expression it is rendered as output
V.ds

In [None]:
# Start the tracking scope of the variables
V.start()

# Derive the number of classes from the target instead of hard-coding it,
# so the value always matches whichever dataset was loaded above.
# len() returns a plain Python int.
num_classes = len(np.unique(Y))

In [None]:
# We want to time our algorithm
start = time.time()

# Experiment settings, defined once up front so they are easy to find and tune
NoOfSplits = 10
seed= 27
shuffleFlag = True
# Number of features FRUFS is asked to keep (passed as its `k` argument below).
# It is the same for every fold, so it is set here rather than inside the loop.
noOfFeaturesSelected = 6

# Use KFold for understanding the performance
kfold = KFold(n_splits=NoOfSplits, random_state=seed, shuffle=shuffleFlag)

# This will hold the accuracy score of every fold
scores = list()

# Perform CV
for train, test in kfold.split(data):
    # Split data into train and test based on folds
    x_train, x_test = data.iloc[train], data.iloc[test]
    y_train, y_test = Y[train], Y[test]

    # Convert the data into numpy arrays
    x_train, x_test = x_train.values, x_test.values

    # Initialize the FRUFS object with your supervised algorithm of choice.
    # A new selector is created for each fold and fitted on that fold's training rows only.
    model = FRUFS(model_r=DecisionTreeRegressor(random_state=seed), k=noOfFeaturesSelected, n_jobs=-1, verbose=0, random_state=seed)

    # Train the FRUFS model and use it to downsize your data
    x_train = model.fit_transform(x_train)
    x_test = model.transform(x_test)

    # Finally, classify on selected features
    model_dt = DecisionTreeClassifier(random_state=seed)
    model_dt.fit(x_train, y_train)
    preds = model_dt.predict(x_test)

    # Score this fold with classification accuracy on the held-out rows
    score = accuracy_score(y_test, preds)
    print("Score:", score)
    scores.append(score)

# Compute average score across all folds
averageAccuracy = sum(scores)/len(scores)
print("\n\nAverage Accuracy: ", averageAccuracy)

# Finally, check out the total time taken
end = time.time()
timeTaken = end-start
print("\n\nTotal Time Required (in seconds): ", timeTaken)

In [None]:
# End the tracking scope of variables that was opened earlier with V.start()
V.end()

In [None]:
# Create an 8x20 inch figure at 100 dpi before asking FRUFS for the feature importances
# (presumably FRUFS draws onto this current figure -- confirm against the FRUFS docs)
figure(figsize=(8, 20), dpi=100)
# NOTE: `model` is the FRUFS object left over from the last cross-validation fold,
# so what is shown here reflects that final fold only
model.feature_importance()

In [None]:
# Download the Excel Workbook (there are MULTIPLE tabs created)
# The message is built from noOfFeaturesSelected so that it always reports the
# number of features FRUFS was actually asked to keep in the cross-validation cell.
V.dump(techniqueUsed = "Decision tree with FRUFS", message = "{} selected features were used".format(noOfFeaturesSelected), version=1)