# Augmentation_8
SMOTE-OUT: Alleviate the problem of SMOTE creating meaninglss synthetic examples in dense distribution

In [86]:
import pandas as pd
data = pd.read_csv("./merged.csv")

In [87]:
X=data
y=data['Petting']

# Smote-out
[Before] Petting: 5 / Non-Petting: 10 --> [After] Petting: 90 / Non-Petting: 10

In [88]:
import sys
sys.path.append('C:/Users/jeony/AppData/Local/Programs/Python/Python311/Lib/site-packages')

In [89]:
import numpy as np

In [90]:
from smote_variants.base import NearestNeighborsWithMetricTensor
from smote_variants.base import OverSampling
from smote_variants.base import coalesce
from smote_variants._logger import logger

In [91]:
_logger = logger
__all__= ['SMOTE_OUT']

class SMOTE_OUT(OverSampling):

    categories = [OverSampling.cat_extensive,
                  OverSampling.cat_metric_learning]

    def __init__(self,
                 proportion=17.0, # Create 85 more Petting ==1 Data (5*17=85)
                 n_neighbors=2,
                 *,
                 nn_params=None,
                 n_jobs=1,
                 random_state=None,
                 **_kwargs):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and n_min
                                to sample e.g. 1.0 means that after sampling
                                the number of minority samples will be equal to
                                the number of majority samples
            n_neighbors (int): parameter of the NearestNeighbors component
            nn_params (dict): additional parameters for nearest neighbor calculations, any
                                parameter NearestNeighbors accepts, and additionally use
                                {'metric': 'precomputed', 'metric_learning': '<method>', ...}
                                with <method> in 'ITML', 'LSML' to enable the learning of
                                the metric to be used for neighborhood calculations
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__(random_state=random_state)
        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater_or_equal(n_neighbors, "n_neighbors", 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.nn_params = coalesce(nn_params, {})
        self.n_jobs = n_jobs
        self.y_index = 16 # X.columns.get_loc('Petting') = 16 
        
    @ classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5]}
        return cls.generate_parameter_combinations(parameter_combinations, raw)

    def class_label_statistics(self, y):
        """
        determines class sizes and minority and majority labels
        Args:
            X (np.array): features
            y (np.array): target labels
        """
        unique, counts = np.unique(y, return_counts=True)
        self.class_stats = dict(zip(unique, counts))
        self.min_label = unique[0] if counts[0] < counts[1] else unique[1]
        self.maj_label = unique[1] if counts[0] < counts[1] else unique[0]

    def generate_samples(self, *, X, X_maj, X_min, minority_indices,
                            n_to_sample, maj_indices, min_indices):
        """
        Generate samples

        Args:
            X (np.array): all training vectors
            X_maj (np.array): majority vectors
            X_min (np.array): minority vectors
            minority_indices (np.array): the minority indices
            n_to_sample (int): number of samples to generate
            maj_indices (np.array): majority neighborhood structure
            min_indices (np.array): minority neighborhood structure

        Returns:
            np.array: the generated samples
        """
        base_ind = self.random_state.choice(np.arange(len(minority_indices)),
                                            n_to_sample)

        u = X[minority_indices[base_ind]] # pylint: disable=invalid-name
        neigh_ind = self.random_state.choice(np.arange(0, maj_indices.shape[1]),
                                             n_to_sample)
        v = X_maj[maj_indices[base_ind, neigh_ind]] # pylint: disable=invalid-name
        uu = u + self.random_state.random_sample(u.shape) * 0.3 * (u - v) # pylint: disable=invalid-name
        min_neigh_ind = self.random_state.choice(np.arange(1, min_indices.shape[1]),
                                                 n_to_sample)
        x = X_min[min_indices[base_ind, min_neigh_ind]] # pylint: disable=invalid-name
        return x + self.random_state.random_sample(x.shape) * 0.5 * (uu - x)

    def sampling_algorithm(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target labels
        """
        self.class_label_statistics(y)
        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])
        if n_to_sample == 0:
            return self.return_copies(X, y, "Sampling is not needed")

        X_min = X[np.where(y == self.min_label)[0]]
        X_maj = X[np.where(y == self.maj_label)[0]]

        minority_indices = np.where(y == self.min_label)[0]       

        # Nearest neighbors among minority points
        nn_params = {**self.nn_params}
        nn_params['metric_tensor'] = \
            self.metric_tensor_from_nn_params(nn_params, X, y)

        n_neighbors_min = min([len(X_min), self.n_neighbors+1])
        nn_min= NearestNeighborsWithMetricTensor(n_neighbors=n_neighbors_min,
                                                    n_jobs=self.n_jobs,
                                                    **nn_params)
        nn_min.fit(X_min)
        
        min_indices = nn_min.kneighbors(X_min, return_distance=False)
        # nearest neighbors among majority points
        n_neighbors_maj = min([len(X_maj), self.n_neighbors+1])
        nn_maj= NearestNeighborsWithMetricTensor(n_neighbors=n_neighbors_maj,
                                                    n_jobs=self.n_jobs,
                                                    **nn_params)
        nn_maj.fit(X_maj)
        maj_indices = nn_maj.kneighbors(X_min, return_distance=False)

        samples = self.generate_samples(X=X, X_maj=X_maj, X_min=X_min,
                    minority_indices=minority_indices, n_to_sample=n_to_sample,
                    maj_indices=maj_indices, min_indices=min_indices)


        # generate samples
        #samples = []
        #for _ in range(n_to_sample):
        #    # implementation of Algorithm 1 in the paper
        #    random_idx = self.random_state.choice(np.arange(len(minority_indices)))
        #    u = X[minority_indices[random_idx]]
        #    v = X_maj[self.random_state.choice(maj_indices[random_idx])]
        #    dif1 = u - v
        #    uu = u + self.random_state.random_sample()*0.3*dif1
        #    x = X_min[self.random_state.choice(min_indices[random_idx][1:])]
        #    dif2 = uu - x
        #    w = x + self.random_state.random_sample()*0.5*dif2
        #
        #    samples.append(w)

        return (np.vstack([X, samples]),
                np.hstack([y, np.repeat(self.min_label, len(samples))]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'nn_params': self.nn_params,
                'n_jobs': self.n_jobs,
                **OverSampling.get_params(self)}

In [92]:
smote_out=SMOTE_OUT()

In [93]:
oversampled = smote_out.sampling_algorithm(np.array(X),np.array(y))

2024-03-19 09:54:54,921:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-03-19 09:54:54,923:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2024-03-19 09:54:54,925:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-03-19 09:54:54,926:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski


In [94]:
df = pd.DataFrame(oversampled[0], columns= X.keys()) 
difference = oversampled[1][len(X['Petting']):]
df.loc[df.index[-len(difference):], 'Petting'] = difference

In [95]:
# Save the new generated data into csv file
df.to_csv("./generated.csv")

# Smote-out
[Before] Petting: 90 / Non-Petting: 10 --> [After] Petting: 90 / Non-Petting: 90

In [96]:
data1 = pd.read_csv("./generated.csv")

In [97]:
X1=data1
y1=data1['Petting']

In [98]:
_logger = logger
__all__= ['SMOTE_OUT']

class SMOTE_OUT1(OverSampling):

    categories = [OverSampling.cat_extensive,
                  OverSampling.cat_metric_learning]

    def __init__(self,
                 proportion=1.0, #Since Petting ==1 has 90 data, synthesize non_petting data same number to that
                 n_neighbors=2,
                 *,
                 nn_params=None,
                 n_jobs=1,
                 random_state=None,
                 **_kwargs):
        """
        Constructor of the sampling object

        Args:
            proportion (float): proportion of the difference of n_maj and n_min
                                to sample e.g. 1.0 means that after sampling
                                the number of minority samples will be equal to
                                the number of majority samples
            n_neighbors (int): parameter of the NearestNeighbors component
            nn_params (dict): additional parameters for nearest neighbor calculations, any
                                parameter NearestNeighbors accepts, and additionally use
                                {'metric': 'precomputed', 'metric_learning': '<method>', ...}
                                with <method> in 'ITML', 'LSML' to enable the learning of
                                the metric to be used for neighborhood calculations
            n_jobs (int): number of parallel jobs
            random_state (int/RandomState/None): initializer of random_state,
                                                    like in sklearn
        """
        super().__init__(random_state=random_state)
        self.check_greater_or_equal(proportion, "proportion", 0)
        self.check_greater_or_equal(n_neighbors, "n_neighbors", 1)
        self.check_n_jobs(n_jobs, 'n_jobs')

        self.proportion = proportion
        self.n_neighbors = n_neighbors
        self.nn_params = coalesce(nn_params, {})
        self.n_jobs = n_jobs
        self.y_index = 16 # X.columns.get_loc('Petting') = 16 
        
    @ classmethod
    def parameter_combinations(cls, raw=False):
        """
        Generates reasonable parameter combinations.

        Returns:
            list(dict): a list of meaningful parameter combinations
        """
        parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75,
                                                 1.0, 1.5, 2.0],
                                  'n_neighbors': [3, 5]}
        return cls.generate_parameter_combinations(parameter_combinations, raw)

    def class_label_statistics(self, y):
        """
        determines class sizes and minority and majority labels
        Args:
            X (np.array): features
            y (np.array): target labels
        """
        unique, counts = np.unique(y, return_counts=True)
        self.class_stats = dict(zip(unique, counts))
        self.min_label = unique[0] if counts[0] < counts[1] else unique[1]
        self.maj_label = unique[1] if counts[0] < counts[1] else unique[0]

    def generate_samples(self, *, X, X_maj, X_min, minority_indices,
                            n_to_sample, maj_indices, min_indices):
        """
        Generate samples

        Args:
            X (np.array): all training vectors
            X_maj (np.array): majority vectors
            X_min (np.array): minority vectors
            minority_indices (np.array): the minority indices
            n_to_sample (int): number of samples to generate
            maj_indices (np.array): majority neighborhood structure
            min_indices (np.array): minority neighborhood structure

        Returns:
            np.array: the generated samples
        """
        base_ind = self.random_state.choice(np.arange(len(minority_indices)),
                                            n_to_sample)

        u = X[minority_indices[base_ind]] # pylint: disable=invalid-name
        neigh_ind = self.random_state.choice(np.arange(0, maj_indices.shape[1]),
                                             n_to_sample)
        v = X_maj[maj_indices[base_ind, neigh_ind]] # pylint: disable=invalid-name
        uu = u + self.random_state.random_sample(u.shape) * 0.3 * (u - v) # pylint: disable=invalid-name
        min_neigh_ind = self.random_state.choice(np.arange(1, min_indices.shape[1]),
                                                 n_to_sample)
        x = X_min[min_indices[base_ind, min_neigh_ind]] # pylint: disable=invalid-name
        return x + self.random_state.random_sample(x.shape) * 0.5 * (uu - x)

    def sampling_algorithm(self, X, y):
        """
        Does the sample generation according to the class parameters.

        Args:
            X (np.ndarray): training set
            y (np.array): target labels

        Returns:
            (np.ndarray, np.array): the extended training set and target labels
        """
        # n_to_sample=5

        self.class_label_statistics(y)
        n_to_sample = self.det_n_to_sample(self.proportion,
                                           self.class_stats[self.maj_label],
                                           self.class_stats[self.min_label])
        if n_to_sample == 0:
            return self.return_copies(X, y, "Sampling is not needed")
 
        X_min = X[np.where(y == self.min_label)[0]]
        X_maj = X[np.where(y == self.maj_label)[0]]

        minority_indices = np.where(y == self.min_label)[0]
        

        # Nearest neighbors among minority points
        nn_params = {**self.nn_params}
        nn_params['metric_tensor'] = \
            self.metric_tensor_from_nn_params(nn_params, X, y)

        n_neighbors_min = min([len(X_min), self.n_neighbors+1])
        nn_min= NearestNeighborsWithMetricTensor(n_neighbors=n_neighbors_min,
                                                    n_jobs=self.n_jobs,
                                                    **nn_params)
        nn_min.fit(X_min)
        
        min_indices = nn_min.kneighbors(X_min, return_distance=False)
        
        # Nearest neighbors among majority points
        n_neighbors_maj = min([len(X_maj), self.n_neighbors+1])
        nn_maj= NearestNeighborsWithMetricTensor(n_neighbors=n_neighbors_maj,
                                                    n_jobs=self.n_jobs,
                                                    **nn_params)
        nn_maj.fit(X_maj)
        maj_indices = nn_maj.kneighbors(X_min, return_distance=False)

        samples = self.generate_samples(X=X, X_maj=X_maj, X_min=X_min,
                    minority_indices=minority_indices, n_to_sample=n_to_sample,
                    maj_indices=maj_indices, min_indices=min_indices)


        # generate samples
        #samples = []
        #for _ in range(n_to_sample):
        #    # implementation of Algorithm 1 in the paper
        #    random_idx = self.random_state.choice(np.arange(len(minority_indices)))
        #    u = X[minority_indices[random_idx]]
        #    v = X_maj[self.random_state.choice(maj_indices[random_idx])]
        #    dif1 = u - v
        #    uu = u + self.random_state.random_sample()*0.3*dif1
        #    x = X_min[self.random_state.choice(min_indices[random_idx][1:])]
        #    dif2 = uu - x
        #    w = x + self.random_state.random_sample()*0.5*dif2
        #
        #    samples.append(w)

        return (np.vstack([X, samples]),
                np.hstack([y, np.repeat(self.min_label, len(samples))]))

    def get_params(self, deep=False):
        """
        Returns:
            dict: the parameters of the current sampling object
        """
        return {'proportion': self.proportion,
                'n_neighbors': self.n_neighbors,
                'nn_params': self.nn_params,
                'n_jobs': self.n_jobs,
                **OverSampling.get_params(self)}

In [99]:
smote_out1=SMOTE_OUT1()
oversampled1 = smote_out1.sampling_algorithm(np.array(X1),np.array(y1))
df1 = pd.DataFrame(oversampled1[0], columns= X1.keys())

2024-03-19 09:54:55,065:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-03-19 09:54:55,066:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski
2024-03-19 09:54:55,068:INFO:NearestNeighborsWithMetricTensor: NN fitting with metric minkowski
2024-03-19 09:54:55,069:INFO:NearestNeighborsWithMetricTensor: kneighbors query minkowski


In [100]:
difference1 = oversampled1[1][len(X1['Petting']):]
df1.loc[df1.index[-len(difference1):], 'Petting'] = difference1

In [101]:
# Save the result o csv file
df1.to_csv("./result.csv")

## Random Forest

In [102]:
df = pd.read_csv("./result.csv")

In [103]:
# Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.metrics import roc_curve, roc_auc_score

In [104]:
def random_forest(data):
    
    # Split the data into features (X) and target (y)
    X = data.drop('Petting', axis=1)
    y = data['Petting']
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    y_pred_prob = rf.predict_proba(X_test)[:, 1]
    
    # Calc acc, auc
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    return accuracy, roc_auc

In [105]:
# For more accurate result, try 5 times
acc1,auc1=random_forest(df)
acc2,auc2=random_forest(df)
acc3,auc3=random_forest(df)
acc4,auc4=random_forest(df)
acc5,auc5=random_forest(df)

In [106]:
acc_avg=(acc1+acc2+acc3+acc4+acc5)/5
auc_avg=(auc1+auc2+auc3+auc4+auc5)/5
print("acc:", acc_avg)
print("auc:", auc_avg)

acc: 0.9666666666666666
auc: 0.9837098768534528
