In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath('../..'))
from models import LogisticRegression, LogisticRegressionUndersampling, LogisticRegressionOversampling, LogisticRegressionSmote
# Location of the development split. The default is the original author's absolute path,
# so existing behaviour is unchanged; set the BREAST_CANCER_DEV_CSV environment variable
# to run the notebook on another machine without editing this cell.
DEV_CSV_PATH = os.environ.get(
    'BREAST_CANCER_DEV_CSV',
    '/home/facuvulcano/Machine-Learning/Vulcano_Facundo_TP3/Problema_1/data/raw/breast_cancer_dev.csv',
)
df = pd.read_csv(DEV_CSV_PATH)

In [2]:
# Pull out the 'target' labels of each class so the class imbalance can be inspected.
# A single .loc lookup (row mask, column label) replaces the chained df[mask]['target'].
class0 = df.loc[df['target'] == 0, 'target']
class1 = df.loc[df['target'] == 1, 'target']
# One print call; sep='\n' yields exactly the same text as two consecutive prints.
print(class0, class1, sep='\n')

1      0
2      0
3      0
5      0
6      0
      ..
383    0
384    0
385    0
386    0
387    0
Name: target, Length: 302, dtype: int64
0      1
4      1
7      1
8      1
14     1
      ..
369    1
372    1
380    1
381    1
382    1
Name: target, Length: 86, dtype: int64


## **Logistic Regression without rebalancing**

In [6]:
# Baseline estimator with no rebalancing step. The hyperparameters are the same values
# passed to the rebalanced variants in the cells below.
reg_witout_rebalncing = LogisticRegression(
    threshold=0.5,
    max_iter=1000,
    learning_rate=0.01,
    lambda_penalty=0.01,
)

## **Logistic Regression with undersampling: randomly remove majority-class samples until both classes have the same proportion**

In [2]:
# Build the undersampling variant on the full dev frame.
reg_with_undersampling = LogisticRegressionUndersampling(
    df,
    threshold=0.5,
    max_iter=1000,
    learning_rate=0.01,
    lambda_penalty=0.01,
)
# undersampling() hands back the rebalanced frame; split its labels by class to
# check the resulting class sizes.
balanced_df = reg_with_undersampling.undersampling()
class0 = balanced_df.loc[balanced_df['target'] == 0, 'target']
class1 = balanced_df.loc[balanced_df['target'] == 1, 'target']
# sep='\n' reproduces the output of two consecutive print calls.
print(class0, class1, sep='\n')

2      0
9      0
12     0
19     0
24     0
      ..
345    0
346    0
368    0
370    0
379    0
Name: target, Length: 86, dtype: int64
0      1
4      1
7      1
8      1
14     1
      ..
369    1
372    1
380    1
381    1
382    1
Name: target, Length: 86, dtype: int64


## **Logistic Regression with oversampling by duplication: randomly duplicate samples from minority class until both have the same proportion**

In [2]:
# Build the duplication-based oversampling variant on the full dev frame.
reg_with_oversampling = LogisticRegressionOversampling(
    df,
    threshold=0.5,
    max_iter=1000,
    learning_rate=0.01,
    lambda_penalty=0.01,
)
# oversampling() hands back the rebalanced frame; split its labels by class to
# check the resulting class sizes.
balanced_df = reg_with_oversampling.oversampling()
class0 = balanced_df.loc[balanced_df['target'] == 0, 'target']
class1 = balanced_df.loc[balanced_df['target'] == 1, 'target']
# sep='\n' reproduces the output of two consecutive print calls.
print(class0, class1, sep='\n')


1      0
2      0
3      0
5      0
6      0
      ..
383    0
384    0
385    0
386    0
387    0
Name: target, Length: 302, dtype: int64
0      1
4      1
7      1
8      1
14     1
      ..
86     1
196    1
117    1
366    1
8      1
Name: target, Length: 302, dtype: int64


## **Logistic Regression with SMOTE oversampling: generates synthetic samples from minority class until both have the same proportion**

In [2]:
# Build the SMOTE variant on the full dev frame.
reg_with_smote = LogisticRegressionSmote(
    df,
    threshold=0.5,
    max_iter=1000,
    learning_rate=0.01,
    lambda_penalty=0.01,
)
# smote() hands back the rebalanced frame; split its labels by class to check the
# resulting class sizes.
balanced_df = reg_with_smote.smote()
class0 = balanced_df.loc[balanced_df['target'] == 0, 'target']
class1 = balanced_df.loc[balanced_df['target'] == 1, 'target']
# sep='\n' reproduces the output of two consecutive print calls.
print(class0, class1, sep='\n')

1      0.0
2      0.0
3      0.0
5      0.0
6      0.0
      ... 
383    0.0
384    0.0
385    0.0
386    0.0
387    0.0
Name: target, Length: 302, dtype: float64
0      1.0
4      1.0
7      1.0
8      1.0
14     1.0
      ... 
599    1.0
600    1.0
601    1.0
602    1.0
603    1.0
Name: target, Length: 302, dtype: float64


In [14]:
# Creates synthetic samples for the minority class until it has as many samples as the majority class.
# Takes a sample from the minority class and computes its k (for example, k=5) nearest neighbours.
# It then interpolates the new sample with the formula: x_new = x_original + lambda * (x_neighbour - x_original)

# k-nearest neighbours
# We compute the Euclidean distance from point i to all the other samples.
# We keep the k samples with the smallest Euclidean distance.


# Sample DataFrame (replace this with your actual dataset)


# import random

# class KNerestNeighbors:
#     def __init__(self, k, df, class1_idxs) -> None:
#         self.k = k
#         self.df = df
#         self.class1_idxs = class1_idxs

#     def _euclidian_distance(self, row1, row2):
#         return np.sqrt(np.sum((row1 - row2) ** 2))

#     def calculate_k_nearest(self):
#         distances_dict = {}
#         for i in range(len(self.class1_idxs)):

#             distances = []
#             for j in range(len(self.class1_idxs)):
#                 if i != j:
#                     dist = self._euclidian_distance(self.df.iloc[self.class1_idxs[i]], self.df.iloc[self.class1_idxs[j]])
#                     distances.append((dist, self.class1_idxs[j]))
                    
#             distances.sort()
#             distances_dict[self.class1_idxs[i]] = distances[:self.k]       
#         return distances_dict

# # Initialization and neighbour calculation
# knn = KNerestNeighbors(k=5, df=df, class1_idxs=class1_idxs)
# k_nearest_distances = knn.calculate_k_nearest()
# # imbalance between classes
# diff = len(class0) - len(class1)
# new_rows = []
# # Counter used to iterate cyclically through class1_idxs
# index_counter = 0
# #Create new rows until dataset is balanced
# while len(new_rows) < diff:
#     #get index cyclically
#     key = class1_idxs[index_counter % len(class1_idxs)]
#     #get x_original based on the key
#     x_original = df.iloc[key]
#     #get a random neighbour from the k nearest
#     value = k_nearest_distances[key]
#     random_neighbour_index = random.choice(value)[1]
#     x_neighbour =  df.iloc[random_neighbour_index]


#     lamb = np.random.uniform(0, 1)

#     new_row = x_original + lamb * (x_neighbour - x_original)
#     new_rows.append(new_row)

#     index_counter += 1


{0: [(0.055444794183309824, 169), (0.05626013103546007, 270), (0.06320975033768471, 54), (0.07426021259617163, 262), (0.08647294226990418, 237)], 4: [(0.07064682707159745, 381), (0.12046070435105206, 26), (0.1226885911167266, 146), (0.12308715134800191, 220), (0.14169778334911046, 318)], 7: [(0.06283111739550216, 176), (0.07145212301086601, 237), (0.09171945753666981, 158), (0.1085646416409678, 369), (0.11286888157143807, 349)], 8: [(0.061188800997155915, 84), (0.06404837053463333, 186), (0.06408336254504371, 14), (0.06498715525902413, 275), (0.066496147556151, 211)], 14: [(0.02450923974552526, 211), (0.06408336254504371, 8), (0.07415662606239984, 84), (0.07724741843500496, 289), (0.08535378527051746, 335)], 17: [(0.08346220680867848, 100), (0.11615705025291433, 171), (0.11639899276546081, 353), (0.11659462966710157, 335), (0.117658619239518, 146)], 18: [(0.07519653054157928, 173), (0.09745258820312072, 220), (0.12047954010536056, 169), (0.12068068092011901, 0), (0.1257593973631509, 26