In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Load the source workbook into the notebook-wide `data` frame.
dataset_path = 'datasets/dataset-made-out-of-9-with-all-but-40.xlsx'
data = pd.read_excel(dataset_path)

In [3]:
# Character count of each row's extended target text.
extended_text = data['end_target_text_extended']
string_lengths = extended_text.str.len()

# Number of rows sharing each distinct text length, most frequent first.
string_length_distribution = string_lengths.value_counts()

# Show the distribution together with the total row count.
(string_length_distribution, len(data))

(end_target_text_extended
 1617    9
 1945    8
 1609    7
 1748    7
 1864    7
        ..
 1793    1
 3308    1
 2600    1
 2732    1
 1461    1
 Name: count, Length: 1299, dtype: int64,
 2551)

In [4]:
# Rows per target class, to see how imbalanced the labels are.
end_target_counts = data['end_target'].value_counts()

# Summary statistics of the text lengths computed in the previous cell.
length_summary = string_lengths.describe()

(end_target_counts, length_summary)

(end_target
 No target                     1509
 Net zero                       668
 Carbon neutral(ity)            225
 Emissions reduction target     149
 Name: count, dtype: int64,
 count    2551.000000
 mean     1990.972560
 std       462.539718
 min       859.000000
 25%      1673.000000
 50%      1936.000000
 75%      2249.500000
 max      5531.000000
 Name: end_target_text_extended, dtype: float64)

In [5]:
df = data

# Split the frame into the label column and everything else.
y = df['end_target']
X = df.drop(columns=['end_target'])

# 'not majority' resamples every class except the largest one, so each
# minority class is duplicated (with replacement) up to the majority count.
# The fixed random_state keeps the drawn duplicates reproducible.
# NOTE(review): rows are duplicated verbatim; if a train/test split happens
# after this step, copies of one row could land on both sides -- confirm the
# split is done upstream of this notebook.
ros = RandomOverSampler(sampling_strategy='not majority', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Reassemble one frame: the feature columns first, the label column last.
features_frame = pd.DataFrame(X_resampled, columns=X.columns)
target_frame = pd.DataFrame(y_resampled, columns=['end_target'])
df_resampled = pd.concat([features_frame, target_frame], axis=1)

# Sanity check: every class should now have the same number of rows.
print(df_resampled['end_target'].value_counts())

end_target
Net zero                      1509
Carbon neutral(ity)           1509
Emissions reduction target    1509
No target                     1509
Name: count, dtype: int64


In [6]:
# Persist the balanced frame; index=False leaves the row index out of the sheet.
output_path = 'datasets/10-oversampled-dataset-with-even-classes-final.xlsx'
df_resampled.to_excel(output_path, index=False)