In [1]:
import numpy as np
import pandas as pd
# from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

In [2]:
# Load the feature table from disk (the first column is later read as the class label)
df = pd.read_csv(
    './dataset/subcellular_location_n2v.csv',
    encoding='utf8',
)

In [3]:
# Column 0 holds the class label; every remaining column is a feature.
label_column = df.iloc[:, 0]
feature_columns = df.iloc[:, 1:]
y = label_column.values
x = feature_columns.values
print(x.shape, y.shape)

(5497, 500) (5497,)


In [4]:
# Distinct class labels together with the number of samples in each class (via numpy)
class_labels_and_counts = np.unique(y, return_counts=True)
print(class_labels_and_counts)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([1658,   38,  557,   87,  206,   27,  696,    3,  109,   51,  378,
         35,  125, 1422,   48,   57]))


In [5]:
# Same per-class counts via pandas: once from value_counts on the label column,
# once from a groupby on the 'class' column.
counts_from_value_counts = df.iloc[:, 0].value_counts()
print(counts_from_value_counts)
counts_from_groupby = df.groupby('class').size()
print('\n', counts_from_groupby)

1     1658
14    1422
7      696
3      557
11     378
5      206
13     125
9      109
4       87
16      57
10      51
15      48
2       38
12      35
6       27
8        3
Name: class, dtype: int64

 class
1     1658
2       38
3      557
4       87
5      206
6       27
7      696
8        3
9      109
10      51
11     378
12      35
13     125
14    1422
15      48
16      57
dtype: int64


### Apply SMOTE over-sampling (the combined SMOTE + ENN variant is not used here)

In [6]:
# k_neighbors=2: the rarest class (label 8) has only 3 samples, so each of its
# samples has at most 2 same-class neighbours to interpolate with.
# random_state is fixed (same seed as the KFold splitter in this notebook) so
# the generated synthetic samples are reproducible across runs.
sm = SMOTE(k_neighbors=2, random_state=123)

In [7]:
# Over-sample every minority class up to the size of the largest class.
# fit_resample is the supported imbalanced-learn entry point; the older
# fit_sample alias was deprecated and has since been removed from the library.
x_resampled, y_resampled = sm.fit_resample(x, y)

In [8]:
# Class labels and per-class sample counts after over-sampling (via numpy)
resampled_labels_and_counts = np.unique(y_resampled, return_counts=True)
print(resampled_labels_and_counts)

(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([1658, 1658, 1658, 1658, 1658, 1658, 1658, 1658, 1658, 1658, 1658,
       1658, 1658, 1658, 1658, 1658]))


In [9]:
# Compare the number of original samples with the number after synthetic samples were added
original_shape, resampled_shape = y.shape, y_resampled.shape
print(original_shape, resampled_shape)

(5497,) (26528,)


### Get the fake (synthetic) data as the set difference between the resampled set and the original set

In [10]:
# Wrap both feature matrices (original and resampled) in DataFrames
df_x, df_x_resampled = (pd.DataFrame(matrix) for matrix in (x, x_resampled))

In [11]:
# Show the shape of the original frame, then of the resampled frame
for frame in (df_x, df_x_resampled):
    print(frame.shape)

(5497, 500)
(26528, 500)


In [12]:
# print(df_x_resampled.iloc[:,0])

In [13]:
# Build a boolean mask over the resampled rows: True = real (original) sample.
# Matching on the values of a single feature column is fragile, because a
# synthetic row whose first feature happens to equal a real value would be
# counted as real. The resampled matrix is expected to start with the original
# rows followed by the synthetic ones, so the mask is built by position
# instead; the assertion verifies that expectation against the actual data.
n_real = len(df_x)
assert np.array_equal(df_x_resampled.iloc[:n_real].values, df_x.values), \
    'resampled data does not start with the original samples'
real_indexes = pd.Series(
    np.arange(len(df_x_resampled)) < n_real,
    index=df_x_resampled.index,
    name=0,  # same Series name the first-column lookup produced
)

In [14]:
# Size of the mask, then how many entries are False (fake) and True (real)
mask_values = real_indexes.values
print(real_indexes.shape)
print(np.unique(mask_values, return_counts=True))

(26528,)
(array([False,  True]), array([21031,  5497]))


### Get fake data indexes (logical negation of the real-data mask)

In [15]:
# Invert the real-sample mask (kept as a plain list of bools) and use it to
# select only the synthetic rows of the resampled frame.
fake_indexes = (~real_indexes).tolist()
df_fake = df_x_resampled[fake_indexes]

In [16]:
# Report (rows, columns) of the synthetic-only frame
n_fake_rows, n_fake_columns = df_fake.shape
print((n_fake_rows, n_fake_columns))

(21031, 500)


### Split dataset into k shuffled folds

In [17]:
from sklearn.model_selection import KFold

In [18]:
# 10-fold splitter; shuffling with a fixed seed keeps fold membership reproducible.
rs = KFold(n_splits=10, shuffle=True, random_state=123)
# KFold.split returns a one-shot generator. Materialise each split as a list so
# the cells that iterate over them can be re-run without silently seeing zero folds.
real_index_set = list(rs.split(y))
resampled_index_set = list(rs.split(y_resampled))

In [19]:
# Collect the held-out (test) positions of every fold of the original data as
# plain Python lists, printing each fold's size and finally the fold count.
real_index_cache = []
for train_index, test_index in real_index_set:
    fold_positions = test_index.tolist()
    print(len(fold_positions))
    real_index_cache += [fold_positions]
print(len(real_index_cache))

550
550
550
550
550
550
550
549
549
549
10


In [20]:
# Print the held-out (test) size of every fold of the resampled data
for train_index, test_index in resampled_index_set:
    fold_size = len(test_index)
    print(fold_size)

2653
2653
2653
2653
2653
2653
2653
2653
2652
2652


In [24]:
# Dump the held-out index lists of all folds of the original data.
# NOTE(review): this prints every index of every fold, so the output is very long.
print(real_index_cache)

[[30, 31, 42, 43, 72, 77, 92, 95, 97, 116, 119, 137, 152, 159, 162, 165, 184, 188, 193, 198, 213, 220, 222, 238, 239, 242, 244, 252, 257, 263, 284, 299, 319, 332, 333, 334, 352, 364, 379, 380, 381, 426, 429, 440, 444, 445, 457, 476, 511, 517, 518, 523, 527, 537, 543, 547, 552, 563, 586, 608, 614, 646, 653, 658, 661, 671, 681, 694, 704, 722, 723, 724, 730, 738, 758, 770, 774, 785, 804, 841, 849, 889, 893, 918, 923, 925, 927, 935, 967, 969, 973, 976, 994, 1003, 1004, 1024, 1025, 1055, 1066, 1074, 1084, 1085, 1088, 1091, 1103, 1112, 1114, 1121, 1123, 1127, 1148, 1152, 1155, 1161, 1172, 1179, 1186, 1189, 1213, 1229, 1243, 1250, 1254, 1260, 1271, 1284, 1296, 1309, 1313, 1314, 1317, 1335, 1355, 1362, 1364, 1371, 1384, 1395, 1399, 1430, 1432, 1440, 1445, 1448, 1449, 1458, 1463, 1478, 1479, 1499, 1502, 1504, 1509, 1515, 1532, 1544, 1553, 1555, 1567, 1569, 1580, 1583, 1595, 1599, 1607, 1628, 1635, 1637, 1638, 1642, 1643, 1644, 1665, 1667, 1671, 1692, 1693, 1708, 1717, 1723, 1726, 1742, 1746, 17

In [26]:
# Show the held-out indices stored at position 9 of the cache
fold_nine = real_index_cache[9]
print(fold_nine)

[14, 16, 17, 32, 34, 39, 60, 89, 93, 96, 111, 113, 126, 133, 180, 191, 194, 231, 255, 262, 268, 271, 281, 287, 293, 295, 321, 323, 329, 330, 342, 361, 371, 389, 390, 407, 413, 416, 424, 434, 438, 441, 443, 454, 455, 460, 464, 481, 508, 538, 544, 545, 555, 557, 581, 588, 604, 609, 612, 639, 672, 686, 687, 695, 713, 719, 731, 744, 757, 762, 763, 788, 794, 819, 821, 826, 836, 837, 845, 855, 867, 874, 881, 892, 915, 922, 924, 931, 936, 942, 950, 979, 998, 1007, 1038, 1061, 1092, 1100, 1104, 1105, 1111, 1118, 1128, 1132, 1133, 1146, 1150, 1153, 1159, 1221, 1222, 1230, 1235, 1246, 1248, 1257, 1258, 1282, 1305, 1312, 1326, 1329, 1336, 1343, 1346, 1363, 1378, 1381, 1382, 1389, 1401, 1406, 1418, 1425, 1427, 1438, 1439, 1442, 1443, 1452, 1474, 1507, 1510, 1513, 1514, 1521, 1524, 1537, 1543, 1551, 1557, 1558, 1593, 1594, 1602, 1639, 1647, 1649, 1656, 1662, 1669, 1675, 1684, 1704, 1731, 1734, 1755, 1759, 1760, 1764, 1771, 1789, 1792, 1794, 1795, 1847, 1871, 1873, 1888, 1891, 1897, 1902, 1922, 1968

In [27]:
# Number of held-out indices stored at position 8 of the cache
fold_eight_size = len(real_index_cache[8])
print(fold_eight_size)

549
