In [1]:
import numpy as np
import pandas as pd
import os, shutil, re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report 

In [12]:
def bivar_transform(value, standard):
    """
    Collapse a single label into a two-class scheme.

    parms:
    value: input label
    standard: the label that is preserved; every other label becomes 'other'
    (per the original note, standard is one of '1', '2', 'b', 'c', 'm', 'p', 't', 'x')
    return: `standard` when value equals it, otherwise the string 'other'
    """
    return 'other' if value != standard else standard
    
def bivar(data, standard):
    """
    Return a copy of `data` whose last column is reduced to two classes.

    parms:
    data: pandas DataFrame; the last column (by position) holds the labels
    standard: the label that is preserved, all other labels become 'other'
    return: a new DataFrame; `data` itself is left untouched
    """
    data_cp = data.copy()
    if data.shape[0] == 0:
        # Nothing to relabel.
        return data_cp
    labels = data_cp.iloc[:, -1]
    # One positional assignment instead of the chained `iloc[:, -1][i] = ...`:
    # chained indexing may write to a temporary (SettingWithCopyWarning) and
    # `Series[i]` is label-based, so it failed for a non 0..n-1 index.
    # `.to_numpy()` avoids index alignment on assignment.
    data_cp.iloc[:, -1] = labels.where(labels == standard, 'other').to_numpy()
    return data_cp

In [16]:
# Directory the notebook runs from; lopo() joins participant folder names
# ('p1' ... 'p7') onto it.
main_path = os.getcwd()
# File names found under p1; lopo() reads these same names from every
# participant folder and uses them as dictionary keys.
dir_list = os.listdir(os.path.join(main_path,'p1'))
def lopo(p, standard):
    """
    LOPO (leave one participant out) split with two-class labels.

    Reads every file named in the module-level `dir_list` from the participant
    folders under `main_path`, relabels the last column with `bivar`, and
    groups the resulting arrays by file name.

    parms:
    p: the participant that is left out, p in [1,2,3,4,5,6,7]
    standard: the label that is preserved (see bivar_transform)
    return: two dictionaries (train, test) keyed by file name; each train value
            stacks the rows of the six remaining participants in folder order,
            each test value holds the rows of participant p
    raises: ValueError if 'p'+str(p) is not one of 'p1' ... 'p7'
    """
    def _load_labeled(participant, filename):
        # Read one CSV, collapse its labels, and hand back a plain ndarray.
        frame = pd.read_csv(os.path.join(main_path, participant, filename), low_memory=False)
        return np.array(bivar(frame, standard))

    held_out = 'p' + str(p)
    train_participants = ['p' + str(i) for i in range(1, 8)]
    train_participants.remove(held_out)

    train_data = {}
    for filename in dir_list:
        # Stack once per file instead of re-copying the accumulated array
        # after every participant.
        parts = [_load_labeled(participant, filename) for participant in train_participants]
        train_data[filename] = np.concatenate(parts, axis=0)

    test_data = {filename: _load_labeled(held_out, filename) for filename in dir_list}
    return train_data, test_data

In [17]:
def lopo_count(data, standard):
    """
    Count how many samples carry `standard` and how many carry 'other'.

    parms:
    data: could be one of train_data and test_data (dict of 2-D arrays whose
          last column holds the labels)
    standard: same as bivar_transform
    return: tuple contains the number of two labels (standard and 'other')
    """
    standard_count, other_count = 0, 0
    for samples in data.values():
        count = Counter(samples[:, -1])
        # Counter yields 0 for a missing key, so both totals can always be
        # updated. The former `len(count) == 2` test dropped the `standard`
        # samples of any entry that contained only `standard` labels.
        standard_count += count[standard]
        other_count += count['other']
    return standard_count, other_count

In [23]:
"""
这里用 'm' 和 p7 为例
"""
# Example split: participant 7 is held out and 'm' is the preserved label.
train_data, test_data = lopo(p=7, standard='m')
m_count_train, other_count_train = lopo_count(train_data, standard='m')
m_count_test, other_count_test = lopo_count(test_data, standard='m')
print(f"In train_data: {m_count_train} 'm' {other_count_train} 'other' ")
print(f"In test_data: {m_count_test} 'm' {other_count_test} 'other' ")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cp.iloc[:,-1][i] = temp


In train_data: 1111 'm' 34148 'other' 
In test_data: 91 'm' 6581 'other' 


In [27]:
"""
对每个场景单独训练模型一共6个,这里我用了rf和adaboost
"""
def model_dict(data, model: str):
    """
    Fit one classifier per entry of `data`.

    parms:
    data: dict of 2-D arrays; every column but the last is used as features,
          the last column as the label
    model: model in ['rf','adaboost']
    return: dict with the same keys as `data`, each mapped to its fitted classifier
    raises: ValueError if `model` is not one of the supported names (previously
            an unknown name silently produced an empty dict)
    """
    # Factories are lambdas so that a fresh estimator is built for every key.
    factories = {
        # original author's note: max_depth ~ sqrt(n_features=1000)
        'rf': lambda: RandomForestClassifier(max_depth=30, random_state=0),
        'adaboost': lambda: AdaBoostClassifier(n_estimators=50, random_state=0),
    }
    if model not in factories:
        raise ValueError(
            "unknown model %r; expected one of %s" % (model, sorted(factories))
        )
    make_classifier = factories[model]

    fitted = {}
    for key, samples in data.items():
        clf = make_classifier()
        clf.fit(samples[:, :-1], samples[:, -1])
        fitted[key] = clf
    return fitted

In [28]:
# Fit both baseline families on the same training split, one model per scene file.
model_dict_rf = model_dict(data=train_data, model='rf')
model_dict_ada = model_dict(data=train_data, model='adaboost')

In [30]:
"""
rf/adaboost as the baseline and calculate the score for test_data
"""
# Score each per-scene model on the held-out participant's data for that scene.
for i, held_out in test_data.items():
    features, labels = held_out[:, :-1], held_out[:, -1]
    print('score for ' + i)
    print('rf', model_dict_rf[i].score(features, labels))
    print('adaboost', model_dict_ada[i].score(features, labels))
    print("\n")

score for outdoor.csv
rf 0.9576023391812866
adaboost 0.9298245614035088


score for reading.csv
rf 1.0
adaboost 1.0


score for call.csv
rf 0.9546351084812623
adaboost 0.9349112426035503


score for dinner.csv
rf 0.9943342776203966
adaboost 0.9902423670129052


score for game.csv
rf 0.9586206896551724
adaboost 0.9482758620689655


score for TV.csv
rf 1.0
adaboost 1.0




In [31]:
"""
强烈建议试试其他人或者其他标签
"""

'\n强烈建议试试其他人或者其他标签\n'

In [32]:
"""
这里以下到下一个注释是之前写的有点乱。。。
表示的是不做二分类，强行用rf/adaboost对六个场景所有标签直接分类. 效果很差
"""
# Multi-class setting: keep the raw labels and stack p1..p6 per scene file.
main_path = os.getcwd()
dir_list = os.listdir(os.path.join(main_path,'p1'))
main_list = ['p'+str(i) for i in range(1,7)]
original_data = {}
for j in dir_list:
    per_participant = [
        np.array(pd.read_csv(os.path.join(main_path, i, j), low_memory=False))
        for i in main_list
    ]
    # Rows are stacked in participant order.
    original_data[j] = np.concatenate(per_participant, axis=0)

In [33]:
# Held-out participant p7, raw labels, one array per scene file.
original_test_data = {
    i: np.array(pd.read_csv(os.path.join(main_path, 'p7', i), low_memory=False))
    for i in dir_list
}

In [155]:
# One multi-class random forest per scene, trained on the raw labels.
original_model_dict = dict()
# Iterate over original_data directly: the loop used to read a module-level
# name `data` that no cell shown in this notebook defines, so it only worked
# with leftover kernel state.
for i, samples in original_data.items():
    rfc = RandomForestClassifier(max_depth=30, random_state=0)
    rfc.fit(samples[:, :-1], samples[:, -1])
    original_model_dict[i] = rfc

In [172]:
# Per-scene classification report of the multi-class random forests.
for i, temp_data in original_test_data.items():
    print("\n",i)
    y_true = temp_data[:, -1]
    y_pred = original_model_dict[i].predict(temp_data[:, :-1])
    print(classification_report(y_true, y_pred))


 outdoor.csv
              precision    recall  f1-score   support

           1       0.71      0.74      0.72       273
           2       0.28      0.31      0.29       100
           b       0.86      0.87      0.86       284
           m       0.00      0.00      0.00        26
           x       0.00      0.00      0.00         1

    accuracy                           0.70       684
   macro avg       0.37      0.38      0.38       684
weighted avg       0.68      0.70      0.69       684


 reading.csv
              precision    recall  f1-score   support

           1       0.97      0.99      0.98       213
           b       0.86      0.67      0.75        18

    accuracy                           0.97       231
   macro avg       0.91      0.83      0.87       231
weighted avg       0.96      0.97      0.96       231


 call.csv
              precision    recall  f1-score   support

           1       0.91      0.98      0.94       336
           b       0.00      0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.93      0.69      0.79       557
           2       0.39      0.38      0.39       198
           b       0.70      0.94      0.81      1462
           m       0.00      0.00      0.00        18
           p       0.00      0.00      0.00         7
           t       0.73      0.48      0.58       922
           x       0.00      0.00      0.00        13

    accuracy                           0.72      3177
   macro avg       0.39      0.36      0.37      3177
weighted avg       0.72      0.72      0.70      3177


 game.csv
              precision    recall  f1-score   support

           1       0.72      0.82      0.77       201
           2       0.59      0.57      0.58        72
           b       0.90      0.91      0.90       278
           m       0.00      0.00      0.00        24
           t       0.00      0.00      0.00         0
           x       0.00      0.00      0.00         5

    accuracy

In [34]:
"""
每个场景的train/test标签对比情况
"""
# For every scene print the label histogram of the training data, then of the test data.
for i in original_test_data:
    train_counts = Counter(original_data[i][:, -1])
    test_counts = Counter(original_test_data[i][:, -1])
    print(i)
    print(train_counts)
    print(test_counts)
    print("\n")

outdoor.csv
Counter({'b': 791, '1': 768, '2': 746, 'm': 179, 'x': 125, 'c': 82})
Counter({'b': 284, '1': 273, '2': 100, 'm': 26, 'x': 1})


reading.csv
Counter({'1': 1084, 'b': 138, 'x': 5, '2': 1, 'm': 1})
Counter({'1': 213, 'b': 18})


call.csv
Counter({'p': 1112, '1': 973, 'b': 142, 'm': 129, 'c': 13, '2': 5})
Counter({'1': 336, 'p': 125, 'b': 23, 'm': 23})


dinner.csv
Counter({'b': 8191, 't': 5634, '2': 3056, '1': 2236, 'm': 430, 'c': 404, 'x': 323, 'p': 29})
Counter({'b': 1462, 't': 922, '1': 557, '2': 198, 'm': 18, 'x': 13, 'p': 7})


game.csv
Counter({'1': 1905, 'b': 1544, '2': 896, 'm': 343, 't': 281, 'c': 62, 'x': 39})
Counter({'b': 278, '1': 201, '2': 72, 'm': 24, 'x': 5})


TV.csv
Counter({'t': 2726, '2': 284, '1': 245, 'b': 202, 'c': 81, 'm': 29, 'x': 25})
Counter({'t': 1428, '1': 35, 'b': 30})




In [178]:
# One multi-class AdaBoost model per scene, trained on the raw labels.
original_adaboost_dict = dict()
# Iterate over original_data directly: the loop used to read a module-level
# name `data` that no cell shown in this notebook defines.
for i, samples in original_data.items():
    ada = AdaBoostClassifier(n_estimators=100, random_state=0)
    ada.fit(samples[:, :-1], samples[:, -1])
    original_adaboost_dict[i] = ada

In [179]:
# Per-scene classification report of the multi-class AdaBoost models.
for i, temp_data in original_test_data.items():
    print("\n",i)
    y_true = temp_data[:, -1]
    y_pred = original_adaboost_dict[i].predict(temp_data[:, :-1])
    print(classification_report(y_true, y_pred))


 outdoor.csv


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.68      0.61      0.64       273
           2       0.07      0.06      0.07       100
           b       0.76      0.86      0.81       284
           c       0.00      0.00      0.00         0
           m       0.13      0.12      0.12        26
           x       0.00      0.00      0.00         1

    accuracy                           0.61       684
   macro avg       0.27      0.27      0.27       684
weighted avg       0.60      0.61      0.61       684


 reading.csv
              precision    recall  f1-score   support

           1       0.98      0.99      0.98       213
           b       0.87      0.72      0.79        18

    accuracy                           0.97       231
   macro avg       0.92      0.86      0.89       231
weighted avg       0.97      0.97      0.97       231


 call.csv


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.89      0.98      0.93       336
           b       0.00      0.00      0.00        23
           m       0.00      0.00      0.00        23
           p       0.80      0.87      0.83       125

    accuracy                           0.86       507
   macro avg       0.42      0.46      0.44       507
weighted avg       0.78      0.86      0.82       507


 dinner.csv


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.81      0.63      0.71       557
           2       0.20      0.16      0.18       198
           b       0.69      0.86      0.77      1462
           c       0.00      0.00      0.00         0
           m       0.05      0.11      0.07        18
           p       0.00      0.00      0.00         7
           t       0.64      0.50      0.56       922
           x       0.00      0.00      0.00        13

    accuracy                           0.66      3177
   macro avg       0.30      0.28      0.28      3177
weighted avg       0.66      0.66      0.65      3177


 game.csv


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.55      0.32      0.41       201
           2       0.58      0.31      0.40        72
           b       0.85      0.78      0.81       278
           c       0.00      0.00      0.00         0
           m       0.09      0.46      0.15        24
           t       0.00      0.00      0.00         0
           x       0.00      0.00      0.00         5

    accuracy                           0.54       580
   macro avg       0.29      0.27      0.25       580
weighted avg       0.67      0.54      0.59       580


 TV.csv
              precision    recall  f1-score   support

           1       0.81      0.37      0.51        35
           2       0.00      0.00      0.00         0
           b       0.01      0.03      0.02        30
           c       0.00      0.00      0.00         0
           m       0.00      0.00      0.00         0
           t       0.97      0.94      0.96      1428
           x   

  _warn_prf(average, modifier, msg_start, len(result))


In [181]:
"""
这边开始是 cleanlab 洗标签过程
[这部分我昨天做的时候是对6个场景分别训一个模型然后分别洗掉错误标签] 但过程里其实是有点小问题的，他这个算法原理需要做交叉验证然后从之前那个train/test label分布里面我们就看到在某些场景中
其实有些label可能只有很少(甚至只有1个)。所以就意味着不能split,就会出现点bug

[所以可能我觉得可以考虑不分场景先把数据洗一遍]
"""
"""
以下代码都是昨天写的，只针对outdoor.csv这一个场景做了测试
先把train和test各自洗一遍（实际情况是test由于某个label只有一个这个算法报错了，于是没洗test直接做的测试，结果也是很一般 '1'这个类基本区分不出来）
"""
from cleanlab.classification import LearningWithNoisyLabels
# Restrict the label-cleaning experiment to one scene: the stacked training
# array and the held-out array stored under the key 'outdoor.csv'.
temp_data = original_data['outdoor.csv']
temp_test = original_test_data['outdoor.csv']

In [190]:
def temp_convert(value):
    """
    Map a label string to its integer code.

    '1' -> 0, '2' -> 1, 'b' -> 2, 'c' -> 3, 'm' -> 4, 'x' -> 5.
    Any other value yields None.
    """
    # The position in the tuple is the integer code.
    for code, label in enumerate(('1', '2', 'b', 'c', 'm', 'x')):
        if value == label:
            return code
    return None

def temp_transform(data):
    """
    Return a copy of the array `data` whose last column is passed through
    temp_convert element by element; `data` itself is not modified.
    """
    encoded = data.copy()
    # Slicing the copy gives a view, so writes land in `encoded`.
    label_column = encoded[:, -1]
    for row, raw_label in enumerate(data[:, -1]):
        label_column[row] = temp_convert(raw_label)
    return encoded

In [191]:
# Replace the string labels in the last column by the integer codes from temp_convert.
# NOTE(review): this cell rebinds its own inputs, so it is not idempotent.
# Running it a second time feeds the integer codes back into temp_convert,
# which matches only the label strings and therefore returns None for them.
# Re-run the cell that assigns temp_data / temp_test first.
temp_data = temp_transform(temp_data)
temp_test = temp_transform(temp_test)

In [206]:
# Split features / labels and give them concrete numeric dtypes.
# np.float64 replaces the `np.float` alias of the builtin float, which was
# deprecated in NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
temp_data_X = np.array(temp_data[:,:-1],dtype=np.float64)
temp_data_Y = np.array(temp_data[:,-1],dtype=np.int64)

In [267]:
# Same conversion for the held-out arrays.
# np.float64 replaces the `np.float` alias of the builtin float, which was
# deprecated in NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
temp_test_X = np.array(temp_test[:,:-1],dtype=np.float64)
temp_test_Y = np.array(temp_test[:,-1],dtype=np.int64)

In [215]:
# In fact any custom model of your own can be wrapped here.
base_clf = RandomForestClassifier(max_depth=30, random_state=0)
lnl = LearningWithNoisyLabels(clf=base_clf)
lnl.fit(X=temp_data_X, s=temp_data_Y)
# Validate on the real-world (held-out) data.
predicted_test_labels = lnl.predict(temp_test_X)
# print(classification_report(temp_test_Y,predicted_test_labels))

In [214]:
import cleanlab
# Presumably cross-validated predicted probabilities for the training samples
# (going by the function name) -- confirm against the cleanlab documentation.
cv_clf = RandomForestClassifier(max_depth=30, random_state=0)
psx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
    temp_data_X,
    temp_data_Y,
    clf=cv_clf,
)

In [223]:
from cleanlab.pruning import get_noise_indices
# Indices of training samples flagged as label errors; the
# 'normalized_margin' setting orders the label errors.
psx = np.asarray(psx)
ordered_label_errors = get_noise_indices(
    s=temp_data_Y, psx=psx, sorted_index_method='normalized_margin'
)

In [237]:
# Row positions 0..n-1 for the training and held-out feature arrays.
# np.arange builds them directly and, unlike np.array(list(range(0))),
# keeps an integer dtype when an array has no rows.
index_list_data = np.arange(len(temp_data_X))
index_list_test = np.arange(len(temp_test_X))

In [254]:
# Keep-mask over the training rows: False for every index flagged as a label
# error, True otherwise. np.isin replaces the per-row `i in array` scan, and
# .tolist() keeps the result a plain list of bools as before.
cp_index_list_data = (~np.isin(index_list_data, ordered_label_errors)).tolist()

In [260]:
# Retrain on the rows that survived the label cleaning.
clean_X = temp_data_X[cp_index_list_data, :]
clean_Y = temp_data_Y[cp_index_list_data]
rfc = RandomForestClassifier(max_depth=30, random_state=0)
rfc.fit(clean_X, clean_Y)

RandomForestClassifier(max_depth=30, random_state=0)

In [276]:
"""
再给test洗一次(注意这里就报错了，因为有某个label在这个test里只出现一次) 
"""
# Repeat the cleaning procedure on the held-out data.
# NOTE(review): the recorded output of this cell is a ValueError
# ("min_samples_split must be an integer greater than 1 ..."), so none of the
# names assigned below are known to exist afterwards. The author's note
# attributes the failure to a label that occurs only once in this test set.
psx_test = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
    temp_test_X, temp_test_Y, clf=RandomForestClassifier(max_depth=30, random_state=0))
psx_test = np.asarray(psx_test)
ordered_label_errors_test = get_noise_indices(
    s=temp_test_Y,
    psx=psx_test,
    sorted_index_method='normalized_margin', # Orders label errors
 )
# Keep-mask over the test rows: False where the row index was flagged, True otherwise.
cp_index_list_test = []
for i in index_list_test:
    if i in ordered_label_errors_test:
        cp_index_list_test.append(False)
    else:
        cp_index_list_test.append(True)



ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

In [277]:
"""
于是不洗直接测试
可以参考这个链接里面有一个详细的置信学习的代码(一个具体例子)
https://github.com/cgnorthcutt/cleanlab/blob/master/examples/simplifying_confident_learning_tutorial.ipynb
"""
# Evaluate the forest trained on cleaned labels against the uncleaned held-out set.
temp_test_Y_pred = rfc.predict(temp_test_X)
report = classification_report(temp_test_Y, temp_test_Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.71      0.72       273
           1       0.26      0.35      0.30       100
           2       0.87      0.87      0.87       284
           4       0.00      0.00      0.00        26
           5       0.00      0.00      0.00         1

    accuracy                           0.69       684
   macro avg       0.37      0.38      0.38       684
weighted avg       0.69      0.69      0.69       684



  _warn_prf(average, modifier, msg_start, len(result))
