In [None]:
import pandas as pd
import numpy as np
from chinese_calendar import is_workday  # 工作日判断
import matplotlib
import matplotlib.pyplot as plt  # 绘图库
matplotlib.use('TkAgg')
import jieba  # 中文分词
import re  # 正则化
import os  # 读取文件
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split  # 划分数据集
from sklearn.preprocessing import MinMaxScaler  # 归一化处理
from gensim.models import Word2Vec  # 词嵌入
from gensim.models.word2vec import LineSentence
from transformers import BertTokenizer, BertModel
#from wordcloud import WordCloud  # 绘制词云图
import shap
from transformers import AutoTokenizer, AutoModel
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn import Linear, BCEWithLogitsLoss, MSELoss
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
import torch.nn.functional as F
from datetime import datetime

# Debugging aid: make CUDA kernel launches synchronous so errors surface at the failing call.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

pd.set_option('display.max_columns', None)  # show every column when a frame is displayed

df = pd.read_csv('accident_data_new1.csv',encoding='gbk')
# Drop rows that are missing any field the feature engineering below reads.
columns_to_check = ['location_type', 'weather', 'environment_condition', 'vehicle', 'impact', 'death_num', 'injury_num', 'duration_h', 'description']
df = df.dropna(subset=columns_to_check, how='any', axis=0)
# Duration: coerce the hour/minute parts to numbers (unparseable -> NaN, then dropped)
# and combine them into a single duration in minutes.
df['duration_h'] = pd.to_numeric(df['duration_h'], errors='coerce')
df['duration_min'] = pd.to_numeric(df['duration_min'], errors='coerce')
df = df.dropna(subset=['duration_h', 'duration_min'])
df['duration'] = df['duration_h'] * 60 + df['duration_min']
# Remove duration outliers with the 1.5 * IQR rule.
Q1 = df['duration'].quantile(0.25)  # first quartile
Q3 = df['duration'].quantile(0.75)  # third quartile
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# .copy() gives df_cleaned its own data: the boolean filter returns a slice of df, and
# the many column assignments below would otherwise trigger SettingWithCopyWarning
# (and are not guaranteed to behave consistently across pandas versions).
df_cleaned = df[(df['duration'] >= lower_bound) & (df['duration'] <= upper_bound)].copy()
print("原始数据长度:", len(df))
print("去除异常值后的数据长度:", len(df_cleaned))

# Build a full timestamp from the year/month/day columns plus the 'start_time' string
# (':00' is appended so pd.to_timedelta receives an H:M:S-style value).
df_cleaned['date'] = pd.to_datetime(df_cleaned[['year', 'month', 'day']].astype(str).agg('-'.join, axis=1))
df_cleaned['time'] = df_cleaned['start_time'] + ':00'
df_cleaned['time'] = pd.to_timedelta(df_cleaned['time'])
df_cleaned['DateTime'] = df_cleaned['date'] + df_cleaned['time']
# Weekday: 1 when chinese_calendar.is_workday reports a working day, else 0.
df_cleaned['Weekday'] = df_cleaned['DateTime'].apply(is_workday)
df_cleaned['Weekday'] = df_cleaned['Weekday'].astype(int)
# Infrastructure damage: 1 when the description contains '有路产'.
df_cleaned['Infrastructure_damage'] = df_cleaned['description'].str.contains('有路产', case=False).astype(int)
# Injury: 1 when at least one injury is recorded.
df_cleaned['Injury'] = (df_cleaned['injury_num'] > 0).astype(int)
# Death: 1 when at least one death is recorded.
df_cleaned['Death'] = (df_cleaned['death_num'] > 0).astype(int)
# Vehicle_type: 1 when the vehicle text mentions '一型客车' and none of the
# truck-like keywords ('货车', '半挂', '皮卡').
df_cleaned['Vehicle_type'] = (
    df_cleaned['vehicle'].str.contains("一型客车", case=False) &
    ~df_cleaned['vehicle'].str.contains('|'.join(["货车", "半挂", "皮卡"]), case=False)
).astype(int)
# vehicle_involved
def count_one_vehicle(text):
    """Flag whether a vehicle description refers to more than one vehicle.

    Args:
        text: Vehicle description string.

    Returns:
        0 when '一辆' occurs exactly once and the text contains no '与';
        1 in every other case (including when '一辆' is absent).
    """
    # '与' joins two parties in the description, so its presence alone means "multiple".
    if '与' in text:
        return 1
    if text.count('一辆') != 1:
        return 1
    return 0


# Vehicle_involved: 0 for a single-vehicle description, 1 otherwise (see count_one_vehicle).
df_cleaned['Vehicle_involved'] = df_cleaned['vehicle'].apply(count_one_vehicle)
# Pavement_condition: 0 when environment_condition is one of the codes listed below, else 1.
# NOTE(review): the meaning of codes 'A'/'D'/'E'/'F' is not visible here - confirm against the data dictionary.
pavement_normal_conditions = ['A', 'D', 'E', 'F']
df_cleaned['Pavement_condition'] = np.where(df_cleaned['environment_condition'].isin(pavement_normal_conditions), 0, 1)
# Weather_condition: 0 for '晴' (sunny) or '阴' (overcast), 1 for anything else.
df_cleaned['Weather_condition'] = np.where(df_cleaned['weather'].isin(['晴', '阴']), 0, 1)
# Shoulder: 1 when the impact text matches one of the first keyword group
# and none of the second keyword group (which includes the digits 1-4).
df_cleaned['Shoulder'] = (
    df_cleaned['impact'].str.contains('|'.join(["应急车道", "不影响", "不占用", "收费站", "服务区"]), case=False) &
    ~df_cleaned['impact'].str.contains('|'.join(["和", "与", "行车道", "超车道", "第一", "第二", "1", "2", "3", "4"]), case=False)
).astype(int)
# Burning: 1 when the description mentions any fire-related keyword.
df_cleaned['Burning'] = df_cleaned['description'].str.contains('|'.join(['自燃', '燃烧', '火情', '起火']), case=False).astype(int)
# Rollover: 1 when the description contains '侧翻'.
df_cleaned['Rollover'] = df_cleaned['description'].str.contains('侧翻', case=False).astype(int)
# Night_hours: 1 when the start hour is >= 20 or < 6.
df_cleaned['DateTime'] = pd.to_datetime(df_cleaned['DateTime'])
df_cleaned['Night_hours'] = ((df_cleaned['DateTime'].dt.hour >= 20) | (df_cleaned['DateTime'].dt.hour < 6)).astype(int)
# Peak_hours: 1 when the start hour is in [6, 9) or [17, 20).
# '&' binds tighter than '|', so this reads as (morning window) | (evening window).
df_cleaned['Peak_hours'] = ((df_cleaned['DateTime'].dt.hour >= 6) & (df_cleaned['DateTime'].dt.hour < 9) |
                    (df_cleaned['DateTime'].dt.hour >= 17) & (df_cleaned['DateTime'].dt.hour < 20)).astype(int)
# Ramp: 1 when location_type contains the code 'D'.
df_cleaned['Ramp'] = (df_cleaned['location_type'].str.contains('D')).astype(int)
# Drop the raw columns that have been turned into features above (plus unused ones);
# the engineered flags, 'duration' and any column not listed here are kept.
accident_data = df_cleaned.drop(columns=['year', 'month', 'day', 'start_time', 'location_type', 'weather', 'direction', 'environment_condition',
                                         'event_type', 'vehicle', 'accident_type', 'impact_location', 'impact',
                                         'death_num', 'injury_num', 'end_time', 'duration_h', 'duration_min',
                                         'description_early', 'description', 'time', 'date', 'DateTime'])
# # description preprocess (text)
# # 加载自定义词典
# dict_folder = 'dict/'
# # 遍历文件夹中的所有文件
# for filename in os.listdir(dict_folder):
#     if filename.endswith('.txt'):  # 确保只加载.txt文件
#         dict_path = os.path.join(dict_folder, filename)
#         jieba.load_userdict(dict_path)  # 加载每个词典文件

# # 加载停用词列表
# with open('stopwords.txt', 'r', encoding='utf-8') as f:
#     stopwords = set(f.read().splitlines())
# # 去除车牌号
# license_plate_pattern = re.compile(r'[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z]{1}[A-Z]{1}[A-Z0-9]{4,5}[A-Z0-9挂学警港澳]{1}')

# def clean_chinese_text(text):
#     text = text.replace('至', '')  # 删除“至”
#     text = text.replace('接', '')  # 删除“接”
#     text = text.replace('及', '')  # 删除“及”
#     text_without_to = text.replace('冀', '')  # 删除“冀”
#     text_without_license = license_plate_pattern.sub('', text_without_to)  # 删除车牌
#     text_without_numbers_and_letters = re.sub(r'[^\u4e00-\u9fa5]', '', text_without_license)  # 删除桩号

#     tokens = jieba.lcut(text_without_numbers_and_letters, cut_all=False)  # 分词

#     # 去除停用词
#     tokens = [token for token in tokens if token not in stopwords]

#     return " ".join(tokens)

def remove_dates_from_texts(texts):
    """Strip calendar dates from a single description string.

    Two forms are removed:
      * year-first dates: four digits, optional '年'/whitespace, month,
        optional '月'/whitespace, day, then '日' or a non-alphanumeric boundary;
      * month-day dates: month, '月' or whitespace, day, '日'.

    Args:
        texts: One description string (despite the plural name).

    Returns:
        The string with matched dates removed and surrounding whitespace stripped.
    """
    # Python's \b is Unicode-aware and treats CJK characters as word characters, so a
    # date directly adjacent to Chinese text (e.g. '于2023年5月1日发生') has no \b
    # around it and would never match. Explicit ASCII-alphanumeric look-arounds give
    # the intended boundaries while still refusing to cut into longer digit/letter
    # runs such as stake or plate numbers.
    date_pattern = (
        r'(?<![0-9A-Za-z])\d{4}(?:年|\s)?(?:0?[1-9]|1[0-2])(?:月|\s)?'
        r'(?:0?[1-9]|[12][0-9]|3[01])(?:日|(?![0-9A-Za-z]))'
        r'|(?<![0-9A-Za-z])(?:0?[1-9]|1[0-2])(?:月|\s)(?:0?[1-9]|[12][0-9]|3[01])日'
    )
    return re.sub(date_pattern, '', texts).strip()

def remove_1_from_texts(texts):
    """Remove the trailing half of a time range from a description string.

    A match is a hyphen, optional whitespace, then a time written as H:MM / HH:MM
    or H.MM / HH.MM (for example '-10:20', '- 10:20' or '-9.30'). A time that is
    not preceded by '-' is left untouched.

    Args:
        texts: One description string (despite the plural name).

    Returns:
        The string with every such fragment removed and outer whitespace stripped.
    """
    range_tail = re.compile(r'(-\s*(\d{1,2}:\d{2}|\d{1,2}\.\d{2}))')
    without_tail = range_tail.sub('', texts)
    return without_tail.strip()

def process_text_with_times(text):
    """Rewrite every H:M clock time in ``text`` as minutes elapsed since the first one.

    The first time becomes '0min'; each later time becomes '<n>min', where n is the
    whole number of minutes after the first time. A time that is earlier on the clock
    than the first one is assumed to fall on the following day (wraps at 24h).

    Args:
        text: Description string that may contain times such as '10:20'.

    Returns:
        The rewritten string. ``text`` is returned unchanged when it contains no
        time, or when any matched token is not a valid %H:%M time (the error is
        printed, as before).
    """
    pattern = r'(\d+:\d+)'
    matches = re.findall(pattern, text)

    if not matches:
        return text

    try:
        # strptime doubles as validation: tokens such as '25:70' raise ValueError.
        times = [datetime.strptime(match, '%H:%M') for match in matches]
    except ValueError as e:
        print(f"Error processing time: {e}")
        return text

    minutes_per_day = 24 * 60
    base_minutes = times[0].hour * 60 + times[0].minute
    # Modulo handles the cross-midnight case and keeps every offset an int, so the
    # output is consistently '30min' (never '30.0min').
    offsets = iter([
        f"{(t.hour * 60 + t.minute - base_minutes) % minutes_per_day}min"
        for t in times
    ])

    # Substitute in a single pass, consuming one offset per match in order. Keying
    # replacements by the matched string would collapse repeated times and leave
    # later occurrences untouched.
    return re.sub(pattern, lambda _match: next(offsets), text)
# Disabled: jieba-based cleaning of the early description (clean_chinese_text is commented out above).
#accident_data['description_early1'] = accident_data['description_early1'].apply(clean_chinese_text)
##################################################

# Strip dates and time-range tails from the early description text.
# NOTE(review): 'description_early1' is not in columns_to_check, so missing values are not
# dropped beforehand; a NaN here would make re.sub raise TypeError - confirm the column is complete.
accident_data['description_early1'] = accident_data['description_early1'].apply(remove_dates_from_texts)
accident_data['description_early1'] = accident_data['description_early1'].apply(remove_1_from_texts)
#accident_data['description'] = accident_data['description'].apply(process_text_with_times)
# Names of the engineered 0/1 flag columns.
categorical_columns = ['Weekday', 'Infrastructure_damage', 'Injury', 'Death', 'Vehicle_type', 'Vehicle_involved',
                      'Pavement_condition', 'Weather_condition', 'Shoulder', 'Burning', 'Rollover', 'Night_hours',
                      'Peak_hours', 'Ramp']
# Separate the regression target (duration in minutes) from the features.
duration = accident_data.pop('duration')

# Split into train / validation / test: 15% is held out for test, then 15% of the
# remainder for validation. Both splits use a fixed seed.
train_val_data, test_data, train_val_duration, test_duration = train_test_split(accident_data, duration, test_size=0.15, random_state=42, shuffle=True)
train_data, val_data, train_duration, val_duration = train_test_split(train_val_data, train_val_duration, test_size=0.15, random_state=42, shuffle=True)

In [None]:
# Train on log(duration): the model predicts log-minutes, so predictions must be
# mapped back with np.exp before computing metrics in minutes.
# NOTE(review): np.log yields -inf for a duration of 0 - confirm no zero durations survive cleaning.
# NOTE(review): this cell rebinds train/val/test_duration from Series to ndarray, so running it
# a second time fails on `.values`; re-run the split cell first.
train_duration_log = np.log(train_duration.values)
val_duration_log = np.log(val_duration.values)
test_duration_log = np.log(test_duration.values)
train_duration = train_duration_log
val_duration = val_duration_log
test_duration = test_duration_log
# Alternative target scaling (disabled): min-max normalisation fitted on the training targets.
# scaler = MinMaxScaler()
# train_duration_norm = scaler.fit_transform(train_duration.values.reshape(-1, 1))
# val_duration_norm = scaler.transform(val_duration.values.reshape(-1, 1))
# test_duration_norm = scaler.transform(test_duration.values.reshape(-1, 1))
# train_duration = train_duration_norm.squeeze()
# val_duration = val_duration_norm.squeeze()
# test_duration = test_duration_norm.squeeze()

In [None]:
# Pull the description text out of each split; .pop removes the column from the frame
# in place, so this cell cannot be re-run without re-creating the splits.
train_data_text = train_data.pop("description_early1")
val_data_text = val_data.pop("description_early1")
test_data_text = test_data.pop("description_early1")

In [None]:
class AccidentsDataset(Dataset):
    """Dataset that pairs each accident description with its duration target.

    Args:
        accident_descriptions: Iterable of description strings.
        durations: Iterable of numeric targets, in the same order as the descriptions.
        tokenizer: Callable tokenizer; it is invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors='pt'`` and must return a mapping with
            'input_ids' and 'attention_mask' whose first axis has length 1.
        max_length: Sequence length every sample is padded/truncated to.
    """

    def __init__(self, accident_descriptions, durations, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Copy into plain lists so __getitem__ indexes by position, even when a
        # pandas Series with a shuffled index is passed in.
        self.accident_descriptions = [text for text in accident_descriptions]
        self.durations = [value for value in durations]

    def __len__(self):
        """Number of samples."""
        return len(self.accident_descriptions)

    def __getitem__(self, index):
        """Tokenise sample ``index`` and return model inputs plus the target.

        Returns:
            dict with 'input_ids' and 'attention_mask' (first row of the tokenizer
            output) and 'target_duration' (float tensor of shape (1,)).
        """
        encoded = self.tokenizer(
            self.accident_descriptions[index],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer returns a leading batch axis of size 1; take row 0.
        sample = {key: encoded[key][0] for key in ('input_ids', 'attention_mask')}
        sample['target_duration'] = torch.tensor([self.durations[index]], dtype=torch.float)
        return sample

In [None]:
# Load the tokenizer for 'miniRBT' - the same identifier BertDurationRegressor uses for
# the encoder weights. NOTE(review): presumably a local directory; confirm it exists next to the notebook.
tokenizer = BertTokenizer.from_pretrained('miniRBT')

In [None]:
# Wrap each split's description text and (log-transformed) duration target in a dataset;
# every sample is padded/truncated to 128 tokens.
train_dataset = AccidentsDataset(train_data_text, train_duration, tokenizer, max_length=128)
val_dataset = AccidentsDataset(val_data_text, val_duration, tokenizer, max_length=128)
test_dataset = AccidentsDataset(test_data_text, test_duration, tokenizer, max_length=128)

In [None]:
batch_size = 16
# Training: shuffle each epoch; the trailing partial batch is dropped to keep batch size uniform.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluation: keep every sample. With drop_last=True up to batch_size - 1 validation/test
# samples would be silently excluded from the reported losses and metrics.
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

In [None]:
class BertDurationRegressor(nn.Module):
    """BERT encoder ('miniRBT') followed by a small MLP regression head.

    The first-token ([CLS] position) hidden state is projected to ``dense_size``
    features, passed through ReLU, and fed to a two-layer head.

    Args:
        out_features: Width of the final linear layer (number of regression outputs).
            Defaults to 1, which keeps the parameter shapes of existing checkpoints.
    """

    def __init__(self,  out_features=1):
        super().__init__()
        self.dense_size = dense_size = 128
        self.dropout_rate = dropout_rate = 0.1
        # Kept for the (currently disabled) mean-pooling variant in forward().
        self.dropout = nn.Dropout(dropout_rate)
        self.bert = BertModel.from_pretrained('miniRBT')
        # Take the encoder width from the loaded config instead of hard-coding it, so a
        # checkpoint with a different hidden size fails here rather than deep inside forward().
        self.bert_hidden_dim = bert_hidden_dim = self.bert.config.hidden_size
        # Fine-tune the whole encoder (explicit, although parameters are trainable by default).
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc1 = nn.Linear(in_features=bert_hidden_dim,
                             out_features=dense_size
                             )
        self.regression_layer = nn.Sequential(
            nn.Linear(dense_size, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            # Honour out_features (it was previously accepted but ignored).
            nn.Linear(64, out_features),
        )


    def forward(self, input_ids, attention_mask):
        """Return predictions of shape (batch, out_features).

        Args:
            input_ids: Token id tensor passed to the BERT encoder.
            attention_mask: Mask tensor passed to the BERT encoder.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 as the sentence representation.
        cls_token_output = outputs.last_hidden_state[:, 0, :]
        text_features = torch.relu(self.fc1(cls_token_output))
        out = self.regression_layer(text_features)
        return out

In [None]:
# class BertTextRegression(nn.Module):
#     def __init__(self, hidden_size, output_size, dropout=0.1):
#         super().__init__()
#         self.output_size = output_size
#         self.dropout = dropout
#         #self.dense_size = categorical_feature_size

#         # Use pre-trained BERT model
#         self.bert = BertModel.from_pretrained('miniRBT', output_hidden_states=True, output_attentions=True)
#         for param in self.bert.parameters():
#             param.requires_grad = True
#         self.weights = nn.Parameter(torch.rand(13, 1))
#         self.dropout = nn.Dropout(dropout)
#         self.fc1 = nn.Linear(hidden_size, output_size)
#         #self.fc2 = nn.Linear(dense_size + categorical_feature_size, output_size)

#     def forward(self, input_ids):
#         all_hidden_states, all_attentions = self.bert(input_ids)[-2:]
#         batch_size = input_ids.shape[0]
#         ht_cls = torch.cat(all_hidden_states)[:, :1, :].view(13, batch_size, 1, 768)
#         atten = torch.sum(ht_cls * self.weights.view(13, 1, 1, 1), dim=[1, 3])
#         atten = F.softmax(atten.view(-1), dim=0)
#         feature = torch.sum(ht_cls * atten.view(13, 1, 1, 1), dim=[0, 2])
#         out = self.fc1(self.dropout(feature))

#         return out

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move the
# freshly constructed model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertDurationRegressor()
model.to(device)

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingLR 

In [None]:
# Optimisation setup: MSE loss on the (log) targets, Adam with weight decay, and a
# cosine learning-rate schedule that anneals from lr to eta_min over `epochs` steps.
criterion = nn.MSELoss()
lr = 0.0002
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
#optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
epochs = 20
# Early-stopping settings; only referenced by the commented-out block in the training loop.
patience = 5
no_improvement_count = 0
scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)

In [None]:
def train_epoch(model, data_loader, optimizer, criterion):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'target_duration' tensors.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, targets)``.

    Returns:
        Mean per-sample loss over the samples actually processed
        (``nan`` when the loader yields no batches).
    """
    model.train()
    # Follow the model's own device instead of depending on a notebook-level global.
    device = next(model.parameters()).device
    total_loss = 0.0
    n_samples = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target_duration'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_n = input_ids.size(0)
        total_loss += loss.item() * batch_n
        n_samples += batch_n
    # Divide by the samples seen, not len(dataset): with drop_last=True the loader
    # skips the trailing partial batch and the dataset length would understate the loss.
    return total_loss / n_samples if n_samples else float('nan')
def val_epoch(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader`` without updating weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'target_duration' tensors.
        criterion: Loss called as ``criterion(outputs, targets)``.

    Returns:
        Mean per-sample loss over the samples actually processed
        (``nan`` when the loader yields no batches).
    """
    model.eval()
    # Follow the model's own device instead of depending on a notebook-level global.
    device = next(model.parameters()).device
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target_duration'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)
            batch_n = input_ids.size(0)
            total_loss += loss.item() * batch_n
            n_samples += batch_n
    # Divide by the samples seen, not len(dataset), so a loader that drops the last
    # partial batch does not understate the loss.
    return total_loss / n_samples if n_samples else float('nan')

In [None]:
# Train for `epochs` epochs, recording the train/validation loss of each epoch and
# stepping the LR scheduler once per epoch.
best_val_loss = float('inf')
train_losses = []
val_losses = []

for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss = val_epoch(model, val_loader, criterion)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    scheduler.step()
    # The checkpoint is overwritten every epoch, so the file ends up holding the
    # LAST epoch's weights, not the best validation epoch.
    # NOTE(review): best_val_loss / patience are unused while the block below stays disabled.
    torch.save(model.state_dict(), 'MiniRBT_initialmsg_shap.pth')
    # Disabled: keep-best checkpointing with early stopping.
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         torch.save(model.state_dict(), 'miniRBT_regressor_20epoch.pth')
#         no_improvement_count = 0
#     else:
#         no_improvement_count += 1
#         if no_improvement_count >= patience:
#             print(f'Early stopping triggered at epoch {epoch}. No improvement in validation loss for {patience} epochs.')
#             break
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Reload the checkpoint written by the training loop and collect predictions and
# targets for every batch of the test loader (both are on the training target scale).
print('----------loading model---------------')
# NOTE(review): no map_location is passed, so a checkpoint saved on GPU needs a GPU to load.
model.load_state_dict(torch.load('MiniRBT_initialmsg_shap.pth'))
model.eval()
test_preds = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target_duration'].to(device)
        outputs = model(inputs, attention_mask)
        # extend() iterates the batch axis, appending one small array per sample.
        test_preds.extend(outputs.cpu().detach().numpy())
        test_labels.extend(targets.cpu().detach().numpy())


def calculate_metrics(predictions, targets):
    """Compute RMSE, MAE and MAPE between predictions and targets.

    Both inputs are converted to flat float arrays first, so lists work and a
    (n,) vs (n, 1) mismatch cannot silently broadcast to an n x n matrix.

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of ground-truth values with the same number of elements.

    Returns:
        Tuple ``(rmse, mae, mape)``; ``mape`` is a percentage computed over the
        non-zero targets only, and is ``nan`` when every target is zero.
    """
    predictions = np.asarray(predictions, dtype=float).ravel()
    targets = np.asarray(targets, dtype=float).ravel()
    errors = targets - predictions
    rmse = float(np.sqrt(np.mean(errors ** 2)))
    mae = float(np.mean(np.abs(errors)))
    # Percentage error is undefined for a zero target; skip those samples instead of
    # letting a division by zero turn the whole metric into inf/nan.
    nonzero = targets != 0
    if nonzero.any():
        mape = float(np.mean(np.abs(errors[nonzero] / targets[nonzero])) * 100)
    else:
        mape = float('nan')
    return rmse, mae, mape


# Flatten the per-sample arrays collected during inference into 1-D vectors.
test_preds = np.concatenate(test_preds, axis=0)
test_labels = np.concatenate(test_labels, axis=0)
# The targets were transformed with np.log before training, so np.exp maps predictions
# and labels back to minutes. (The MinMaxScaler variant is commented out, which leaves
# `scaler` undefined - calling scaler.inverse_transform here raised NameError.)
predictions_original_scale = np.exp(test_preds)
actuals_original_scale = np.exp(test_labels)

# Report the metrics on the original (minutes) scale.
rmse, mae, mape = calculate_metrics(predictions_original_scale, actuals_original_scale)
print(f"Test Set Metrics: RMSE = {rmse:.4f}, MAE = {mae:.4f}, MAPE = {mape:.4f}%")

In [None]:
# Plain Python lists of the description strings, as input for SHAP and the LightGBM baseline.
train_text = train_data_text.tolist()
test_text = test_data_text.tolist()

In [None]:
# Re-create the model with the same constructor arguments used when the checkpoint was saved,
# and move it to the target device.
model = BertDurationRegressor()
model.to(device)

# Load the saved state dict, remapped onto the current device.
# NOTE(review): this file name differs from the checkpoint written by the training loop
# ('MiniRBT_initialmsg_shap.pth') - confirm which weights are meant to be explained.
# NOTE(review): model.eval() is not called here, so dropout stays active unless the caller disables it.
model_path = "MiniRBT_20epoch_text_fullmsg.pth"
#model.load_state_dict(torch.load(model_path))
model.load_state_dict(torch.load(model_path, map_location=device))

In [None]:
# def f(x):
#     #print(x)
#     tv = torch.tensor(
#         [
#             tokenizer.encode(v, padding="max_length", max_length=128, truncation=True)
#             for v in x
#         ]
#     ).to(device)
#     attention_mask = (tv != 0).type(torch.int64).to(device)
#     #print(tv)
#     #print(attention_mask)
#     outputs = model(tv, attention_mask=attention_mask).detach().cpu().numpy()
#     #outputs = outputs.squeeze()
#     print(outputs)
#     return outputs

In [None]:
def f(x):
    """Prediction function handed to shap.Explainer: texts -> predicted durations.

    Args:
        x: Iterable of description strings (SHAP passes masked variants of the input).

    Returns:
        numpy array of shape (len(x), 1) with predictions mapped back from the
        log scale via np.exp.
    """
    token_ids = torch.tensor(
        [
            tokenizer.encode(v, padding="max_length", max_length=128, truncation=True)
            for v in x
        ]
    ).to(device)
    # Attend to every position that is not padding.
    attention_mask = (token_ids != tokenizer.pad_token_id).type(torch.int64)
    # Explanations must come from deterministic predictions: switch dropout off and
    # skip gradient tracking (SHAP calls this function many times).
    model.eval()
    with torch.no_grad():
        log_durations = model(token_ids, attention_mask=attention_mask).cpu().numpy()
    # The notebook trains on np.log(duration), so np.exp is the matching inverse; the
    # MinMaxScaler code is commented out, which left `scaler` undefined here.
    # NOTE(review): confirm the loaded checkpoint was trained on log targets as well.
    return np.exp(log_durations)

In [None]:
# Explain the BERT regressor on the first 100 training texts; the tokenizer is passed
# as the masker so attributions are computed per token.
explainer = shap.Explainer(f, tokenizer)
shap_values = explainer(train_text[:100], fixed_context=1, batch_size = 1)

In [None]:
# Token-level attribution plot for the explained text at index 9.
shap.plots.text(shap_values[9])

In [None]:
# Shape of the SHAP explanation object computed above.
print(shap_values.shape) 

In [None]:
# `shap_values_array` is never defined in this notebook (NameError on a fresh kernel);
# inspect the raw value array of the explanation object instead.
shap_values.values.shape

In [None]:
from collections import defaultdict
texts = train_text[:100]  # the texts that were explained above
# Running SHAP total and occurrence count per token.
char_contribution = defaultdict(float)
char_count = defaultdict(int)

# Pair every SHAP value with the token it was computed for. Indexing the raw text by
# character position is not aligned with the value array: the tokenizer adds special
# tokens and may merge several characters (digits, Latin letters) into one token.
# A distinct loop variable is used so the global `shap_values` explanation object is
# not overwritten while iterating.
for shap_exp in shap_values:
    for token, value in zip(shap_exp.data, shap_exp.values):
        token = str(token).strip()
        if not token:
            # Skip tokens with an empty string (e.g. masked special tokens).
            continue
        # `value` is a scalar or a length-1 array for this single-output model.
        char_contribution[token] += float(np.sum(value))
        char_count[token] += 1

# Mean SHAP contribution per token.
average_contribution = {char: char_contribution[char] / char_count[char] for char in char_contribution}

print(average_contribution)

In [None]:
# 将 average_contribution 的值转换为浮点数，并对其进行排序
# Make sure every average contribution is a plain float so the values sort and format cleanly.
average_contribution = {char: float(contrib) for char, contrib in average_contribution.items()}

# Sort by contribution, largest first, and keep the top 50 entries.
top_chars = sorted(average_contribution.items(), key=lambda item: item[1], reverse=True)[:50]

# Print those top 50 entries with their mean contribution.
for char, avg_contrib in top_chars:
    print(f"字符: {char}, 平均贡献: {avg_contrib:.4f}")

In [None]:
import lightgbm as lgb

In [None]:
# Encode every training description as a fixed-length (128) list of token ids;
# these id vectors are the feature rows for the LightGBM baseline.
encoded_texts = [
    tokenizer.encode(text, padding="max_length", max_length=128, truncation=True)
    for text in train_text
]

In [None]:
# Inspect the token ids of the first encoded training text.
encoded_texts[0]

In [None]:
# Baseline: gradient-boosted trees fitted on the padded token-id vectors, with the
# same (log-transformed) duration targets used for the BERT model.
# NOTE(review): token ids are used directly as numeric features - confirm this is intended.
lgb_model = lgb.LGBMRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
lgb_model.fit(encoded_texts, train_duration)

In [None]:
def f(x):
    """Prediction function for SHAP backed by the LightGBM baseline.

    This definition replaces the earlier BERT-based ``f`` for every cell below it.

    Args:
        x: Iterable of description strings.

    Returns:
        Array of ``lgb_model`` predictions, one per input text, on the scale the
        model was fitted on (no inverse transform is applied here).
    """
    token_matrix = np.array([
        tokenizer.encode(v, padding="max_length", max_length=128, truncation=True)
        for v in x
    ])
    return lgb_model.predict(token_matrix)

In [None]:
# Rebuild the explainer around the current `f` (the LightGBM wrapper at this point),
# again using the tokenizer as the text masker.
explainer = shap.Explainer(f, tokenizer)

In [None]:
# Explain a single hand-written accident description; this overwrites `shap_values`.
shap_values = explainer(['青银高速绥定段山西方向子洲至绥德之间K1118+090处发生交通事故一辆六轴半挂货车撞护栏（装载货物：石料）无人员伤亡有路产损失占用行车道和应急车道'], fixed_context=1)

In [None]:
# Token-level attribution plot for that single description.
shap.plots.text(shap_values[0])

In [None]:
# Save the text plot as standalone HTML. The context manager guarantees the file is
# flushed and closed (the previous `file.close` lacked parentheses and never ran), and
# the explicit encoding keeps the Chinese text independent of the platform default.
with open('temp.html', 'w', encoding='utf-8') as file:
    file.write(shap.plots.text(shap_values[0], display=False))

In [None]:
# Explain the first 100 training texts with the current explainer; overwrites `shap_values`.
shap_values = explainer(train_text[:100], fixed_context=1)

In [None]:
# Show the raw training text at index 96 (plotted in the next cell).
train_text[96]

In [None]:
# Token-level attribution plot for the explained text at index 96.
shap.plots.text(shap_values[96])

In [None]:
# Configure matplotlib so Chinese labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font for CJK glyphs
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with this font

In [None]:
# Bar chart summarising the SHAP values across all explained texts.
shap.plots.bar(shap_values[0:])