In [130]:
import pandas as pd
import numpy as np
from chinese_calendar import is_workday  # 工作日判断
import matplotlib
import matplotlib.pyplot as plt  # 绘图库
matplotlib.use('TkAgg')
import jieba  # 中文分词
import re  # 正则化
import os  # 读取文件
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split  # 划分数据集
from sklearn.preprocessing import MinMaxScaler  # 归一化处理
from gensim.models import Word2Vec  # 词嵌入
from gensim.models.word2vec import LineSentence
from transformers import BertTokenizer
#from wordcloud import WordCloud  # 绘制词云图
import shap
from transformers import AutoTokenizer, AutoModel
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn import Linear, BCEWithLogitsLoss, MSELoss
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'



pd.set_option('display.max_columns', None)  # 显示全部列

df = pd.read_csv('accident_data_new1.csv',encoding='gbk')
# 删除多列缺失值
columns_to_check = ['location_type', 'weather', 'environment_condition', 'vehicle', 'impact', 'death_num', 'injury_num', 'duration_h', 'description']
df = df.dropna(subset=columns_to_check, how='any', axis=0)
# duration conversion
df['duration_h'] = pd.to_numeric(df['duration_h'], errors='coerce')
df['duration_min'] = pd.to_numeric(df['duration_min'], errors='coerce')
df = df.dropna(subset=['duration_h', 'duration_min'])
df['duration'] = df['duration_h'] * 60 + df['duration_min']
# duration outliers delete
Q1 = df['duration'].quantile(0.25)  # 第一四分位数
Q3 = df['duration'].quantile(0.75)  # 第三四分位数
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 0.5 * IQR

df_cleaned = df[(df['duration'] >= lower_bound) & (df['duration'] <= upper_bound)]
print("原始数据长度:", len(df))
print("去除异常值后的数据长度:", len(df_cleaned))

# time conversion
df_cleaned['date'] = pd.to_datetime(df_cleaned[['year', 'month', 'day']].astype(str).agg('-'.join, axis=1))
df_cleaned['time'] = df_cleaned['start_time'] + ':00'
df_cleaned['time'] = pd.to_timedelta(df_cleaned['time'])
df_cleaned['DateTime'] = df_cleaned['date'] + df_cleaned['time']
# weekday
df_cleaned['Weekday'] = df_cleaned['DateTime'].apply(is_workday)
df_cleaned['Weekday'] = df_cleaned['Weekday'].astype(int)
# infrastructure damage
df_cleaned['Infrastructure_damage'] = df_cleaned['description'].str.contains('有路产', case=False).astype(int)
# injury
df_cleaned['Injury'] = (df_cleaned['injury_num'] > 0).astype(int)
# death
df_cleaned['Death'] = (df_cleaned['death_num'] > 0).astype(int)
# vehicle_type
df_cleaned['Vehicle_type'] = (
    df_cleaned['vehicle'].str.contains("一型客车", case=False) &
    ~df_cleaned['vehicle'].str.contains('|'.join(["货车", "半挂", "皮卡"]), case=False)
).astype(int)
# vehicle_involved
def count_one_vehicle(text):
    count_one = text.count('一辆')
    has_and = '与' in text
    return 0 if count_one == 1 and not has_and else 1


df_cleaned['Vehicle_involved'] = df_cleaned['vehicle'].apply(count_one_vehicle)
# Pavement_condition
pavement_normal_conditions = ['A', 'D', 'E', 'F']
df_cleaned['Pavement_condition'] = np.where(df_cleaned['environment_condition'].isin(pavement_normal_conditions), 0, 1)
# Weather_condition
df_cleaned['Weather_condition'] = np.where(df_cleaned['weather'].isin(['晴', '阴']), 0, 1)
# Shoulder
df_cleaned['Shoulder'] = (
    df_cleaned['impact'].str.contains('|'.join(["应急车道", "不影响", "不占用", "收费站", "服务区"]), case=False) &
    ~df_cleaned['impact'].str.contains('|'.join(["和", "与", "行车道", "超车道", "第一", "第二", "1", "2", "3", "4"]), case=False)
).astype(int)
# Burning
df_cleaned['Burning'] = df_cleaned['description'].str.contains('|'.join(['自燃', '燃烧', '火情', '起火']), case=False).astype(int)
# Rollover
df_cleaned['Rollover'] = df_cleaned['description'].str.contains('侧翻', case=False).astype(int)
# Night_hours
df_cleaned['DateTime'] = pd.to_datetime(df_cleaned['DateTime'])
df_cleaned['Night_hours'] = ((df_cleaned['DateTime'].dt.hour >= 20) | (df_cleaned['DateTime'].dt.hour < 6)).astype(int)
# Peak_hours
df_cleaned['Peak_hours'] = ((df_cleaned['DateTime'].dt.hour >= 6) & (df_cleaned['DateTime'].dt.hour < 9) |
                    (df_cleaned['DateTime'].dt.hour >= 17) & (df_cleaned['DateTime'].dt.hour < 20)).astype(int)
# Ramp
df_cleaned['Ramp'] = (df_cleaned['location_type'].str.contains('D')).astype(int)
# drop unrelated columns
accident_data = df_cleaned.drop(columns=['year', 'month', 'day', 'start_time', 'location_type', 'weather', 'direction', 'environment_condition',
                                         'event_type', 'vehicle', 'accident_type', 'impact_location', 'impact',
                                         'death_num', 'injury_num', 'end_time', 'duration_h', 'duration_min',
                                         'description', 'description_early', 'time', 'date', 'DateTime'])
# description preprocess (text)
# 加载自定义词典
dict_folder = 'dict/'
# 遍历文件夹中的所有文件
for filename in os.listdir(dict_folder):
    if filename.endswith('.txt'):  # 确保只加载.txt文件
        dict_path = os.path.join(dict_folder, filename)
        jieba.load_userdict(dict_path)  # 加载每个词典文件

# 加载停用词列表
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())
# 去除车牌号
license_plate_pattern = re.compile(r'[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z]{1}[A-Z]{1}[A-Z0-9]{4,5}[A-Z0-9挂学警港澳]{1}')

def clean_chinese_text(text):
    text = text.replace('至', '')  # 删除“至”
    text = text.replace('接', '')  # 删除“接”
    text = text.replace('及', '')  # 删除“及”
    text_without_to = text.replace('冀', '')  # 删除“冀”
    text_without_license = license_plate_pattern.sub('', text_without_to)  # 删除车牌
    text_without_numbers_and_letters = re.sub(r'[^\u4e00-\u9fa5]', '', text_without_license)  # 删除桩号

    tokens = jieba.lcut(text_without_numbers_and_letters, cut_all=False)  # 分词

    # 去除停用词
    tokens = [token for token in tokens if token not in stopwords]

    return " ".join(tokens)
#accident_data['description_early1'] = accident_data['description_early1'].apply(clean_chinese_text)
with open('all_sentences.txt', 'w', encoding='utf-8') as f:
    for item in accident_data['description_early1']:
        f.write(item + '\n')
##################################################
categorical_columns = ['Weekday', 'Infrastructure_damage', 'Injury', 'Death', 'Vehicle_type', 'Vehicle_involved',
                      'Pavement_condition', 'Weather_condition', 'Shoulder', 'Burning', 'Rollover', 'Night_hours',
                      'Peak_hours', 'Ramp']
duration = accident_data.pop('duration')

# 划分训练集、验证集与测试集
train_val_data, test_data, train_val_duration, test_duration = train_test_split(accident_data, duration, test_size=0.15, random_state=42, shuffle=True)
train_data, val_data, train_duration, val_duration = train_test_split(train_val_data, train_val_duration, test_size=0.15, random_state=42, shuffle=True)

原始数据长度: 11211
去除异常值后的数据长度: 9246


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['date'] = pd.to_datetime(df_cleaned[['year', 'month', 'day']].astype(str).agg('-'.join, axis=1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['time'] = df_cleaned['start_time'] + ':00'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['time'] = pd.to_timedelta(df_cle

In [131]:
train_text = train_data["description_early1"].tolist()
test_text = test_data["description_early1"]

In [132]:
train_text

['日14:55青银高速绥吴段吴堡方向吴堡至辛家沟之间K1059＋950处发生交通事故(一辆二轴货车与一辆小轿车发生碰撞,造成1人受轻伤,有路产损失),占用应急车道',
 '17:52西安外环高速上行方向太平至店张之间马庄北立交K67+450处一辆牌照为陕AAH550的两轴货车（运输蔬菜）撞中央隔离带，无人员伤亡，有路产损失，事故占用第一车道，交警、路政已在现场处理',
 '23:08榆佳高速佳县方向阳宽草湾停车区至王家砭收费站之间K36处发生交通事故,一辆六轴货车轮胎着火，无人员受伤，占用应急车道,王家砭收费站入口封闭',
 '日11:15机场专用高速汉城方向机场至马家堡之间K13+600处一辆一型客车撞护栏,无人员伤亡,有路产损失,占用应急车道',
 '17:05报送：榆佳高速佳县方向王家贬收费站附近K44处发生交通事故（一辆六轴货车与一辆一型客车剐蹭，无人员伤亡，无路产损失），不影响道路通行',
 '19:19西安外环高速外环方向王莽立交转西镇高速西安方向匝道K26+270处发生交通事故，一辆二轴货车撞护栏，无人员伤亡，有路产损失，占用一个行车道. 19:35交通事故已处理完毕，道路恢复正常通行',
 '18:40报送：合铜高速宝鸡方向白水北至冯原之间K620处发生交通事故，一辆一型客车和一辆二轴货车追尾，无人员受伤，占用超车道、行车道',
 '09:25报送：沪陕高速商界段西安方向金丝峡至丹凤之间K1339+080处发生交通事故（一辆一型客车撞护栏，无人员受伤，有路产损失',
 '19:49连霍高速西潼段K3+430潼关方向（渭南东至渭南西）发生2车追尾交通事故，陕AX3112绿色5轴板车（拉运挖机），后车二轴陕U177U9箱式货车（拉运水果）被困司机已救出，被120已拉走，经120确认，腿部受伤，无生命危险，现场已设置警示标志，一车道正常通行，路产损失待确认',
 '19:17路段上报，连霍高速西宝段宝鸡方向武功之杨凌之间K1136+450处发生交通事故(两辆二轴货车剐蹭,无人员伤亡,无路产损失),占用两个行车道和应急车道',
 '日18:40西安外环高速渭南方向太平至泾阳北之间下行K61+800处发生交通事故(一辆小轿车撞护栏,无人员伤亡,有路产损失),占用超车道,不影响通行',
 '11:25京昆高速西禹段西安方向荆姚至富平之间K1009+000m处（

In [133]:
from transformers import AutoTokenizer, AutoModel, BertTokenizer

In [134]:
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

In [135]:
tokenizer

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [136]:
import lightgbm as lgb

In [137]:
encoded_texts = []

for text in train_text:
    encoded_text = tokenizer.encode(
        text, 
        padding="max_length", 
        max_length=500, 
        truncation=True
    )
    encoded_texts.append(encoded_text)

In [138]:
model = lgb.LGBMRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
model.fit(encoded_texts, train_duration)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23625
[LightGBM] [Info] Number of data points in the train set: 6680, number of used features: 174
[LightGBM] [Info] Start training from score 68.896856


In [139]:
def f(x):
    print(x)
    encoded_data = [
    tokenizer.encode(v, padding="max_length", max_length=500, truncation=True)
    for v in x
]
    #predict = model.predict(np.array(encoded_data))
    predict = model(np.array(encoded_data))
    #print(type(predict))
    return predict

In [140]:
import shap

In [141]:
explainer = shap.Explainer(f, tokenizer)

In [142]:
shap_values = explainer(train_text[:10], fixed_context=1)

['[MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]']


TypeError: 'LGBMRegressor' object is not callable

In [110]:
shap.plots.text(shap_values[1])

In [78]:
file = open('temp.html','w')
file.write(shap.plots.text(shap_values[1], display=False))
file.close

<function TextIOWrapper.close()>