### 依赖区

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re
import jieba
import jieba.posseg as pseg
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

### 1.数据清洗 ———— 删除空值、重复值、去除无意义标点符号

In [2]:
# 读取CSV文件
df = pd.read_csv('1.csv')#需要进行情感分析的聊天记录文档

# 删除具有缺失值的行
df = df.dropna()

In [3]:
# 本地 csv 文档路径
csv_path = '1.csv'
# 待分词的 csv 文件中的列
document_column = 'review'
label_column = 'label'
pattern = u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!\t"@#$%^&*\\-_=+，。\n《》、？：；“”‘’｛｝【】（）…￥！—┄－]+'


df = pd.read_csv(csv_path, encoding='utf-8')

# 仅保留 "label" 和 "review" 两列
df = df[[label_column, document_column]]

# 删除 review 列缺失的行
df = df[pd.notna(df[document_column])]

df = df.drop_duplicates()
df = df.rename(columns={
    document_column: 'text'
})
df = df.rename(columns={
    label_column: 'label'
})
df['cut'] = df['text'].apply(lambda x: str(x))
df['cut'] = df['cut'].apply(lambda x: re.sub(pattern, ' ', x))
df['cut'] = df['cut'].apply(lambda x: " ".join(jieba.lcut(x)))
print(df['cut'])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86182\AppData\Local\Temp\jieba.cache
Loading model cost 0.446 seconds.
Prefix dict has been built successfully.


0       ﻿ 更博 了   爆照 了   帅 的 呀   就是 越来越 爱 你   生快 傻 缺   ...
1         张晓鹏 jonathan   土耳其 的 事要 认真对待   哈哈   否则 直接 开除...
2       姑娘 都 羡慕 你 呢   还有 招财猫 高兴   爱 在 蔓延   JC   哈哈   小...
3                                               美   爱 你  
4                            梦想 有 多 大   舞台 就 有 多 大   鼓掌  
                              ...                        
1124    一 公里 不到   县 医院 那个 天桥 下右 拐   米 就 到 了   谢礼 恒   我...
1125    今天 真冷 啊   难道 又 要 穿 棉袄 了   晕   今年 的 春天 真的 是 百变 ...
1126                                最近 几天 就 没 停止 过   伤心  
1127                                    毒药 女流氓   怒   很惨  
1128    呢   杰   Kelena   抓狂   搞 乜 鬼   想知   入去 GOtrip 睇...
Name: cut, Length: 1129, dtype: object


### 删去停用词

In [4]:
from nltk.stem import PorterStemmer

# 读取停用词表
def load_stopwords(stopwords_file):
    stopwords = set()
    with open(stopwords_file, 'r', encoding='utf-8') as f:
        for line in f:
            stopwords.add(line.strip())
    return stopwords

# 分词结果
cut_words = df['cut'].tolist()

# 加载停用词表
stopwords = load_stopwords("baidu_stopwords.txt")

# 初始化词干提取器
stemmer = PorterStemmer()

# 去除停用词并进行词干提取
filtered_words = []
for words in cut_words:
    filtered_words.append([stemmer.stem(word) for word in words.split() if word not in stopwords])

# 将去除停用词并进行词干提取后的结果更新到DataFrame中
df['filtered_cut_stemmed'] = filtered_words

df['cut1'] = [' '.join(words) for words in df['filtered_cut_stemmed']]

# 输出结果
print(df['cut1'])

0                            ﻿ 更博 爆照 帅 越来越 爱 生快 傻 缺 爱 爱 爱
1        张晓鹏 jonathan 土耳其 事要 认真对待 开除 丁丁 看 世界 很 细心 酒店 都 ok
2       姑娘 都 羡慕 招财猫 爱 蔓延 jc 小 学徒 一枚 明天 见 李欣芸 sharonle ...
3                                                     美 爱
4                                            梦想 大 舞台 大 鼓掌
                              ...                        
1124    公里 不到 县 医院 天桥 下右 拐 米 谢礼 恒 太 霸道 好 远 古倒 吃 点 真心 找...
1125                          真冷 难道 穿 棉袄 晕 春天 真的 百变 莫测 抓狂
1126                                           几天 没 停止 伤心
1127                                          毒药 女流氓 怒 很惨
1128    杰 kelena 抓狂 搞 乜 鬼 想知 入去 gotrip 睇 睇 http t cn a...
Name: cut1, Length: 1129, dtype: object


### 词性标注（中文）

In [5]:
#词性标注（中文）
def pos_tagging(text):
    words = pseg.cut(text)
    tagged_words = [(word.word, word.flag) for word in words if word.flag.startswith('n') or word.flag.startswith('v') or word.flag.startswith('a')]
    return tagged_words

df['cut2'] = df['cut1'].apply(pos_tagging)
print(df['cut2'])

0       [(博, v), (爆照, v), (帅, nr), (爱, v), (生快, v), (傻...
1       [(张晓鹏, nr), (土耳其, ns), (事, n), (要, v), (开除, v)...
2       [(姑娘, n), (羡慕, v), (招财猫, nr), (爱, v), (蔓延, v),...
3                                       [(美, ns), (爱, v)]
4             [(梦想, n), (大, a), (舞台, n), (大, a), (鼓掌, v)]
                              ...                        
1124    [(不到, v), (医院, n), (天桥, ns), (谢礼, nr), (恒, nr)...
1125    [(真冷, a), (棉袄, n), (晕, v), (百变, nz), (莫测, nr),...
1126                           [(没, v), (停止, v), (伤心, n)]
1127                [(毒药, n), (女流氓, n), (怒, vg), (很惨, a)]
1128    [(杰, nr), (抓狂, v), (搞, v), (乜, nr), (鬼, n), (想...
Name: cut2, Length: 1129, dtype: object


In [6]:
#在词性标注之后删去非中文部分、括号、每行开头和末尾的空格
def clean_tagged_words(tagged_words): 
    cleaned_words = [re.sub(r'[^[\u4e00-\u9fa5 ]]','', word[0]).strip() 
    for word in tagged_words] 
    return cleaned_words

df['cut2'] = df['cut2'].apply(clean_tagged_words) 
df['cut2'] = df['cut2'].apply(lambda x: " ".join(x))
print(df['cut2'])

0                                   博 爆照 帅 爱 生快 傻 缺 爱 爱 爱
1                            张晓鹏 土耳其 事 要 开除 丁丁 看 世界 细心 酒店
2                     姑娘 羡慕 招财猫 爱 蔓延 小 学徒 见 李欣芸 大佬 范儿 书呆子
3                                                     美 爱
4                                            梦想 大 舞台 大 鼓掌
                              ...                        
1124    不到 医院 天桥 谢礼 恒 霸道 好 远 古 倒 吃 找 吃 泪 敏 嘴 记到 去 吃 吃 ...
1125                                     真冷 棉袄 晕 百变 莫测 抓狂
1126                                              没 停止 伤心
1127                                          毒药 女流氓 怒 很惨
1128                                 杰 抓狂 搞 乜 鬼 想知 入去 睇 睇
Name: cut2, Length: 1129, dtype: object


### 对数据集进行观察 构造TF-IDF

In [7]:
tf_idf_vectorizer = TfidfVectorizer()
tf_idf = tf_idf_vectorizer.fit_transform(df['cut2'])

In [8]:
# tf_idf

In [9]:
# 特征词列表
feature_names = tf_idf_vectorizer.get_feature_names_out()
# 特征词 TF-IDF 矩阵
tfidf_matrix = tf_idf.toarray()
feature_names_df = pd.DataFrame(tfidf_matrix,columns=feature_names)

In [10]:
feature_names

array(['一体', '一厂', '一品堂', ..., '龙行', '龙门', '龟苓膏'], dtype=object)

In [11]:
feature_names_df

Unnamed: 0,一体,一厂,一品堂,一块钱,一大,一家人,一景,一楼,一流,一盘菜,...,龙之梦,龙儿,龙妈,龙洋,龙湖,龙猫,龙珠果,龙行,龙门,龟苓膏
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2.词向量word2vec

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
import matplotlib.pyplot as plt

# Text processing 将每个文本按照分词的预处理步骤进行处理，例如去除标点符号、转换为小写等。
texts = [simple_preprocess(text) for text in df['cut2']]

# Create a dictionary 根据处理后的文本集合texts创建词典，将每个单词映射为唯一的整数ID。
dictionary = Dictionary(texts)

# Create a bag-of-words corpus 将每个文本转换为词袋向量表示，即将文本中的每个单词映射为其在词典中的整数ID，并计算每个单词出现的频次。
corpus_vec = [dictionary.doc2bow(text) for text in texts]


In [13]:
from gensim.models import Word2Vec

# 训练 Word2Vec 模型
model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)

In [14]:
import numpy as np

# Create a matrix to store the word embeddings
vector_size = model.vector_size
X = np.zeros((len(texts), vector_size))

# Convert each text into a vector representation using Word2Vec
for i, text in enumerate(texts):
    for word in text:
        if word in model.wv.key_to_index:
            X[i] += model.wv.get_vector(word)
    X[i] /= len(text)  # Normalize the vector by the number of words

  X[i] /= len(text)  # Normalize the vector by the number of words


In [15]:
X[i]

array([ 1.82401690e-03, -2.08525515e-03,  2.72756489e-03,  5.61258119e-03,
       -5.32728472e-03, -4.44843744e-04,  1.26467148e-03,  6.89877430e-04,
        1.79825863e-03, -3.59690360e-04,  6.56268753e-03,  1.28225720e-03,
        3.11560836e-03,  4.37039253e-03,  4.75187873e-04, -5.21656382e-03,
        7.31776779e-04,  5.31359261e-03, -7.88579400e-03, -8.38938340e-03,
        2.50353167e-04,  1.67803237e-03,  3.18458630e-03,  1.08465680e-03,
        4.74671771e-03, -1.91148673e-03,  2.33494047e-03,  7.85078900e-03,
       -9.06837452e-03, -1.89110404e-03, -2.46525463e-03, -1.90839928e-03,
       -1.79513590e-03, -1.31911769e-03,  2.01542528e-03,  3.23949529e-03,
        7.05358076e-03, -4.44977637e-03, -1.96400052e-03, -2.14825672e-03,
       -6.03725641e-03,  1.09424489e-03, -2.02334969e-03, -2.29279418e-03,
       -2.91347319e-03,  1.42337578e-03, -3.99112422e-03, -1.15794347e-03,
        6.59697542e-03,  7.31098497e-03, -3.06219328e-03,  9.69508042e-04,
        1.36841360e-03,  

In [16]:
# Load the necessary libraries and packages
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re
import jieba
import jieba.posseg as pseg
from nltk.stem.porter import PorterStemmer
import numpy as np  
import pandas as pd
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Update features (X) and labels (y) accordingly
X = df['cut2'].values
y = df['label'].values

print("X:", X)
print("Y:", y)

X: ['博 爆照 帅 爱 生快 傻 缺 爱 爱 爱' '张晓鹏 土耳其 事 要 开除 丁丁 看 世界 细心 酒店'
 '姑娘 羡慕 招财猫 爱 蔓延 小 学徒 见 李欣芸 大佬 范儿 书呆子' ... '没 停止 伤心' '毒药 女流氓 怒 很惨'
 '杰 抓狂 搞 乜 鬼 想知 入去 睇 睇']
Y: [1 1 1 ... 0 0 0]


### 训练word2vec

In [18]:
import numpy as np
import multiprocessing
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
import keras
from gensim import corpora
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

cpu_count = multiprocessing.cpu_count()
vocab_dim = 100
n_iterations = 1
n_exposures = 10
window_size = 7
maxlen = 100
import numpy as np
import keras
from sklearn.model_selection import train_test_split

def get_data(w2indx, w2vec, data, y):
    n_symbols = len(w2indx) + 1  # 补上索引为0（频数小于10）的词
    embedding_weights = np.zeros((n_symbols, vocab_dim))  
    for word, index in w2indx.items(): 
        embedding_weights[index, :] = w2vec[word]
    x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.2)
    y_train = keras.utils.to_categorical(y_train, num_classes=2)  # 转换为one-hot特征
    y_test = keras.utils.to_categorical(y_test, num_classes=2)
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test

def word2vec_train(data):
    """Train Word2Vec model
    Parameters
    ----------
    data : Segmented 2D list
    """
    model = Word2Vec(vector_size=vocab_dim, min_count=n_exposures, window=window_size, workers=cpu_count, epochs=n_iterations)
    #model = Word2Vec(size=vocab_dim, min_count=n_exposures, window=window_size, workers=cpu_count, iter=n_iterations)
    model.build_vocab(data)
    # model.train(data, total_examples=model.corpus_count, epochs=model.iter)
    model.train(data, total_examples=model.corpus_count, epochs=model.epochs)
    model.save('Word2vec_model.pkl')


def create_dictionaries(model=None, data=None):

    if (data is not None) and (model is not None):
        w2indx = {word: index + 1 for index, word in enumerate(model.wv.index_to_key)}
        f = open("word2index.txt", 'w', encoding='utf8')
        for key in w2indx:
            f.write(str(key))
            f.write(' ')
            f.write(str(w2indx[key]))
            f.write('\n')
        f.close()
        w2vec = {word: model.wv.get_vector(word) for word in w2indx}

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word => index

        data = parse_dataset(data)
        data = sequence.pad_sequences(data, maxlen=maxlen)
        return w2indx, w2vec, data
    else:
        print('Text is empty!')  


# Combine the segmented lists
X = df['cut2'].values
y = df['label'].values

# Train Word2Vec model
word2vec_train(X)

model = Word2Vec.load('Word2vec_model.pkl')

w2indx, w2vec, data = create_dictionaries(model=model, data=X)

# Get the data for training and testing
n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(w2indx, w2vec, data, y)



### 3.  七种模型的预测准确率（Naive Bayes、LR、SVM、KNN、DT、RF、LSTM）

In [20]:
def train_models(x_train, y_train, x_test, y_test):
    # Naive Bayes
    nb = MultinomialNB()
    nb.fit(x_train, y_train)
    nb_pred = nb.predict(x_test)
    nb_pred_flat = nb_pred.flatten()
    nb_pred_labels = np.where(nb_pred > 0.5, 1, 0)  
    nb_test_labels = np.argmax(y_test, axis=1)
    accuracy_nb = np.mean(nb_pred_labels == nb_test_labels)
    print("Naive Bayes Accuracy:", accuracy_nb)
    
    # Logistic Regression
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    lr_pred = lr.predict(x_test)
    lr_pred_flat = lr_pred.flatten()
    lr_pred_labels = np.where(lr_pred > 0.5, 1, 0)  
    lr_test_labels = np.argmax(y_test, axis=1)
    accuracy_lr = np.mean(lr_pred_labels == lr_test_labels)
    print("Logistic Regression Accuracy:", accuracy_lr)

    # Support Vector Machine
    svm = SVC()
    svm.fit(x_train, y_train)
    svm_pred = svm.predict(x_test)
    svm_pred_flat = svm_pred.flatten()
    svm_pred_labels = np.where(svm_pred > 0.5, 1, 0)  
    svm_test_labels = np.argmax(y_test, axis=1)
    accuracy_svm = np.mean(svm_pred_labels == svm_test_labels)
    print("Support Vector Machine Accuracy:", accuracy_svm)

    # K-Nearest Neighbors
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, y_train)
    knn_pred = knn.predict(x_test)
    knn_pred_flat = svm_pred.flatten()
    knn_pred_labels = np.where(knn_pred > 0.5, 1, 0)  
    knn_test_labels = np.argmax(y_test, axis=1)
    accuracy_knn = np.mean(knn_pred_labels == knn_test_labels)
    print("K-Nearest Neighbors Accuracy:", accuracy_knn)

    # Decision Tree
    dt = DecisionTreeClassifier()
    dt.fit(x_train, y_train)
    dt_pred = dt.predict(x_test)
    dt_pred_flat = dt_pred.flatten()
    dt_pred_labels = np.where(dt_pred > 0.5, 1, 0)  
    dt_test_labels = np.argmax(y_test, axis=1)
    accuracy_dt = np.mean(dt_pred_labels == dt_test_labels)
    print("Decision Tree Accuracy:", accuracy_dt)

    # Random Forest
    rf = RandomForestClassifier(n_estimators=10)
    rf.fit(x_train, y_train)
    rf_pred = rf.predict(x_test)
    rf_pred_flat = rf_pred.flatten()
    rf_pred_labels = np.where(rf_pred > 0.5, 1, 0)  
    rf_test_labels = np.argmax(y_test, axis=1)
    accuracy_rf = np.mean(rf_pred_labels == rf_test_labels)
    print("Random Forest Accuracy:", accuracy_rf)

    return

# Train and evaluate different models
train_models(x_train, y_train, x_test, y_test)

#总体来说预测结果较好的是：SVM、DT、RF。仍旧需要进一步比较，用K折交叉验证！！！

Naive Bayes Accuracy: 0.5796460176991151
Logistic Regression Accuracy: 0.5619469026548672
Support Vector Machine Accuracy: 0.5973451327433629
K-Nearest Neighbors Accuracy: 0.588495575221239
Decision Tree Accuracy: 0.6415929203539823
Random Forest Accuracy: 0.6283185840707964


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 使用K折交叉验证求取模型的平均预测准确率（朴素贝叶斯、LR、SVM、KNN、DT、RF、LSTM）

In [21]:
# from sklearn.model_selection import KFold
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# import numpy as np


def train_models_NaiveBayes(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        nb = MultinomialNB()
        nb.fit(x_train, y_train)
        nb_pred = nb.predict(x_test)

        nb_pred_labels = np.where(nb_pred > 0.5, 1, 0)
        accuracy_nb = np.mean(nb_pred_labels == y_test)
        accuracies.append(accuracy_nb)
        print("Naive Bayes Accuracy:", accuracy_nb)

    average_accuracy = np.mean(accuracies)
    print("Average Naive Bayes Accuracy:", average_accuracy)

# Assuming x_train, y_train, x_test, y_test are already defined
train_models_NaiveBayes(x_train, y_train)

Naive Bayes Accuracy: 0.4945054945054945
Naive Bayes Accuracy: 0.5714285714285714
Naive Bayes Accuracy: 0.5054945054945055
Naive Bayes Accuracy: 0.6222222222222222
Naive Bayes Accuracy: 0.5
Naive Bayes Accuracy: 0.4888888888888889
Naive Bayes Accuracy: 0.5111111111111111
Naive Bayes Accuracy: 0.6444444444444445
Naive Bayes Accuracy: 0.6
Naive Bayes Accuracy: 0.5444444444444444
Average Naive Bayes Accuracy: 0.5482539682539682


In [22]:
def train_models_LR(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lr = LogisticRegression()
        lr.fit(x_train, y_train)
        lr_pred = lr.predict(x_test)
        lr_pred_labels = np.where(lr_pred > 0.5, 1, 0)  
        accuracy_lr = np.mean(lr_pred_labels == y_test)
        accuracies.append(accuracy_lr)
        print("Logistic Regression Accuracy:", accuracy_lr)

    average_accuracy = np.mean(accuracies)
    print("Average Logistic Regression Accuracy:", average_accuracy)

# Assuming x_train, y_train, x_test, y_test are already defined
train_models_LR(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.4835164835164835
Logistic Regression Accuracy: 0.5494505494505495
Logistic Regression Accuracy: 0.5164835164835165
Logistic Regression Accuracy: 0.6111111111111112
Logistic Regression Accuracy: 0.5444444444444444


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Accuracy: 0.5
Logistic Regression Accuracy: 0.6


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.6333333333333333
Logistic Regression Accuracy: 0.6222222222222222
Logistic Regression Accuracy: 0.5555555555555556
Average Logistic Regression Accuracy: 0.5616117216117216


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
def train_models_SVM(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        svm = SVC()
        svm.fit(x_train, y_train)
        svm_pred = svm.predict(x_test)
        svm_pred_labels = np.where(svm_pred > 0.5, 1, 0)  
        accuracy_svm = np.mean(svm_pred_labels == y_test)
        accuracies.append(accuracy_svm)
        print("Support Vector Machine Accuracy:", accuracy_svm)

    average_accuracy = np.mean(accuracies)
    print("Average Support Vector Machine Accuracy:", average_accuracy)

# Assuming x_train, y_train, x_test, y_test are already defined
train_models_SVM(x_train, y_train)


Support Vector Machine Accuracy: 0.6043956043956044
Support Vector Machine Accuracy: 0.5274725274725275
Support Vector Machine Accuracy: 0.5934065934065934
Support Vector Machine Accuracy: 0.6222222222222222
Support Vector Machine Accuracy: 0.5666666666666667
Support Vector Machine Accuracy: 0.5444444444444444
Support Vector Machine Accuracy: 0.6666666666666666
Support Vector Machine Accuracy: 0.6222222222222222
Support Vector Machine Accuracy: 0.5
Support Vector Machine Accuracy: 0.5111111111111111
Average Support Vector Machine Accuracy: 0.5758608058608059


In [24]:
def train_models_KNN(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(x_train, y_train)
        knn_pred = knn.predict(x_test)
        knn_pred_labels = np.where(knn_pred > 0.5, 1, 0)  
        accuracy_knn = np.mean(knn_pred_labels == y_test)
        accuracies.append(accuracy_knn)
        print("K-Nearest Neighbors Accuracy:", accuracy_knn)

    average_accuracy = np.mean(accuracies)
    print("Average K-Nearest Neighbors Accuracy:", average_accuracy)

# Assuming x_train, y_train, x_test, y_test are already defined
train_models_KNN(x_train, y_train)



K-Nearest Neighbors Accuracy: 0.5604395604395604
K-Nearest Neighbors Accuracy: 0.4175824175824176
K-Nearest Neighbors Accuracy: 0.6153846153846154
K-Nearest Neighbors Accuracy: 0.5888888888888889
K-Nearest Neighbors Accuracy: 0.4777777777777778
K-Nearest Neighbors Accuracy: 0.5666666666666667
K-Nearest Neighbors Accuracy: 0.5111111111111111
K-Nearest Neighbors Accuracy: 0.5777777777777777
K-Nearest Neighbors Accuracy: 0.4777777777777778
K-Nearest Neighbors Accuracy: 0.5111111111111111
Average K-Nearest Neighbors Accuracy: 0.5304517704517705


In [25]:
def train_models_DT(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        dt = DecisionTreeClassifier()
        dt.fit(x_train, y_train)
        dt_pred = dt.predict(x_test)
        dt_pred_labels = np.where(dt_pred > 0.5, 1, 0)  
        accuracy_dt = np.mean(dt_pred_labels == y_test)
        accuracies.append(accuracy_dt)
        print("Decision Tree Accuracy:", accuracy_dt)

    average_accuracy = np.mean(accuracies)
    print("Average Decision Tree Accuracy:", average_accuracy)

# Assuming x_train, y_train, x_test, y_test are already defined
train_models_DT(x_train, y_train)

Decision Tree Accuracy: 0.6813186813186813
Decision Tree Accuracy: 0.6263736263736264
Decision Tree Accuracy: 0.6373626373626373
Decision Tree Accuracy: 0.7
Decision Tree Accuracy: 0.6555555555555556
Decision Tree Accuracy: 0.5777777777777777
Decision Tree Accuracy: 0.6555555555555556
Decision Tree Accuracy: 0.7222222222222222
Decision Tree Accuracy: 0.7111111111111111
Decision Tree Accuracy: 0.6222222222222222
Average Decision Tree Accuracy: 0.6589499389499389


In [26]:
def train_models_RF(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        rf = RandomForestClassifier(n_estimators=10)
        rf.fit(x_train, y_train)
        rf_pred = rf.predict(x_test)
        rf_pred_labels = np.where(rf_pred > 0.5, 1, 0)  
        accuracy_rf = np.mean(rf_pred_labels == y_test)
        accuracies.append(accuracy_rf)
        print("Random Forest Accuracy:", accuracy_rf) 
               
    average_accuracy = np.mean(accuracies)
    print("Average Random Forest Accuracy:", average_accuracy)

# Assuming x_train, y_train, x_test, y_test are already defined
train_models_RF(x_train, y_train)

Random Forest Accuracy: 0.7032967032967034
Random Forest Accuracy: 0.5274725274725275
Random Forest Accuracy: 0.6483516483516484
Random Forest Accuracy: 0.6444444444444445
Random Forest Accuracy: 0.6333333333333333
Random Forest Accuracy: 0.6111111111111112
Random Forest Accuracy: 0.6333333333333333
Random Forest Accuracy: 0.6777777777777778
Random Forest Accuracy: 0.5666666666666667
Random Forest Accuracy: 0.5666666666666667
Average Random Forest Accuracy: 0.6212454212454211


In [28]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import KFold

def train_models_LSTM(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # 构建 LSTM 模型
        model = Sequential()
        model.add(LSTM(units=64, input_shape=(x_train.shape[1], 1)))  # 输入数据的形状需要适应您的数据
        model.add(Dense(units=1, activation='sigmoid'))  # 二进制分类问题的输出层

        # 编译模型
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        # 训练模型
        model.fit(x_train, y_train, epochs=10, batch_size=32, verbose=0)  # 根据需要更改参数

        # 在测试数据上评估模型
        _, accuracy = model.evaluate(x_test, y_test, verbose=0)
        accuracies.append(accuracy)
        print("LSTM Accuracy:", accuracy)

    average_accuracy = np.mean(accuracies)
    print("Average LSTM Accuracy:", average_accuracy)

train_models_LSTM(x_train, y_train)

LSTM Accuracy: 0.692307710647583
LSTM Accuracy: 0.6813187003135681
LSTM Accuracy: 0.7142857313156128
LSTM Accuracy: 0.7111111283302307
LSTM Accuracy: 0.6111111044883728
LSTM Accuracy: 0.6000000238418579
LSTM Accuracy: 0.6666666865348816
LSTM Accuracy: 0.7555555701255798
LSTM Accuracy: 0.6111111044883728
LSTM Accuracy: 0.6777777671813965
Average LSTM Accuracy: 0.6721245527267456


### 补充 DT算法参数调优前的训练效果

In [46]:
from sklearn.model_selection import GridSearchCV

def train_model_DT_with_grid_search(x, y, n_splits=10):
    kf = KFold(n_splits=n_splits)
    accuracies = []

    params = {
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    }

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        dt = DecisionTreeClassifier()
        grid_search = GridSearchCV(estimator=dt, param_grid=params, cv=5, scoring='accuracy')
        grid_search.fit(x_train, y_train)

        best_dt = grid_search.best_estimator_
        dt_pred = best_dt.predict(x_train)
        dt_pred_labels = np.where(dt_pred > 0.5, 1, 0)  
        accuracy_dt = np.mean(dt_pred_labels == y_train)
        accuracies.append(accuracy_dt)
        print("Decision Tree Accuracy:", accuracy_dt)

    average_accuracy = np.mean(accuracies)
    print("Average Decision Tree Accuracy:", average_accuracy)

# Assuming x_train, y_train, x_test, y_test are already defined
train_model_DT_with_grid_search(x_train, y_train)

Decision Tree Accuracy: 0.9027093596059114
Decision Tree Accuracy: 0.9310344827586207
Decision Tree Accuracy: 0.8004926108374384
Decision Tree Accuracy: 0.8031980319803198
Decision Tree Accuracy: 0.7995079950799509
Decision Tree Accuracy: 0.7847478474784748
Decision Tree Accuracy: 0.8068880688806888
Decision Tree Accuracy: 0.9384993849938499
Decision Tree Accuracy: 0.931119311193112
Decision Tree Accuracy: 0.8031980319803198
Average Decision Tree Accuracy: 0.8501395124788687


### 4. 对DT算法进行改进

### 4.1 参数调优后的预测效果

In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def parameter_search(max_depth_range, min_samples_split_range, min_samples_leaf_range, x_train, y_train, x_test, y_test):
    best_accuracy = -np.inf
    best_params = {}

    for max_depth in max_depth_range:
        print(f"max_depth={max_depth}...")

        for min_samples_split in min_samples_split_range:
            print(f"min_samples_split={min_samples_split}...")

            for min_samples_leaf in min_samples_leaf_range:
                print(f"min_samples_leaf={min_samples_leaf}...")

                dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
                dt.fit(x_train, y_train)
                dt_pred = dt.predict(x_test)
                #dt_pred_labels = np.argmax(dt_pred, axis=1)
                accuracy = np.mean(dt_pred == np.argmax(y_test, axis=1))

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['max_depth'] = max_depth
                    best_params['min_samples_split'] = min_samples_split
                    best_params['min_samples_leaf'] = min_samples_leaf

    return best_params, best_accuracy

max_depth_range = [2,3,4,5,6,7,8,9,10]
min_samples_split_range = [2,3,4,5,6,7,8,9,10]
min_samples_leaf_range = [2,3,4,5,6,7,8,9,10]

best_params, best_accuracy = parameter_search(max_depth_range, min_samples_split_range, min_samples_leaf_range, x_train, y_train, x_test, y_test)
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

max_depth=2...
min_samples_split=2...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=3...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=4...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=5...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=6...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samp

### 4.2 参数调优后的训练效果

In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def parameter_search(max_depth_range, min_samples_split_range, min_samples_leaf_range, x_train, y_train):
    best_accuracy = -np.inf
    best_params = {}

    for max_depth in max_depth_range:
        print(f"max_depth={max_depth}...")

        for min_samples_split in min_samples_split_range:
            print(f"min_samples_split={min_samples_split}...")

            for min_samples_leaf in min_samples_leaf_range:
                print(f"min_samples_leaf={min_samples_leaf}...")

                dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
                dt.fit(x_train, y_train)
                dt_pred = dt.predict(x_train)
                #dt_pred_labels = np.argmax(dt_pred, axis=1)
                accuracy = np.mean(dt_pred == y_train)

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params['max_depth'] = max_depth
                    best_params['min_samples_split'] = min_samples_split
                    best_params['min_samples_leaf'] = min_samples_leaf

    return best_params, best_accuracy

    
max_depth_range = [2,3,4,5,6,7,8,9,10]
min_samples_split_range = [2,3,4,5,6,7,8,9,10]
min_samples_leaf_range = [2,3,4,5,6,7,8,9,10]

best_params, best_accuracy = parameter_search(max_depth_range, min_samples_split_range, min_samples_leaf_range, x_train, y_train)
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

max_depth=2...
min_samples_split=2...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=3...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=4...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=5...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samples_leaf=5...
min_samples_leaf=6...
min_samples_leaf=7...
min_samples_leaf=8...
min_samples_leaf=9...
min_samples_leaf=10...
min_samples_split=6...
min_samples_leaf=2...
min_samples_leaf=3...
min_samples_leaf=4...
min_samp