In [7]:
from sklearn.model_selection import train_test_split

def load_one_file(filename):
    with open(filename, encoding='utf8') as f:
        return [str(i.strip()) for i in f.readlines()]  # 确保每一行都被转换为字符串

def load_all_files():
    x_train = []
    y_train = []
    x_test = []
    y_test = []

 
    # Load and label the data
    x_normal = load_one_file('data/normal.txt')
    x_weak = load_one_file('data/weak.txt')
    x_token = load_one_file('data/tokens.txt')

    dataNum = min(len(x_normal), len(x_weak), len(x_token))

    y_normal = [0] * dataNum 
    y_weak = [1] * dataNum 
    y_token = [2] * dataNum


    # Combine all data
    x = x_normal[:dataNum] + x_weak[:dataNum] + x_token[:dataNum]
    y = y_normal + y_weak + y_token

    # Split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test = load_all_files()
print(f"Training data size: {len(x_train)}")
print(f"Test data size: {len(x_test)}")

Training data size: 4101
Test data size: 1026


In [8]:
import numpy as np
import pandas as pd

df = pd.DataFrame({'x_train':x_train, 
              'y_train':y_train,
              'x_test':x_test+['']*int(len(x_train)-len(x_test)),
              'y_test':y_test+[0]*int(len(x_train)-len(x_test)) })

In [9]:
df

Unnamed: 0,x_train,y_train,x_test,y_test
0,"Oberman,",0,UqZBpD3n3iTIBgBW=v1-1r6g8WcQKe,2
1,She's,0,Hm_lpvt_00108d8049c69b90aefda9703dee7b4e=17394...,2
2,JSESSIONID_JAVA=517DC05AE1AE460D92FF96149D251E9B,2,cc67a8d4abbc6ab7c6d899f8ed6892fe=a790ec26c6a04...,2
3,Hm_lvt_2024f59541ef1c59b5e53289ffdaaebb=173923...,2,wants,0
4,etihad.10,1,Hm_lpvt_50be06609cf92e6d07fd1e3a8376dde0=17394...,2
...,...,...,...,...
4096,HMACCOUNT=F521520514609695,2,,0
4097,down,0,,0
4098,paperclip,1,,0
4099,Hm_lpvt_2024f59541ef1c59b5e53289ffdaaebb=17394...,2,,0


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
max_features=500

vectorizer = CountVectorizer(
                                decode_error='ignore',
                                strip_accents='ascii',
                                max_features=max_features,
                                max_df=1.0,
                                min_df=1 )
print(vectorizer)
x_train=vectorizer.fit_transform(x_train)
x_train=x_train.toarray()

CountVectorizer(decode_error='ignore', max_features=500, strip_accents='ascii')


In [11]:
vocabulary=vectorizer.vocabulary_
# 复用vocabulary对测试数据集进行词袋化处理
vectorizer = CountVectorizer(
                                decode_error='ignore',
                                strip_accents='ascii',
                                vocabulary=vocabulary,
                                max_df=1.0,
                                min_df=1 )
print(vectorizer)
x_test=vectorizer.fit_transform(x_test)
x_test=x_test.toarray()

CountVectorizer(decode_error='ignore', strip_accents='ascii',
                vocabulary={'00cfe77092104c99': 0, '021f2a73d5af': 1,
                            '035559e0': 2, '0653d1d7e9c2': 3, '0bde2425f512': 4,
                            '0c2d464c622b4f9b9a81eb89b45061ae': 5,
                            '0dc6d096': 6, '10a07a9ab7e6': 7, '123': 8,
                            '14b78bb6dff8': 9, '15f8': 10, '1602': 11,
                            '1661877752': 12, '172421156': 13, '1736392260': 14,
                            '1737338170': 15, '1738651153': 16,
                            '1738984680': 17, '1738985495': 18,
                            '1738985539': 19, '1739155025': 20,
                            '1739164319': 21, '1739169338': 22,
                            '1739173573': 23, '1739173604': 24,
                            '1739176775': 25, '1739240620': 26,
                            '1739257204': 27, '1739260420': 28,
                            '1739265925': 29, ..

In [12]:
df = pd.DataFrame(x_train) # 每一行代表一个文档，每一列代表一个词汇，矩阵中的值表示词汇在文档中出现的次数。
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4096,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4098,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4099,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from sklearn.linear_model import LogisticRegression

# Initialize the classifier
classifier = LogisticRegression(max_iter=1000)

# Train the classifier
classifier.fit(x_train, y_train)

LogisticRegression(max_iter=1000)

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict the labels for the test set
y_pred = classifier.predict(x_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Print classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.65
Confusion Matrix:
[[ 31 325   2]
 [  1 312   0]
 [  0  26 329]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.09      0.16       358
           1       0.47      1.00      0.64       313
           2       0.99      0.93      0.96       355

    accuracy                           0.65      1026
   macro avg       0.81      0.67      0.59      1026
weighted avg       0.83      0.65      0.58      1026



In [15]:
import joblib

# 保存模型和向量化器
joblib.dump(classifier, 'logistic_regression_model.pkl')
joblib.dump(vectorizer, 'count_vectorizer.pkl')

['count_vectorizer.pkl']

In [16]:
from unittest import result
import joblib

def load_model_and_vectorizer(model_path, vectorizer_path):
    """
    加载训练好的模型和向量化器
    """
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)
    return model, vectorizer

def predict_text(text, model, vectorizer):
    """
    使用加载的模型和向量化器对输入文本进行预测
    """
    # 将输入文本转换为特征向量
    text_counts = vectorizer.transform([text])
    
    # 进行预测
    prediction = model.predict(text_counts)
    
    return prediction[0]

# 加载模型和向量化器
model, vectorizer = load_model_and_vectorizer('logistic_regression_model.pkl', 'count_vectorizer.pkl')

# 示例预测
input_text = "1"
prediction = predict_text(input_text, model, vectorizer)

result = ['normal/正常', 'weak/弱口令', 'tokens/令牌']
print(f"预测出来的结果是: {result[prediction]}")

预测出来的结果是: weak/弱口令
