# 聚类网页内容，用于推荐系统

## 读取 MySQL 数据库，获取网页

In [1]:
import pymysql
import os

# Connection settings can be overridden through environment variables so real
# credentials do not have to live in the notebook; the fallbacks are the values
# that were previously hard-coded here.
cnx = pymysql.connect(
    host=os.environ.get('IR_DB_HOST', 'localhost'),
    user=os.environ.get('IR_DB_USER', 'root'),
    password=os.environ.get('IR_DB_PASSWORD', '123qwe12'),
)
cursor = cnx.cursor()
# Select the schema that the queries below read from and write to.
cnx.select_db('IR_db')

In [2]:
def get_data_from_table(table_name):
    """Load every row of ``table_name`` as a list of record dicts.

    Args:
        table_name: Name of a table that has ``id``, ``title`` and ``content``
            columns. The name is interpolated directly into the SQL text
            (identifiers cannot be bound as query parameters), so it must
            come from trusted code and never from user input.

    Returns:
        A list of dicts with the keys ``'id'``, ``'title'``, ``'content'`` and
        ``'type'``, where ``'type'`` is the name of the table the row was
        read from.
    """
    cursor.execute(f"SELECT id, title, content FROM {table_name}")
    # Tag each record with its source table. This used to be the constant
    # 'douban', which mislabelled rows loaded from any other table.
    return [
        {'id': row_id, 'title': title, 'content': content, 'type': table_name}
        for row_id, title, content in cursor.fetchall()
    ]


def get_ids_from_same_title():
    """Return every ``page_id`` stored in the ``same_title`` table.

    Returns:
        A flat list with one entry per fetched row, in the order the rows
        were returned by the database.
    """
    cursor.execute("SELECT page_id FROM same_title")
    # Each fetched row is a one-column tuple; unpack it to the bare value.
    return [page_id for (page_id,) in cursor.fetchall()]


def get_page():
    """Load the ``page`` table with same-title pages collapsed to one each.

    Pages whose id appears in ``same_title`` are skipped, except for one
    representative per title group: the smallest ``page_id`` of each
    ``title`` in ``same_title``. Representatives are appended after the
    non-duplicated pages.

    NOTE(review): the previous version appended every page and then, for each
    representative id, re-appended whatever row the first loop had ended on
    instead of the row it had just queried. The de-duplication implemented
    here is the presumed intent (the list of ``same_title`` ids was fetched
    but never used) -- confirm before relying on it.

    Returns:
        A list of dicts with the keys ``'id'``, ``'title'``, ``'content'`` and
        ``'type'`` (always ``'page'``).
    """
    cursor.execute("SELECT id, title, content FROM page")
    # Read the page rows before the shared cursor is reused by later queries.
    page_rows = cursor.fetchall()

    # Ids of all pages that take part in a same-title group.
    duplicate_ids = set(get_ids_from_same_title())

    # One representative id per title group: the smallest page_id.
    cursor.execute("SELECT MIN(page_id) FROM same_title GROUP BY title")
    representative_ids = [row[0] for row in cursor.fetchall()]
    print(len(representative_ids))

    def as_record(row):
        return {
            'id': row[0],
            'title': row[1],
            'content': row[2],
            'type': 'page'
        }

    # Pages that do not share their title with any other page.
    data = [as_record(row) for row in page_rows if row[0] not in duplicate_ids]

    # Every page row is already in memory, so look the representatives up
    # there instead of issuing one extra query per id.
    rows_by_id = {row[0]: row for row in page_rows}
    for page_id in representative_ids:
        row = rows_by_id.get(page_id)
        # Skip representatives that have no matching row in `page`.
        if row is not None:
            data.append(as_record(row))
    return data


In [3]:
# Gather the records of the three source tables into one list, in this order.
data = []
for source_table in ('douban', 'html', 'page'):
    data.extend(get_data_from_table(source_table))
print(len(data))

16151


## 对网页内容进行聚类

In [4]:
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# from sklearn.decomposition import PCA
import numpy as np
import random

In [35]:

def random_substring(s, length=50):
    """Return a randomly positioned window of ``length`` characters from ``s``.

    Args:
        s: The text to sample from.
        length: Size of the window; defaults to 50.

    Returns:
        ``s`` itself when it is no longer than ``length``; otherwise the slice
        ``s[start:start + length]`` for a start offset drawn at random from
        all positions at which a full window still fits.
    """
    surplus = len(s) - length
    if surplus > 0:
        # randint is inclusive on both ends, so the last valid offset is `surplus`.
        offset = random.randint(0, surplus)
        return s[offset:offset + length]
    return s

# Load the stop-word file (one word per line); iterating the file object
# directly avoids building an intermediate list of raw lines.
with open('cn_stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]
# Extra single-character words to drop in addition to the file's contents.
stop_words += ["的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "他", "这", "中", "大", "以", "到", "说", "等", "能", "也", "上", "或", "之", "但", "个", "都", "而", "啊", "把", "那", "你", "一", "为", "所", "年", "没", "着", "要", "与"]
stop_words = list(set(stop_words))
# Membership is tested once per token below, so use a hashed set instead of
# scanning the whole list for every token.
stop_word_set = frozenset(stop_words)

# Tokenise each document with jieba. A document is its title followed by a
# random excerpt of its content; tokens are joined with spaces so that the
# vectorizer can split them again, and stop words are removed.
contents = [
    ' '.join(
        word
        for word in jieba.cut(item['title'] + random_substring(item['content']))
        if word not in stop_word_set
    )
    for item in data
]
print('cut word finished')

# Turn the tokenised documents into a TF-IDF matrix (one row per document).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(contents)
print('tf-idf finished')
print(X.shape)

# # Dimensionality reduction with PCA (disabled)
# pca = PCA(n_components=10000)
# X_pca = pca.fit_transform(X.toarray())  # convert to a dense array, then reduce
# print('PCA finished')

# Cluster the documents into 24 groups; fit() returns the estimator itself.
kmeans = KMeans(n_clusters=24).fit(X)
print('kmeans finished')

# Cluster label assigned to each row of X.
labels = kmeans.labels_
# Report how many documents fell into each cluster.
counts = np.bincount(labels)
for cluster_id in range(len(counts)):
    print(f"Cluster {cluster_id} has {counts[cluster_id]} data points")

cut word finished
tf-idf finished
(16151, 49947)
kmeans finished
Cluster 0 has 5425 data points
Cluster 1 has 951 data points
Cluster 2 has 1927 data points
Cluster 3 has 111 data points
Cluster 4 has 340 data points
Cluster 5 has 280 data points
Cluster 6 has 221 data points
Cluster 7 has 129 data points
Cluster 8 has 975 data points
Cluster 9 has 170 data points
Cluster 10 has 78 data points
Cluster 11 has 297 data points
Cluster 12 has 319 data points
Cluster 13 has 615 data points
Cluster 14 has 2513 data points
Cluster 15 has 201 data points
Cluster 16 has 112 data points
Cluster 17 has 144 data points
Cluster 18 has 44 data points
Cluster 19 has 367 data points
Cluster 20 has 200 data points
Cluster 21 has 195 data points
Cluster 22 has 355 data points
Cluster 23 has 182 data points


## 保存聚类结果

In [42]:
# Remove any previous `cluster` table so the results can be rebuilt from scratch.
drop_cluster_sql = """
    DROP TABLE IF EXISTS cluster
"""
cursor.execute(drop_cluster_sql)
cnx.commit()

In [43]:
# Create the table that stores one cluster assignment per document.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS cluster (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        type VARCHAR(255),
        web_id INT,
        label INT
    )
""")
# Pair every record with its KMeans label; int() turns the NumPy label into a
# plain Python int before it is handed to the driver.
cluster_rows = [
    (item['title'], item['type'], item['id'], int(label))
    for item, label in zip(data, labels)
]
# Hand all rows to the driver in one call instead of issuing one execute()
# round trip per row.
cursor.executemany("""
        INSERT INTO cluster (title, type, web_id, label)
        VALUES (%s, %s, %s, %s)
    """, cluster_rows)

# Make the inserted rows permanent.
cnx.commit()

In [44]:
import joblib
# Persist the fitted clustering model and the fitted TF-IDF vectorizer to disk.
joblib.dump(value=kmeans, filename='kmeans_model.pkl')
joblib.dump(value=vectorizer, filename='tfidf_model.pkl')

['tfidf_model.pkl']

## 提取模型进行预测

In [45]:
import joblib
# Restore the persisted clustering model and TF-IDF vectorizer from disk.
kmeans = joblib.load(filename='kmeans_model.pkl')
vectorizer = joblib.load(filename='tfidf_model.pkl')

In [46]:
query = '霸王别姬'
# Tokenise the query with jieba and join the tokens with spaces.
query_tokens = jieba.cut(query)
query_cut = ' '.join(query_tokens)
# Map the tokenised query to a TF-IDF vector with the fitted vectorizer.
query_vec = vectorizer.transform([query_cut])
# Ask the KMeans model which cluster the query vector belongs to.
query_label = kmeans.predict(query_vec)
print(f"The query is in cluster {query_label[0]}")

The query is in cluster 0


### 获取对应类别下所有记录

In [47]:
# Fetch every document that was assigned to the query's cluster.
# The predicted label is a NumPy scalar; cast it to a plain int (as is done
# when the labels are inserted) rather than relying on how the driver
# serialises NumPy types.
cursor.execute("""
    SELECT web_id, title
    FROM cluster
    WHERE label = %s
""", (int(query_label[0]),))

# Collect the matching (web_id, title) rows.
results = cursor.fetchall()

print(len(results))

# # 打印结果
# for row in results:
#     print(f"web_id: {row[0]}, title: {row[1]}")

5425


In [48]:
from sklearn.metrics.pairwise import cosine_similarity
import jieba.analyse

# Tokenise the title (second column) of every candidate row with jieba,
# joining the tokens with spaces.
titles = [' '.join(jieba.cut(row[1])) for row in results]

# Vectorise the query together with the titles; row 0 of X is the query.
X = vectorizer.transform([query_cut, *titles])

# Cosine similarity between the query (row 0) and every title (rows 1 onward).
query_row, title_rows = X[0:1], X[1:]
similarities = cosine_similarity(query_row, title_rows).flatten()

# argsort is ascending, so the last five positions hold the highest scores
# and the best match is printed last.
top5_indices = similarities.argsort()[-5:]

# Print the five most similar titles.
for index in top5_indices:
    web_id, title = results[index]
    print(f"web_id: {web_id}, title: {title}, similarity: {similarities[index]}")

web_id: 1372, title: 迷恋与背叛——[霸王别姬], similarity: 0.5751382418948486
web_id: 1373, title: 胡说霸王别姬, similarity: 0.6384269871485116
web_id: 1376, title: 从另一角度看《霸王别姬》, similarity: 0.698602876133242
web_id: 1378, title: 张国荣评《霸王别姬》, similarity: 0.710826486655719
web_id: 4447, title: 霸王别姬, similarity: 1.0
