In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import joblib
import tensorflow as tf
from tensorflow import keras
from keras.layers import Input, Embedding, Reshape, Concatenate, Dense, Dropout
from keras.models import Model
from sklearn.metrics import mean_squared_error
from scipy.sparse import vstack
from scipy.sparse import csr_matrix





In [33]:

# --- 数据加载和预处理 ---
df = pd.read_csv("clean_data.csv")
# 数值特征分箱 (只保留 condition 和 mmr)
# 数值特征分箱 (只保留 condition)
numerical_features = ['condition']  # MMR 不再分箱
for feature in numerical_features:
    df[feature] = pd.qcut(df[feature], q=5, labels=False, duplicates='drop').astype(str)

# 字符串特征处理
string_cols = df.select_dtypes(include='object').columns
df[string_cols] = df[string_cols].fillna("Unknown")


# price_odometer_scaler = StandardScaler()
# df[['sellingprice', 'odometer']] = price_odometer_scaler.fit_transform(df[['sellingprice', 'odometer']])



# --- 特征列表 ---
features = ['make', 'model', 'trim', 'body', 'transmission', 'state', 'condition', 'color', 'interior', 'mmr']

# --- 词嵌入 ---
feature_combinations = df[features].apply(lambda row: row.values.astype(str).tolist(), axis=1).tolist()
model_w2v = Word2Vec(feature_combinations, vector_size=100, window=5, min_count=1, workers=4)

# --- MMR 缩放 ---  (提前到这里)
mmr_scaler = MinMaxScaler()
df['mmr_scaled'] = mmr_scaler.fit_transform(df[['mmr']])

# --- 构建包含 mmr_scaled 的特征向量 ---  (使用缩放后的 MMR)
car_vectors_with_mmr = df.apply(
    lambda car: np.concatenate([
        np.mean(
            [model_w2v.wv[feature] for feature in car[features].values.astype(str) if feature in model_w2v.wv and feature != 'mmr'] or [np.zeros(100)],
            axis=0
        ),
        np.array([car['mmr_scaled']])  # 使用缩放后的 MMR
    ]),
    axis=1).tolist()


# --- 重新训练 scaler --- (使用包含 MMR_scaled 的向量)
scaler = StandardScaler().fit(car_vectors_with_mmr)
car_vectors_with_mmr = scaler.transform(car_vectors_with_mmr)




In [41]:
# --- K-means 聚类 --- (使用缩放后的 car_vectors_with_mmr)
n_clusters = 50
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(car_vectors_with_mmr)  # 使用 car_vectors_with_mmr
df['cluster'] = kmeans.labels_




# --- 基于内容的推荐 ---
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense
def content_based_recommendation(user_input, all_car_vectors, top_n=5, min_price=-np.inf, max_price=np.inf, min_odo=-np.inf, max_odo=np.inf):
    """基于内容的推荐，根据用户输入的特征、价格和里程数范围，返回最相似的车辆。"""
    user_input_df = pd.DataFrame([user_input])

    # 过滤符合条件的车辆
    filtered_df = df[
        (df['sellingprice'] >= min_price) &
        (df['sellingprice'] <= max_price) &
        (df['odometer'] >= min_odo) &
        (df['odometer'] <= max_odo)
    ]

    # 计算用户特征向量
    user_vector = np.mean(
        [model_w2v.wv[feature] if feature != 'mmr' else np.array([user_input['mmr']]) for feature in user_input_df[features].values.astype(str)[0] if (feature in model_w2v.wv or feature == 'mmr')], # 获取所有特征，包括MMR
        axis=0 #计算均值
    )
    user_vector = np.concatenate([user_vector,np.array([user_input['mmr_scaled']])])
    user_vector = scaler.transform([user_vector]) #scaler要用101维的

    if len(filtered_df) == 0:
        print("没有找到符合条件的车辆。")
        return pd.DataFrame(columns=df.columns)  # 返回空的 DataFrame

    #  使用预先计算好的特征向量，避免重复计算
    filtered_car_vectors = all_car_vectors[filtered_df.index]

    similarities = cosine_similarity(user_vector, filtered_car_vectors)[0]
    similar_car_indices = np.argsort(similarities)[::-1][:top_n]
    return filtered_df.iloc[similar_car_indices]

In [37]:

def matrix_factorization_recommendation(user_ratings, initial_recommendations, df, car_vectors, scaler, model_w2v, n_clusters, kmeans, features):
    user_id = 0  # 假设只有一个用户
    user_car_array = np.array([])
    ratings_df = pd.DataFrame({'userId': [user_id] * len(user_ratings),
                                 'carId': initial_recommendations.index,
                                 'rating': user_ratings})


    try:
        existing_ratings = pd.read_csv("ratings.csv")
    except FileNotFoundError:
        existing_ratings = pd.DataFrame(columns=['userId', 'carId', 'rating'])

    df_ratings = pd.concat([existing_ratings, ratings_df], ignore_index=True)

    df_p = df_ratings.pivot_table(index='userId', columns='carId', values='rating')
    mean_rating = df_ratings['rating'].mean()
    df_ratings['rating'] = df_ratings['rating'].fillna(mean_rating)

    df_title = df_ratings
    user_ids = df_title["userId"].unique().tolist()
    user2user_encoded = {x: i for i, x in enumerate(user_ids)}
    userencoded2user = {i: x for i, x in enumerate(user_ids)}
    car_ids = df_title["carId"].unique().tolist()
    car2car_encoded = {x: i for i, x in enumerate(car_ids)}
    car_encoded2car = {i: x for i, x in enumerate(car_ids)}
    df_title["user"] = df_title["userId"].map(user2user_encoded)
    df_title["car"] = df_title["carId"].map(car2car_encoded)

    num_users = len(user2user_encoded)
    num_cars = len(car_encoded2car)
    df_title["rating"] = df_title["rating"].values.astype(np.float32)

    min_rating = min(df_title["rating"])
    max_rating = max(df_title["rating"])

    df_title = df_title.sample(frac=1, random_state=42)
    x = df_title[["user", "car"]].values
    y = df_title["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

    train_indices = int(0.9 * df.shape[0])
    x_train, x_val, y_train, y_val = (
        x[:train_indices],
        x[train_indices:],
        y[:train_indices],
        y[train_indices:],
    )

    embedding_size = 10

    user_id_input = Input(shape=[1], name='user')
    car_id_input = Input(shape=[1], name='car')

    user_embedding = Embedding(output_dim=embedding_size,
                                input_dim=num_users,
                                embeddings_initializer="he_normal",
                                embeddings_regularizer=keras.regularizers.l2(1e-6),
                                name='user_embedding')(user_id_input)

    car_embedding = Embedding(output_dim=embedding_size,
                                input_dim=num_cars,
                                embeddings_initializer="he_normal",
                                embeddings_regularizer=keras.regularizers.l2(1e-6),
                                name='car_embedding')(car_id_input)

    user_vector = Reshape([embedding_size])(user_embedding)
    car_vector = Reshape([embedding_size])(car_embedding)

    concat = Concatenate()([user_vector, car_vector])
    dense1 = Dense(64, kernel_regularizer=l2(0.01))(concat)
    dense = Dropout(0.2)(dense1)
    y = Dense(1, activation="sigmoid")(dense)

    model = Model(inputs=[user_id_input, car_id_input], outputs=y)
    model.compile(loss='mse', optimizer='adam')

    model.fit(x=[x_train[:, 0], x_train[:, 1]], y=y_train, batch_size=4, epochs=22, verbose=0)

    cars_not_watched = df[~df["carId"].isin(df_ratings[df_ratings['userId'] == user_id].carId.values)]["carId"]
    cars_not_watched = list(set(cars_not_watched).intersection(set(car2car_encoded.keys())))
    cars_not_watched = [[car2car_encoded.get(x)] for x in cars_not_watched]
    user_encoder = user2user_encoded.get(user_id)
    # 仅当 cars_not_watched 不为空时才构建 user_car_array
    if cars_not_watched:
        user_car_array = np.hstack(([[user_encoder]] * len(cars_not_watched), cars_not_watched))

    # 检查 user_car_array 是否为空 (现在始终已赋值)
    if user_car_array.size == 0:
        print("没有未观看的车辆可供推荐。 使用默认值。")
        # 返回默认值
        try: #尝试加载已保存的模型
            model = keras.models.load_model('matrix_factorization_model.h5')
            #如果模型文件存在，则返回加载的模型和其他默认值
            return model, {}, {}, df_ratings['rating'].mean()
        except:
            #如果没有保存的模型，则创建一个新的模型并返回，以及其他默认值
            # ...（构建模型的代码，与函数中其他部分相同）
            return model, {}, {}, df_ratings['rating'].mean()

    # 检查 user_car_array 的维度并进行 reshape
    if user_car_array.ndim == 1:
        user_car_array = user_car_array.reshape(1, -1)
    elif user_car_array.shape[1] == 1:  # 检查是否只有一列
        user_car_array = user_car_array.reshape(-1, 1)

    # 根据 user_car_array 的形状构建模型输入
    if user_car_array.shape[1] == 1:  # 只有一列，可能是只有 user_id
        ratings = model.predict([user_car_array[:, 0]]).flatten() #这里也要修改
    else: #有两列，user_id 和 car_id
        ratings = model.predict([user_car_array[:, 0], user_car_array[:, 1]]).flatten()


    top_ratings_indices = ratings.argsort()[-10:][::-1]
    recommended_car_ids = [car_encoded2car.get(cars_not_watched[x][0]) for x in top_ratings_indices]

    # 修改输出部分：
    print("Showing recommendations for user: {}".format(user_id))
    print("=" * 36)
    print("-" * 30)
    print(" Top 10 car recommendations")
    print("-" * 30)

    recommended_cars = df[df["carId"].isin(recommended_car_ids)]

    # 选择要打印的列，包括原始的数值列
    recommended_cars = df[df["carId"].isin(recommended_car_ids)]

    columns_to_print = ['make', 'model', 'trim', 'body', 'transmission', 'state', 'color', 'interior'] + numerical_features  # 使用原始数值列名
    print(recommended_cars[columns_to_print].to_string(index=False))
    return model, user2user_encoded, car2car_encoded, df_ratings['rating'].mean()

# def simulate_user_ratings(df, num_users=50, ratings_per_user=10):
#     """模拟用户评分数据。"""
#     all_car_ids = df['carId'].unique()
#     ratings = []

#     for user_id in range(num_users):
#         rated_car_ids = np.random.choice(all_car_ids, size=ratings_per_user, replace=True)
#         for car_id in rated_car_ids:
#             rating = np.random.randint(1, 6)
#             ratings.append([user_id, car_id, rating])

#     return pd.DataFrame(ratings, columns=['userId', 'carId', 'rating'])
def simulate_user_ratings(df, car_vectors, num_users=50, ratings_per_user=10):
    """模拟用户评分数据，基于物品相似度进行预测，使用稀疏矩阵提高效率。"""
    all_car_ids = df['carId'].unique()
    num_cars = len(all_car_ids)

    # 预先计算整个相似度矩阵 (稀疏矩阵)
    similarity_matrix = cosine_similarity(car_vectors, dense_output=False)  # 直接生成稀疏矩阵

    ratings = []
    for user_id in range(num_users):
        rated_car_ids = np.random.choice(all_car_ids, size=ratings_per_user, replace=False)
        rated_car_indices = [np.where(all_car_ids == car_id)[0][0] for car_id in rated_car_ids] #简化索引查找
        rated_car_ratings = np.random.randint(1, 6, size=ratings_per_user)


        for car_index in range(num_cars):
            if car_index in rated_car_indices:
                continue  # 跳过已经评分的车辆

            # 保持稀疏矩阵运算，避免转换为密集矩阵
            similarity_scores = similarity_matrix[rated_car_indices, car_index].toarray().flatten()  # 这里仍然需要flatten
            weighted_ratings = similarity_scores * rated_car_ratings

            # 处理空相似度的情况
            if np.sum(similarity_scores) != 0:
                predicted_rating = np.sum(weighted_ratings) / np.sum(similarity_scores)
            else:  # 如果没有相似车辆，则用平均值或其他策略
                predicted_rating = np.mean(rated_car_ratings) if len(rated_car_ratings) > 0 else 3  # 默认值3


            ratings.append([user_id, all_car_ids[car_index], predicted_rating])

    return pd.DataFrame(ratings, columns=['userId', 'carId', 'rating'])


# --- 获取用户评分 ---
def get_user_ratings(recommendations):
    """模拟用户评分的函数，可以让用户为推荐的车辆打分"""
    print("请为以下推荐车辆打分（1-5）：")
    user_ratings = []
    for index, row in recommendations.iterrows():
        # 注意：这里不再需要显示 'sellingprice' 和 'odometer' 的信息
        rating = input(f"{row['make']} {row['model']} {row['trim']} ({row['year']}): ")
        user_ratings.append(int(rating))
    return user_ratings


In [43]:
def predict_rating(model, user_id, car_id, user2user_encoded, car2car_encoded, mean_rating):
    """使用矩阵分解模型预测用户对车辆的评分."""
    user_encoded = user2user_encoded.get(user_id)
    car_encoded = car2car_encoded.get(car_id)
    if user_encoded is not None and car_encoded is not None:
        rating = model.predict([np.array([user_encoded]), np.array([car_encoded])])
        return rating[0][0]
    else:
        return mean_rating  # 使用平均评分作为默认值

def hybrid_recommendation(user_id, content_recs, matrix_model, user2user_encoded, car2car_encoded, mean_rating, top_n=10, car_vectors=car_vectors, scaler=scaler, features=features, user_input=user_input, model_w2v=model_w2v):  # 添加 car_vectors, scaler 等作为默认参数
    """结合基于内容的推荐和矩阵分解推荐。"""

    # 使用正确的连接方式计算 content_score，并处理缺少 MMR_scaled 的情况
    try:
        user_vector = np.mean([model_w2v.wv[feature] if feature != 'mmr' else np.array([user_input['mmr']]) for feature in content_recs[features].iloc[0].values.astype(str) if (feature in model_w2v.wv or feature == 'mmr')], axis=0)
        user_vector = np.concatenate([user_vector, np.array([user_input['mmr_scaled']])])
        user_vector = scaler.transform([user_vector])

        content_recs['content_score'] = content_recs.apply(
            lambda row: cosine_similarity(user_vector, [np.concatenate([car_vectors[row.name], np.array([row['mmr_scaled']])])])[0][0] if 'mmr_scaled' in row else 0,  # 处理缺少 'mmr_scaled' 的情况
            axis=1
        )
    except Exception as e:  # 捕获任何潜在错误
        print(f"计算 content_score 时出错: {e}")
        content_recs['content_score'] = 0  # 如果出现错误，设置默认值

    # 为基于内容的推荐预测矩阵分解得分
    content_recs['matrix_score'] = content_recs.apply(
        lambda row: predict_rating(matrix_model, user_id, row.name, user2user_encoded, car2car_encoded, mean_rating),
        axis=1
    )

    content_recs['combined_score'] = 0.3 * content_recs['content_score'] + 0.7 * content_recs['matrix_score']

    return content_recs.sort_values(by='combined_score', ascending=False).head(top_n)


In [39]:
print("df shape:", df.shape)
print("car_vectors shape:", np.array(car_vectors).shape)

df shape: (550199, 20)
car_vectors shape: (550199, 100)


In [46]:

user_input = {
        'make': input("请输入品牌: "),
        'model': input("请输入型号: "),
        'trim': input("请输入版本: "),
        'body': input("请输入车身类型: "),
        'transmission': input("请输入变速箱类型: "),
        'state': input("请输入车辆所在州: "),
        'condition': input("请输入车况评分 (0-4): "),
        'color': input("请输入颜色: "),
        'interior': input("请输入内饰颜色: "),
        'mmr': float(input("请输入 MMR 评分: ")), # 获取 MMR 数值
        'min_price': float(input("请输入最低价格: ")),
        'max_price': float(input("请输入最高价格: ")),
        'min_odo': float(input("请输入最低里程数: ")),
        'max_odo': float(input("请输入最高里程数: "))
    }
# 缩放用户输入的 MMR
user_input['mmr_scaled'] = mmr_scaler.transform([[user_input['mmr']]])[0][0] # 缩放用户输入的MMR

df['carId'] = df.index
# 尝试读取用户评分数据

# --- 读取用户评分数据 ---
try:
    existing_ratings = pd.read_csv("/content/ratings (1).csv")  # 或你的 ratings.csv 文件路径
except FileNotFoundError:
    existing_ratings = pd.DataFrame(columns=['userId', 'carId', 'rating'])


# ---  模拟更多用户评分  ---
# --- 模拟更多用户评分 ---
# if existing_ratings.empty or len(existing_ratings) < 50:
#     simulated_ratings = simulate_user_ratings(df, car_vectors, num_users=50, ratings_per_user=10)  # 生成更多模拟数据
#     existing_ratings = pd.concat([existing_ratings, simulated_ratings], ignore_index=True)
#     existing_ratings.to_csv("ratings.csv", index=False)
#     print("已模拟并保存用户评分数据到 ratings.csv")



# ---  核心推荐逻辑 ---
if existing_ratings.empty or len(existing_ratings) < 10:
    print("使用基于内容的推荐...")
    initial_recs = content_based_recommendation(user_input, all_car_vectors, top_n=5, # 传入 all_car_vectors
                                            min_price=user_input['min_price'],
                                            max_price=user_input['max_price'],
                                            min_odo=user_input['min_odo'],
                                            max_odo=user_input['max_odo'])

    if initial_recs is not None and not initial_recs.empty:
        if {'model', 'trim'}.issubset(initial_recs.columns):
            print("基于您输入的特征，我们推荐以下车辆：")
            print(initial_recs[['year', 'make', 'model', 'trim']])
            user_ratings = get_user_ratings(initial_recs)

            # 保存用户评分 (更新 existing_ratings)
            user_id = existing_ratings['userId'].max() + 1 if not existing_ratings.empty else 0
            ratings_df = pd.DataFrame({'userId': [user_id] * len(user_ratings),
                                        'carId': initial_recs.index,
                                        'rating': user_ratings})
            existing_ratings = pd.concat([existing_ratings, ratings_df], ignore_index=True)
            existing_ratings.to_csv("ratings.csv", index=False)

            print("\n根据您的评分，我们更新了推荐：")
            # 调用矩阵分解函数并保存模型
            matrix_model = matrix_factorization_recommendation(user_ratings, initial_recs, df.copy(), car_vectors, scaler, model_w2v, n_clusters, kmeans, features)
            matrix_model.save('matrix_factorization_model.h5')
            print("矩阵分解模型已保存为 matrix_factorization_model.h5")
        else:
            print("推荐结果中不包含 'model' 或 'trim' 列。")
    else:
        print("找不到符合您输入特征的车辆。")


else:  # 使用混合推荐
    print("使用混合推荐...")
    initial_recs = content_based_recommendation(user_input, all_car_vectors)  # 传入 all_car_vectors

    if initial_recs is not None and not initial_recs.empty:
        if {'model', 'trim'}.issubset(initial_recs.columns):
            print("基于您输入的特征，我们推荐以下车辆：")
            print(initial_recs[['year', 'make', 'model', 'trim']])
            user_ratings = get_user_ratings(initial_recs)

            # 调用矩阵分解函数并处理可能的 None 返回值
            result = matrix_factorization_recommendation(user_ratings, initial_recs, df.copy(), car_vectors, scaler, model_w2v, n_clusters, kmeans, features)

            if result: #检查返回值是否为空
                matrix_model, user2user_encoded, car2car_encoded, mean_rating = result
                matrix_model.save('matrix_factorization_model.h5')
                print("矩阵分解模型已保存为 matrix_factorization_model.h5")

                user_id = existing_ratings['userId'].max() if not existing_ratings.empty else 0 # 使用最大user_id或0
                final_recommendations = hybrid_recommendation(
                    user_id,
                    initial_recs.copy(),
                    matrix_model,
                    user2user_encoded,
                    car2car_encoded,
                    mean_rating,
                )
                print("\n最终混合推荐结果：")
                print(final_recommendations[['year', 'make', 'model', 'trim', 'combined_score']])

            else:
                print("矩阵分解推荐失败，可能没有足够的未观看车辆。") #处理返回值为空的情况


        else:
            print("推荐结果中不包含 'model' 或 'trim' 列。")
    else:
        print("找不到符合您输入特征的车辆。")

# 保存模型和scaler
joblib.dump(model_w2v, 'word2vec_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(kmeans, 'kmeans_model.pkl')

请输入品牌: BMW
请输入型号: camry
请输入版本: LE
请输入车身类型: Unknown
请输入变速箱类型: Automatic
请输入车辆所在州: ca
请输入车况评分 (0-4): 4
请输入颜色: black
请输入内饰颜色: gray
请输入 MMR 评分: 30000
请输入最低价格: 10000
请输入最高价格: 40000
请输入最低里程数: 0
请输入最高里程数: 50000




使用混合推荐...
基于您输入的特征，我们推荐以下车辆：
        year make model  trim
466005  2007  BMW    M6  Base
537582  2010  BMW    M6  Base
460532  2012  BMW  X5 M  Base
501928  2007  BMW  Z4 M  Base
467571  2011  BMW  X5 M  Base
请为以下推荐车辆打分（1-5）：
BMW M6 Base (2007): 4
BMW M6 Base (2010): 3
BMW X5 M Base (2012): 5
BMW Z4 M Base (2007): 4
BMW X5 M Base (2011): 3


  df_ratings['rating'] = df_ratings['rating'].fillna(mean_rating)


没有未观看的车辆可供推荐。 使用默认值。
矩阵分解模型已保存为 matrix_factorization_model.h5

最终混合推荐结果：
        year make model  trim  combined_score
466005  2007  BMW    M6  Base        2.893379
537582  2010  BMW    M6  Base        2.891422
467571  2011  BMW  X5 M  Base        2.868679
460532  2012  BMW  X5 M  Base        2.867665
501928  2007  BMW  Z4 M  Base        2.856661


['kmeans_model.pkl']