In [1]:
# -*- coding: utf-8 -*-
"""
睡眠质量分析与改善建议系统 - 核心代码实现
"""

import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

# 确保输出目录存在
os.makedirs('data', exist_ok=True)

# ==================== 1. 生成设备数据 ====================
def generate_device_data(num_users=30, days=14):
    """生成智能设备睡眠监测数据"""
    np.random.seed(42)
    base_date = datetime.now().replace(hour=0, minute=0, second=0)
    user_ids = [f'user_{i}' for i in range(1, num_users+1)]
    
    records = []
    for uid in user_ids:
        # 每个用户有自己的作息模式
        bedtime_mean = np.random.normal(22.5, 1.0)  # 平均入睡时间
        wakeup_mean = np.random.normal(6.5, 0.5)    # 平均醒来时间
        
        for day in range(days):
            date = base_date - timedelta(days=day)
            
            # 生成睡前记录（入睡前1小时）
            bedtime = bedtime_mean + np.random.normal(0, 0.5)
            records.append([
                uid,
                date + timedelta(hours=bedtime-1),
                np.random.normal(65, 5),  # heart_rate
                np.random.normal(1.8, 0.3),  # deep_sleep (预测值)
                np.random.normal(4.0, 0.5),  # light_sleep (预测值)
                np.random.normal(1.5, 0.2),  # rem_sleep (预测值)
                np.random.poisson(10),  # movement
                np.nan  # 总睡眠时长（醒来后填充）
            ])
            
            # 生成醒来记录
            wakeup = wakeup_mean + np.random.normal(0, 0.3)
            total_sleep = (wakeup - bedtime) % 12  # 计算睡眠时长
            records.append([
                uid,
                date + timedelta(hours=wakeup),
                np.random.normal(70, 5),  # heart_rate
                records[-1][3] + np.random.normal(0, 0.1),  # 实际深睡
                records[-1][4] + np.random.normal(0, 0.2),  # 实际浅睡
                records[-1][5] + np.random.normal(0, 0.1),  # 实际REM
                np.random.poisson(15),  # movement
                total_sleep  # 总睡眠时长
            ])
    
    df = pd.DataFrame(records, columns=[
        'user_id', 'timestamp', 'heart_rate', 'deep_sleep',
        'light_sleep', 'rem_sleep', 'movement', 'total_sleep'
    ])
    
    # 添加一些缺失值(5%)
    mask = np.random.rand(*df.shape) < 0.05
    df = df.mask(mask)
    
    df.to_csv('data/device_data.csv', index=False)
    print(f"生成设备数据：{len(df)}条记录，保存至 data/device_data.csv")

# ==================== 2. 生成问卷数据 ====================
def generate_survey_data(num_users=30, days=14):
    """生成每日睡眠问卷数据"""
    np.random.seed(42)
    base_date = datetime.now().replace(hour=0, minute=0, second=0)
    user_ids = [f'user_{i}' for i in range(1, num_users+1)]
    
    records = []
    for uid in user_ids:
        # 用户基础特征
        base_stress = np.random.normal(5, 2)
        base_caffeine = np.random.randint(1, 6)
        base_exercise = np.random.randint(2, 6)
        
        for day in range(days):
            date = base_date - timedelta(days=day)
            
            # 根据设备数据生成相关问卷答案
            records.append([
                uid,
                date.date(),
                min(10, max(1, int(base_stress + np.random.normal(0, 1.5)))),  # stress_level
                min(10, max(0, int(base_caffeine + np.random.poisson(1)))),    # caffeine_intake
                min(7, max(0, int(base_exercise + np.random.normal(0, 1)))),   # exercise_freq
                np.random.uniform(0.5, 4.0),                                 # screen_time
                np.random.normal(22.5, 1.5),                                 # bedtime_hour
                np.random.choice([1,2,3], p=[0.3,0.5,0.2])                   # sleep_quality
            ])
    
    df = pd.DataFrame(records, columns=[
        'user_id', 'survey_date', 'stress_level', 'caffeine_intake',
        'exercise_freq', 'screen_time', 'bedtime_hour', 'sleep_quality'
    ])
    
    # 添加10%缺失值
    mask = np.random.rand(*df.shape) < 0.1
    df = df.mask(mask)
    
    df.to_excel('data/survey_data.xlsx', index=False)
    print(f"生成问卷数据：{len(df)}条记录，保存至 data/survey_data.xlsx")

# ==================== 3. 生成环境数据 ====================
def generate_environment_data(num_users=30, days=14):
    """生成卧室环境监测数据"""
    np.random.seed(42)
    base_date = datetime.now().replace(hour=0, minute=0, second=0)
    user_ids = [f'user_{i}' for i in range(1, num_users+1)]
    
    records = []
    for uid in user_ids:
        # 用户基础环境特征
        base_temp = np.random.normal(22, 2)
        base_humidity = np.random.normal(50, 10)
        base_noise = np.random.normal(40, 5)
        
        for day in range(days):
            date = base_date - timedelta(days=day)
            
            # 生成睡前2小时的环境记录
            for hour_offset in [20, 21, 22]:  # 晚上8点、9点、10点
                records.append([
                    uid,
                    date + timedelta(hours=hour_offset),
                    base_temp + np.random.normal(0, 0.5),        # temperature
                    base_humidity + np.random.normal(0, 3),     # humidity
                    max(30, base_noise + np.random.normal(0, 2))  # noise_level
                ])
    
    df = pd.DataFrame(records, columns=[
        'user_id', 'record_time', 'temperature', 'humidity', 'noise_level'
    ])
    
    # 添加5%缺失值
    mask = np.random.rand(*df.shape) < 0.05
    df = df.mask(mask)
    
    df.to_csv('data/environment_data.csv', index=False)
    print(f"生成环境数据：{len(df)}条记录，保存至 data/environment_data.csv")

# ==================== 执行生成 ====================
if __name__ == '__main__':
    print("开始生成模拟数据...")
    generate_device_data()
    generate_survey_data()
    generate_environment_data()
    print("\n所有数据已生成完毕，保存在data目录下")
    print("文件结构：")
    print("├── data/device_data.csv")
    print("├── data/survey_data.xlsx")
    print("└── data/environment_data.csv")

开始生成模拟数据...
生成设备数据：840条记录，保存至 data/device_data.csv
生成问卷数据：420条记录，保存至 data/survey_data.xlsx
生成环境数据：1260条记录，保存至 data/environment_data.csv

所有数据已生成完毕，保存在data目录下
文件结构：
├── data/device_data.csv
├── data/survey_data.xlsx
└── data/environment_data.csv


In [3]:
# ==================== 1. 数据加载模块 ====================
class DataLoader:
    def __init__(self):
        self.device_data_path = 'data/device_data.csv'
        self.survey_data_path = 'data/survey_data.xlsx'
        self.env_data_path = 'data/environment_data.csv'
    
    def load_all_data(self):
        """加载所有数据源并初步整合"""
        # 读取设备数据
        device_df = pd.read_csv(self.device_data_path, parse_dates=['timestamp'])
        device_df = device_df.sort_values(by=['user_id', 'timestamp'])
        
        # 读取问卷数据
        survey_df = pd.read_excel(self.survey_data_path)
        survey_df['survey_date'] = pd.to_datetime(survey_df['survey_date'])
        
        # 读取环境数据
        env_df = pd.read_csv(self.env_data_path, parse_dates=['record_time'])
        env_df = env_df.sort_values(by=['user_id', 'record_time'])
        
        return device_df, survey_df, env_df
    
    def merge_data(self, device_df, survey_df, env_df):
        """安全可靠的数据合并方法（根据实际数据修正）"""
        try:
            # 1. 设备数据预处理
            device_df = device_df.copy()
            device_df['date'] = pd.to_datetime(device_df['timestamp'].dt.date)
            device_daily = device_df.groupby(['user_id', 'date']).mean(numeric_only=True).reset_index()
            
            # 2. 问卷数据预处理（处理NaT日期）
            survey_df = survey_df.copy()
            survey_df = survey_df[survey_df['survey_date'].notna()]  # 过滤无效日期
            survey_df['date'] = pd.to_datetime(survey_df['survey_date'].dt.date)
            
            # 3. 环境数据预处理
            env_df = env_df.copy()
            env_df['date'] = pd.to_datetime(env_df['record_time'].dt.date)
            env_daily = env_df.groupby(['user_id', 'date']).mean(numeric_only=True).reset_index()
            
            # 4. 分步合并（设备和问卷）
            merged_df = pd.merge(
                device_daily,
                survey_df.drop(columns=['survey_date']),
                on=['user_id', 'date'],
                how='left'  # 保留所有设备记录
            )
            
            # 5. 合并环境数据
            final_df = pd.merge(
                merged_df,
                env_daily,
                on=['user_id', 'date'],
                how='left'
            )
            
            # 6. 清理临时列并返回
            return final_df.drop(columns=['date'])
            
        except Exception as e:
            print(f"合并失败: {str(e)}")
            print("设备数据列:", device_df.columns.tolist())
            print("问卷数据列:", survey_df.columns.tolist())
            print("环境数据列:", env_df.columns.tolist())
            return None

In [4]:
# ==================== 2. 数据预处理模块 ====================
class DataPreprocessor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.imputer = IterativeImputer(random_state=42)
        self.encoder = OneHotEncoder(handle_unknown='ignore')
    
    def handle_missing_values(self, df):
        """处理缺失值"""
        # 设备数据缺失处理
        device_cols = ['heart_rate', 'deep_sleep', 'light_sleep', 'rem_sleep', 'movement']
        df[device_cols] = self.imputer.fit_transform(df[device_cols])
        
        # 问卷数据缺失处理
        survey_cols = ['stress_level', 'caffeine_intake', 'exercise_freq']
        for col in survey_cols:
            df[col].fillna(df[col].median(), inplace=True)
        
        # 环境数据缺失处理
        env_cols = ['temperature', 'humidity', 'noise_level']
        df[env_cols] = df[env_cols].interpolate(method='linear')
        
        return df
    
    def detect_outliers(self, df):
        """异常值检测与处理"""
        # 生理参数合理范围检测
        df = df[(df['heart_rate'] >= 40) & (df['heart_rate'] <= 120)]
        df = df[(df['deep_sleep'] >= 0.5) & (df['deep_sleep'] <= 4)]
        
        # 使用孤立森林检测多变量异常
        clf = IsolationForest(contamination=0.05, random_state=42)
        features = ['heart_rate', 'deep_sleep', 'light_sleep', 'movement']
        outliers = clf.fit_predict(df[features])
        df = df[outliers == 1]
        
        return df
    
    def feature_engineering(self, df):
        """特征工程"""
        # 计算睡眠效率
        df['sleep_efficiency'] = df['deep_sleep'] / df['total_sleep']
        
        # 计算昼夜节律指标
        df['bedtime_deviation'] = abs(df['bedtime_hour'] - 22.5)  # 理想入睡时间22:30
        
        # 构造生活习惯综合指标
        df['lifestyle_score'] = (
            0.3 * df['exercise_freq'] + 
            0.2 * (1 - df['caffeine_intake']/10) + 
            0.5 * (1 - df['screen_time']/5)
        )
        
        # 环境舒适度指标
        df['env_comfort'] = (
            (df['temperature'].between(20, 23)).astype(int) +
            (df['humidity'].between(40, 60)).astype(int) +
            (df['noise_level'] < 45).astype(int)
        ) / 3
        
        return df
    
    def preprocess_pipeline(self, df):
        """完整预处理流水线"""
        df = self.handle_missing_values(df)
        df = self.detect_outliers(df)
        df = self.feature_engineering(df)
        return df

In [5]:
# ==================== 3. 分析建模模块 ====================
class SleepAnalyzer:
    def __init__(self):
        self.models = {
            'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
            'XGBoost': XGBClassifier(random_state=42),
        }
    
    def prepare_data(self, df):
        """准备建模数据 - 添加列存在性检查"""
        # 定义特征和目标变量
        required_cols = [
            'deep_sleep', 'light_sleep', 'rem_sleep', 'heart_rate',
            'movement', 'stress_level', 'caffeine_intake', 'exercise_freq',
            'sleep_efficiency', 'bedtime_deviation', 'lifestyle_score', 'env_comfort',
            'sleep_quality'
        ]

        # 检查缺失列
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"缺失必要列: {missing_cols}")

        # 删除目标变量为NaN的行并将标签转换为从0开始
        df = df.dropna(subset=['sleep_quality'])
        df['sleep_quality'] = df['sleep_quality'].astype(int) - 1  # 将[1,2,3]转换为[0,1,2]

        features = required_cols[:-1]  # 最后一列是目标变量
        target = 'sleep_quality'

        X = df[features]
        y = df[target]

        # 数据标准化
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        return train_test_split(X_scaled, y, test_size=0.2, random_state=42), features
    
    def train_models(self, X_train, X_test, y_train, y_test, features):
        """训练和评估模型 - 添加features参数"""
        results = {}
    
        for name, model in self.models.items():
            # 为XGBoost设置类别数量
            if name == 'XGBoost':
                model.set_params(num_class=len(np.unique(y_train)))

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            report = classification_report(y_test, y_pred, output_dict=True)
            results[name] = {
                'accuracy': report['accuracy'],
                'precision': report['weighted avg']['precision'],
                'recall': report['weighted avg']['recall'],
                'f1': report['weighted avg']['f1-score']
            }

            # 特征重要性分析（需要特征名称）
            if hasattr(model, 'feature_importances_'):
                importances = model.feature_importances_
                results[name]['feature_importance'] = dict(zip(features, importances))

        return results
    
    def cluster_analysis(self, df, n_clusters=3):
        """聚类分析"""
        # 选择聚类特征
        cluster_features = [
            'deep_sleep', 'sleep_efficiency', 'bedtime_deviation',
            'movement', 'stress_level'
        ]

        # 检查并处理缺失值
        df_cluster = df[cluster_features].copy()
        if df_cluster.isna().any().any():
            print("警告：聚类特征中存在缺失值，将进行填充")
            df_cluster = df_cluster.fillna(df_cluster.median())

        # 数据标准化
        X_scaled = StandardScaler().fit_transform(df_cluster)

        # K-means聚类
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(X_scaled)

        # 分析聚类特征
        df['cluster'] = clusters
        cluster_profiles = df.groupby('cluster')[cluster_features].mean()

        return df, cluster_profiles

In [6]:
# ==================== 4. 可视化模块 ====================
class SleepVisualizer:
    def plot_sleep_stages(self, df):
        """绘制睡眠阶段雷达图"""
        fig = go.Figure()
        
        for quality in df['sleep_quality'].unique():
            subset = df[df['sleep_quality'] == quality]
            fig.add_trace(go.Scatterpolar(
                r=[
                    subset['deep_sleep'].mean(),
                    subset['light_sleep'].mean(),
                    subset['rem_sleep'].mean(),
                    subset['movement'].mean()
                ],
                theta=['深睡', '浅睡', 'REM睡眠', '体动'],
                fill='toself',
                name=f'睡眠质量{quality}'
            ))
        
        fig.update_layout(
            polar=dict(radialaxis=dict(visible=True)),
            showlegend=True,
            title='不同睡眠质量人群的睡眠阶段分布'
        )
        return fig
    
    def plot_correlation_heatmap(self, df):
        """绘制相关性热力图"""
        numeric_cols = df.select_dtypes(include=np.number).columns
        corr_matrix = df[numeric_cols].corr()
        
        fig = px.imshow(
            corr_matrix,
            text_auto=True,
            color_continuous_scale='RdBu',
            range_color=[-1, 1],
            title='睡眠影响因素相关性热力图'
        )
        return fig
    
    def plot_cluster_profiles(self, cluster_profiles):
        """绘制聚类特征剖面图"""
        fig = make_subplots(rows=1, cols=len(cluster_profiles.columns), 
                           subplot_titles=cluster_profiles.columns)
        
        for i, col in enumerate(cluster_profiles.columns, 1):
            fig.add_trace(
                go.Bar(
                    x=cluster_profiles.index,
                    y=cluster_profiles[col],
                    name=col
                ),
                row=1, col=i
            )
        
        fig.update_layout(
            height=400,
            showlegend=False,
            title_text='不同睡眠人群的特征剖面'
        )
        return fig


In [7]:
# ==================== 5. 主执行流程 ====================
if __name__ == '__main__':
    print("=== 睡眠质量分析系统开始运行 ===")
    
    # 1. 数据加载
    print("加载数据...")
    loader = DataLoader()
    device_df, survey_df, env_df = loader.load_all_data()
    merged_df = loader.merge_data(device_df, survey_df, env_df)
    
    # 2. 数据预处理
    print("预处理数据...")
    preprocessor = DataPreprocessor()
    processed_df = preprocessor.preprocess_pipeline(merged_df)
    
    # 3. 分析建模
    print("进行分析建模...")
    analyzer = SleepAnalyzer()
    
    # 3.1 预测建模
    (X_train, X_test, y_train, y_test), features = analyzer.prepare_data(processed_df)
    model_results = analyzer.train_models(X_train, X_test, y_train, y_test, features)
    print("\n模型性能对比:")
    print(pd.DataFrame(model_results).T)
    
    # 3.2 聚类分析
    clustered_df, cluster_profiles = analyzer.cluster_analysis(processed_df)
    print("\n聚类特征分析:")
    print(cluster_profiles)
    
    # 4. 可视化
    print("生成可视化图表...")
    visualizer = SleepVisualizer()
    
    # 保存可视化结果
    fig1 = visualizer.plot_sleep_stages(clustered_df)
    fig1.write_html("output/sleep_stages_radar.html")
    
    fig2 = visualizer.plot_correlation_heatmap(processed_df)
    fig2.write_html("output/correlation_heatmap.html")
    
    fig3 = visualizer.plot_cluster_profiles(cluster_profiles)
    fig3.write_html("output/cluster_profiles.html")
    
    print("分析完成! 结果已保存到output目录")

=== 睡眠质量分析系统开始运行 ===
加载数据...
预处理数据...
进行分析建模...

模型性能对比:
              accuracy precision    recall        f1  \
RandomForest  0.372881  0.302275  0.372881  0.330102   
XGBoost       0.305085  0.279789  0.305085  0.280801   

                                             feature_importance  
RandomForest  {'deep_sleep': 0.0968934532969116, 'light_slee...  
XGBoost       {'deep_sleep': 0.069998905, 'light_sleep': 0.0...  
警告：聚类特征中存在缺失值，将进行填充

聚类特征分析:
         deep_sleep  sleep_efficiency  bedtime_deviation   movement  \
cluster                                                               
0          1.834008          0.229896           2.329850  12.053480   
1          1.993949          0.261292           0.807091  12.887571   
2          1.556376          0.183495           0.713439  12.079363   

         stress_level  
cluster                
0            5.876712  
1            4.597561  
2            5.000000  
生成可视化图表...
分析完成! 结果已保存到output目录
