In [2]:
import sys
import os

# Make the project's ``src`` directory importable from this notebook.
# The notebook file name is resolved against the current working directory,
# so ``notebook_dir`` is effectively the directory the kernel runs in.
notebook_dir = os.path.split(os.path.abspath('wind_data_analysis.ipynb'))[0]
# ``src`` is expected one level above the notebook directory.
src_path = os.path.abspath(os.path.join(notebook_dir, os.pardir, 'src'))
# Prepend it once; the membership guard keeps re-runs of this cell from
# adding duplicate entries to the import search path.
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import sweetviz as sv
import pandas as pd
import data_preprocessing as dp
from sklearn.model_selection import train_test_split

In [3]:
# --- Configuration ---------------------------------------------------------
CITY_FOR_POWER_DATA = '阿拉善盟'
# The weather loader is keyed by a different city name; the mapping lives in
# the data_preprocessing module (imported as ``dp``).
CITY_FOR_WEATHER_DATA = dp.CITY_NAME_MAPPING_DICT[CITY_FOR_POWER_DATA]
TARGET_VARIABLE = 'wind_output'

# --- 1. Data loading -------------------------------------------------------
# Weather and wind-power histories are loaded separately, then merged.
weather_df = dp.get_history_weather_data_for_city(CITY_FOR_WEATHER_DATA)
power_df = dp.get_history_wind_power_for_city(CITY_FOR_POWER_DATA)
merged_df = dp.merge_weather_and_power_df(weather_df, power_df)

# --- 2. Preprocessing ------------------------------------------------------
preprocessed_df = dp.preprocess_data(merged_df, CITY_FOR_POWER_DATA)
# A copy is handed over so ``preprocessed_df`` itself stays available as-is.
time_wise_df = dp.set_time_wise_feature(preprocessed_df.copy())

# Optional date filter (disabled):
# start_date = pd.Timestamp('2025-01-01')
# time_wise_df = time_wise_df[time_wise_df.index >= start_date]

# Features are everything except the target and the two bookkeeping columns;
# ``errors='ignore'`` tolerates frames where those columns are absent.
X_raw = time_wise_df.drop(
    columns=[TARGET_VARIABLE, 'time_idx', 'group_id'],
    errors='ignore',
)
y = time_wise_df[TARGET_VARIABLE]

# One-hot encode the categorical features that are actually present.
categorical_features_def = ['wind_season', 'year', 'month', 'day', 'hour']
actual_categorical_features = [
    feature for feature in categorical_features_def if feature in X_raw.columns
]
X_encoded = pd.get_dummies(
    X_raw,
    columns=actual_categorical_features,
    drop_first=True,
)
# Column order of the encoded feature matrix, kept as a plain list.
X_train_columns = X_encoded.columns.tolist()

# --- 3. Train / validation / test split ------------------------------------
# ``shuffle=False`` keeps the rows in their original order, so each split is
# a contiguous slice: first 60% train, next 20% validation, last 20% test.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_encoded, y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)
# 0.25 of the remaining 80% equals 20% of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=0.25,
    shuffle=False,
    random_state=42,
)

No duplicate rows found
Initial missing timestamps: 72
Dealing with missing data.
Initial missing dates:
 2025-04-29    24
2025-04-30    24
2025-04-28    23
2025-05-01     1
Name: count, dtype: int64
Missing timestamps per day:
date
2024-01-01    0
2024-01-02    0
2024-01-03    0
2024-01-04    0
2024-01-05    0
             ..
2025-05-22    0
2025-05-23    0
2025-05-24    0
2025-05-25    0
2025-05-26    0
Length: 512, dtype: int64
Using threshold_percent=0.05, threshold_hours=1

Days to drop: 3
Total missing timestamps: 72

Missing timestamps by date:
2025-04-28    24
2025-04-29    24
2025-04-30    24
Name: count, dtype: int64


In [None]:
# Re-attach the label to each feature split so that every split can be
# analysed as a single frame. The column is named via TARGET_VARIABLE (set in
# the data-loading cell) rather than a repeated string literal, so the label
# column cannot drift out of sync with the configured target.
# ``assign`` returns a new frame; the X_* splits are left unmodified.
train_df = X_train.assign(**{TARGET_VARIABLE: y_train})

val_df = X_val.assign(**{TARGET_VARIABLE: y_val})

test_df = X_test.assign(**{TARGET_VARIABLE: y_test})

In [6]:
# Sweetviz fails on bool columns with
# 'cannot use a single bool to index into setitem', so cast them to integers.
def convert_bool_columns(df):
    """Return a copy of ``df`` with every bool-dtype column cast to ``int``.

    Columns of any other dtype are carried over unchanged, and the input
    frame itself is not modified.
    """
    bool_names = df.select_dtypes(include=['bool']).columns
    # A single astype call with a per-column mapping yields a new frame.
    return df.astype({name: int for name in bool_names})

# Report which columns are bool-typed before conversion (diagnostic only).
bool_cols_train = train_df.select_dtypes(include=['bool']).columns.tolist()
print(f"布尔型列: {bool_cols_train}")

# Apply the bool -> int conversion to every split.
train_df_fixed = convert_bool_columns(train_df)
val_df_fixed = convert_bool_columns(val_df)
test_df_fixed = convert_bool_columns(test_df)

# Confirm that no bool columns remain after conversion.
bool_cols_after = train_df_fixed.select_dtypes(include=['bool']).columns.tolist()
print(f"转换后的布尔型列: {bool_cols_after}")

# Pairwise Sweetviz comparison reports on the converted frames.
# ``sv`` comes from the notebook's import cell, so it is not re-imported here.
# Each entry: (left label, left frame, right label, right frame, output file).
comparison_specs = [
    ("Train", train_df_fixed, "Validation", val_df_fixed, "train_vs_val.html"),
    ("Train", train_df_fixed, "Test", test_df_fixed, "train_vs_test.html"),
    ("Validation", val_df_fixed, "Test", test_df_fixed, "val_vs_test.html"),
]
sweetviz_reports = []
for left_name, left_df, right_name, right_df, html_path in comparison_specs:
    report = sv.compare([left_df, left_name], [right_df, right_name])
    report.show_html(html_path)
    sweetviz_reports.append(report)

# Keep the per-pair names bound for any other cell that refers to them.
report_train_val, report_train_test, report_val_test = sweetviz_reports

布尔型列: ['wind_season_small', 'year_2025', 'month_10', 'month_11', 'month_12', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_2', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27', 'day_28', 'day_29', 'day_3', 'day_30', 'day_31', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'hour_1', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_2', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9']
转换后的布尔型列: []


                                             |          | [  0%]   00:00 -> (? left)

  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)


Report train_vs_val.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


                                             |          | [  0%]   00:00 -> (? left)

  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)


Report train_vs_test.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


                                             |          | [  0%]   00:00 -> (? left)

  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)
  new_dataframe[feature] = pd.Series(dtype=float)


Report val_vs_test.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [7]:
# Load the solar power history for the same city key that the wind data uses.
solar_power_df = dp.get_history_solar_power_for_city(CITY_FOR_POWER_DATA)


  output_df['solar_output'] = output_df['solar_output'].resample('H', closed='right', label='right').mean()


Unnamed: 0_level_0,solar_output
datetime,Unnamed: 1_level_1
2023-01-01 01:00:00,0.0
2023-01-01 02:00:00,0.0
2023-01-01 03:00:00,0.0
2023-01-01 04:00:00,0.0
2023-01-01 05:00:00,0.0
