In [None]:
# -*- coding: utf-8 -*-
"""0624-1.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1winBQSscLH--Vrp3pSQRbNnZDDgDoaZn
"""

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score,make_scorer
import plotly.express as px
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.utils import to_categorical
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import joblib  # 用于保存和加载模型

# Load the training and test tables from the mounted Google Drive.
train_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/train2.csv')
test_df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/test.csv')

# Data preprocessing and feature engineering
def preprocess_data(df, scaler=None, label_encoders=None):
    """Encode, impute and standardize a raw stroke DataFrame.

    The function works on a copy of ``df``; the caller's frame is not
    modified. Steps, in order:

    1. Bin ``age`` into ``age_group`` (codes 0-4) and ``avg_glucose_level``
       into ``avg_glucose_level_group`` (codes 0-2).
    2. Map ``gender``, ``ever_married`` and ``Residence_type`` to integers.
    3. Label-encode ``work_type`` and ``smoking_status``.
    4. Fill missing ``bmi`` values with a decision-tree regression on
       ``age`` and ``gender`` fitted on the rows of this same frame that
       do have a ``bmi``.
    5. Standardize ``age``, ``avg_glucose_level`` and ``bmi`` into the
       ``age_log``, ``avg_glucose_level_log`` and ``bmi_log`` columns
       (despite the names, no logarithm is taken).

    Args:
        df: Frame holding the raw columns ``age``, ``avg_glucose_level``,
            ``bmi``, ``gender``, ``ever_married``, ``Residence_type``,
            ``work_type`` and ``smoking_status`` as read from the CSV
            (string categories, not yet encoded).
        scaler: Already-fitted scaler exposing ``transform``. When ``None``
            a new ``StandardScaler`` is fitted on ``df``.
        label_encoders: Optional dict mapping column name to a fitted
            encoder. A column found in the dict is encoded with that
            encoder's ``transform`` so that several frames share the same
            integer codes; a column not found is fitted on ``df`` and the
            new encoder is stored in the dict. When ``None`` (the default)
            fresh encoders are fitted on every call, as before.
            Typical use::

                encoders = {}
                train_df, scaler = preprocess_data(train_df, label_encoders=encoders)
                test_df, _ = preprocess_data(test_df, scaler=scaler, label_encoders=encoders)

            NOTE: scikit-learn's ``LabelEncoder.transform`` rejects labels
            it was not fitted on, so the frame used for fitting must
            contain every category.

    Returns:
        Tuple ``(df, scaler)``: the processed copy and the scaler that was
        used (the one passed in, or the newly fitted one).

    Raises:
        ValueError: When a value falls outside the bins or outside one of
            the category mappings; it becomes NaN and cannot be cast to an
            integer dtype.
    """
    df = df.copy()

    # labels=False makes pd.cut return the bin index itself (0 for the first
    # bin, 1 for the second, ...), i.e. the same codes the named labels were
    # previously mapped to, without a replace() on a categorical column.
    age_bins = [0, 18, 30, 45, 60, 100]
    df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=False).astype(np.uint8)

    avg_glucose_level_bins = [0, 100, 125, 300]
    df['avg_glucose_level_group'] = pd.cut(
        df['avg_glucose_level'], bins=avg_glucose_level_bins, labels=False
    ).astype(np.uint8)

    # Convert the categorical columns to numbers.
    # gender uses a signed dtype: 'Other' is encoded as -1, which an unsigned
    # 8-bit integer cannot represent.
    df['gender'] = df['gender'].map({'Male': 0, 'Female': 1, 'Other': -1}).astype(np.int8)
    df['ever_married'] = df['ever_married'].map({'No': 0, 'Yes': 1}).astype(np.uint8)
    df['Residence_type'] = df['Residence_type'].map({'Rural': 0, 'Urban': 1}).astype(np.uint8)

    for column in ['work_type', 'smoking_status']:
        encoder = None if label_encoders is None else label_encoders.get(column)
        if encoder is None:
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            if label_encoders is not None:
                # Hand the fitted encoder back so later calls can reuse it.
                label_encoders[column] = encoder
        else:
            df[column] = encoder.transform(df[column])

    # Impute missing bmi values from age and gender.
    bmi_missing = df['bmi'].isna()
    if bmi_missing.any():
        bmi_features = ['age', 'gender']
        DT_bmi_pipe = Pipeline(steps=[
            ('scale', StandardScaler()),
            ('lr', DecisionTreeRegressor(random_state=42))
        ])
        DT_bmi_pipe.fit(df.loc[~bmi_missing, bmi_features], df.loc[~bmi_missing, 'bmi'])
        predicted_bmi = DT_bmi_pipe.predict(df.loc[bmi_missing, bmi_features])
        df.loc[bmi_missing, 'bmi'] = np.round(predicted_bmi, 1)

    # Standardize the continuous columns; reuse the caller's scaler when one
    # is given so that every frame is scaled with the same statistics.
    numeric_columns = ['age', 'avg_glucose_level', 'bmi']
    scaled_columns = ['age_log', 'avg_glucose_level_log', 'bmi_log']
    if scaler is not None:
        df[scaled_columns] = scaler.transform(df[numeric_columns])
    else:
        scaler = StandardScaler()
        df[scaled_columns] = scaler.fit_transform(df[numeric_columns])

    return df, scaler



# Move the 'stroke' column to the last position
def move_stroke_to_end(df):
    """Reposition the ``stroke`` column as the final column of ``df``.

    The frame is changed in place and the same object is returned. A
    ``KeyError`` is raised by ``pop`` when ``df`` has no ``stroke`` column.
    """
    # pop() removes the column; assigning it back appends it at the end.
    df['stroke'] = df.pop('stroke')
    return df

# Drop training rows that meet any of these conditions:
# avg_glucose_level <= 50, bmi >= 50, bmi <= 19, or age < 50.
# The combined condition is negated as a whole, so a row whose bmi is missing
# (every comparison with NaN is False) is dropped only by the glucose or age test.
train_df = train_df.loc[
    ~(
        (train_df['avg_glucose_level'] <= 50)
        | (train_df['bmi'] >= 50)
        | (train_df['bmi'] <= 19)
        | (train_df['age'] < 50)
    )
]

# Preprocess the training rows (this fits the scaler), then put the target last.
train_df, scaler = preprocess_data(train_df)
train_df = move_stroke_to_end(train_df)

# Preprocess the test rows with the scaler fitted on the training rows.
print(test_df.shape)
test_df, _ = preprocess_data(test_df, scaler=scaler)

# Save the processed training table.
train_df.to_csv(path_or_buf='/content/drive/MyDrive/result.csv', index=False)

"""#https://innolux.webex.com/innolux-tc/url.php?gourl=https%3a%2f%2fscikit%2dlearn%2eorg%2fstable%2fmodules%2fgenerated%2fsklearn%2eutils%2eclass%5fweight%2ecompute%5fclass%5fweight%2ehtml

class weight
"""

# Separate the target from the model inputs; 'id' is not used as a feature.
y = train_df['stroke']
X = train_df.drop(columns=['id', 'stroke'])

X_test = test_df.drop(columns=['id'])

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)

# Handle class imbalance with SMOTE.
sm = SMOTE(random_state=42)
# Resampled copy of the whole training split. It is NOT used for the grid
# search below (resampling there happens inside each CV fold); the names are
# kept so they remain available for inspection.
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# imbalanced-learn's Pipeline accepts samplers as steps and applies them only
# while fitting. Putting SMOTE inside the pipeline means each cross-validation
# fold is oversampled using its training part only; oversampling before the
# split would place synthetic points derived from validation rows into the
# training folds and inflate the cross-validated F1.
from imblearn.pipeline import Pipeline as ImbPipeline

# Random forest classifier with a heavier weight on the positive class.
rf_classifier = RandomForestClassifier(class_weight={0: 1, 1: 20}, random_state=1000)

smote_rf_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', rf_classifier),
])

# Parameter space to search; the 'rf__' prefix routes each parameter to the
# random-forest step of the pipeline.
param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
}

# F1-score is the selection metric.
scorer = make_scorer(f1_score)

grid_search = GridSearchCV(estimator=smote_rf_pipeline, param_grid=param_grid,
                           scoring=scorer, cv=5, verbose=1)

# Fit on the original (un-resampled) training split; the pipeline oversamples
# inside every fold and again for the final refit of the best candidate.
grid_search.fit(X_train, y_train)

# Report the winning hyper-parameters.
print("Best parameters found: ", grid_search.best_params_)

# Predict the validation split with the refitted best model.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_val)

# Validation metrics: accuracy and F1-score.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
f1 = f1_score(y_true=y_val, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(f"F1-score: {f1:.2f}")

# Confusion matrix and per-class report.
print(confusion_matrix(y_true=y_val, y_pred=y_pred))
print(classification_report(y_true=y_val, y_pred=y_pred))

# Predictions for the test rows.
result = best_rf_model.predict(X_test)

# Build the submission file: start from the sample, fill in the predictions,
# and convert the 1/0 predictions to True/False before saving.
submission = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/sample_submission.csv')
submission = submission.assign(stroke=result)
submission = submission.assign(stroke=submission['stroke'].map({1: True, 0: False}))
submission.to_csv(path_or_buf='/content/drive/MyDrive/submission_result.csv', index=False)





In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, precision_score, f1_score,make_scorer
import plotly.express as px
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.utils import to_categorical
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import joblib  # 用于保存和加载模型

ModuleNotFoundError: No module named 'tensorflow'