In [1]:
import pandas as pd
import numpy as np

# Path of the input CSV file
file_path = 'ObesityDataSet_raw_and_data_sinthetic.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

# Render the DataFrame as the cell output (the notebook abbreviates a long
# frame to its leading and trailing rows)
df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [2]:
# Label -> integer code for every categorical column that gets encoded.
yes_no_mapping = {'yes': 1, 'no': 0}
caec_mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
calc_mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
column_mappings = {
    'Gender': {'Female': 1, 'Male': 0},
    'family_history_with_overweight': yes_no_mapping,
    'FAVC': yes_no_mapping,
    'SMOKE': yes_no_mapping,
    'SCC': yes_no_mapping,
    'CAEC': caec_mapping,
    'CALC': calc_mapping,
}

# Build the encoded columns on the side instead of overwriting df column by
# column: if anything below fails (for example because this cell is run a
# second time), df is left exactly as it was rather than half-encoded.
encoded_columns = {
    column: df[column].map(mapping)
    for column, mapping in column_mappings.items()
}

# Series.map turns every label that is missing from its mapping into NaN.
# Fail loudly instead of letting those NaN values leak into the saved data.
for column, encoded in encoded_columns.items():
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no encoding: {list(unmapped)}"
        )

# Rows labelled with these NObeyesdad classes are removed
excluded_classes = ['Normal_Weight', 'Insufficient_Weight']

# Apply the encodings, drop the excluded rows, then drop the 'MTRANS' and
# 'NObeyesdad' columns; df is rebound once, at the very end.
df = (
    df.assign(**encoded_columns)
    .loc[lambda frame: ~frame['NObeyesdad'].isin(excluded_classes)]
    .drop(columns=['MTRANS', 'NObeyesdad'])
)

In [3]:
# Summary statistics of the Age column (count, mean, std, min, quartiles, max).
# Named after the column it describes: the data is age, not income.
age_summary = df['Age'].describe()

# Show the result
print(age_summary)

count    1552.000000
mean       25.582383
std         6.504869
min        15.000000
25%        21.000961
50%        24.000000
75%        28.770242
max        56.000000
Name: Age, dtype: float64


In [4]:
# Rescaling helper for the Age column
def compress_age(x, age_min=15, age_max=56):
    """Linearly rescale an age so that [age_min, age_max] maps onto [-1, 1].

    Parameters
    ----------
    x : number or array-like supporting arithmetic (e.g. a pandas Series)
        Age value(s) to rescale.
    age_min : number, default 15
        Age that maps to -1. The default is the minimum of the Age column
        reported by ``describe()`` earlier in this notebook.
    age_max : number, default 56
        Age that maps to 1. The default is the maximum of the Age column
        reported by ``describe()`` earlier in this notebook.

    Returns
    -------
    Same kind of object as ``x``, rescaled. Inputs outside
    [age_min, age_max] are not clipped, so they fall outside [-1, 1].

    Raises
    ------
    ValueError
        If ``age_min`` equals ``age_max`` (the range would have zero width).
    """
    span = age_max - age_min
    if span == 0:
        raise ValueError("age_min and age_max must differ")
    # (x - age_min) / span lies in [0, 1]; stretch to [0, 2] and shift to [-1, 1]
    return 2 * ((x - age_min) / span) - 1

# Add the rescaled age as the condensedAge column; the arithmetic inside
# compress_age works on the whole Age Series in one call
df['condensedAge'] = compress_age(df['Age'])

# Show the first rows of the updated DataFrame
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,condensedAge
3,0,27.0,1.8,87.0,0,0,3.0,3.0,1,0,2.0,0,2.0,0.0,2,-0.414634
4,0,22.0,1.78,89.8,0,0,2.0,1.0,1,0,2.0,0,0.0,0.0,1,-0.658537
10,0,26.0,1.85,105.0,1,1,3.0,3.0,2,0,3.0,0,2.0,2.0,1,-0.463415
11,1,21.0,1.72,80.0,1,1,2.0,3.0,2,0,2.0,1,2.0,1.0,1,-0.707317
13,0,41.0,1.8,99.0,0,1,2.0,3.0,1,0,2.0,0,2.0,1.0,2,0.268293


In [5]:
# safeZoneLow: lower edge of the band that contains each condensedAge value.
# Bands are closed on the left, so a value sitting exactly on a breakpoint
# (-0.5, 0 or 0.5) takes that breakpoint as its lower edge.
scaled_age = df['condensedAge']
conditions = [
    scaled_age.lt(-0.5),
    scaled_age.ge(-0.5) & scaled_age.lt(0),
    scaled_age.ge(0) & scaled_age.lt(0.5),
    scaled_age.ge(0.5),
]
# One lower edge per condition, in the same order
choices = [-1, -0.5, 0, 0.5]
df['safeZoneLow'] = np.select(conditions, choices)

# Show the result
print(df[['condensedAge', 'safeZoneLow']])

      condensedAge  safeZoneLow
3        -0.414634         -0.5
4        -0.658537         -1.0
10       -0.463415         -0.5
11       -0.707317         -1.0
13        0.268293          0.0
...            ...          ...
2106     -0.708447         -1.0
2107     -0.659369         -1.0
2108     -0.632974         -1.0
2109     -0.543320         -1.0
2110     -0.577331         -1.0

[1552 rows x 2 columns]


In [6]:
# safeZoneUp: upper edge of the band that contains each condensedAge value.
# Bands are closed on the right here, so a value sitting exactly on a
# breakpoint (-0.5, 0 or 0.5) takes that breakpoint as its upper edge.
scaled_age = df['condensedAge']
conditions_up = [
    scaled_age.le(-0.5),
    scaled_age.gt(-0.5) & scaled_age.le(0),
    scaled_age.gt(0) & scaled_age.le(0.5),
    scaled_age.gt(0.5),
]
# One upper edge per condition, in the same order
choices_up = [-0.5, 0, 0.5, 1]
df['safeZoneUp'] = np.select(conditions_up, choices_up)

# Show the result
print(df[['condensedAge', 'safeZoneUp']])

      condensedAge  safeZoneUp
3        -0.414634         0.0
4        -0.658537        -0.5
10       -0.463415         0.0
11       -0.707317        -0.5
13        0.268293         0.5
...            ...         ...
2106     -0.708447        -0.5
2107     -0.659369        -0.5
2108     -0.632974        -0.5
2109     -0.543320        -0.5
2110     -0.577331        -0.5

[1552 rows x 2 columns]


In [7]:
# safeZone: width of each row's band (upper edge minus lower edge)
df['safeZone'] = df['safeZoneUp'].sub(df['safeZoneLow'])

# Count how often each width occurs; taken before the filter below, so
# zero-width rows (if any) are still included in the tally
value_counts = df['safeZone'].value_counts()

# Keep only the rows whose band has a non-zero width
df = df.loc[df['safeZone'].ne(0)]

# Show the result
print("safeZone 列各个值的取值个数:")
print(value_counts)

safeZone 列各个值的取值个数:
0.5    1552
Name: safeZone, dtype: int64


In [8]:
# Write the modified DataFrame to a CSV file, leaving out the index column
output_path = 'modified_dataframe.csv'
df.to_csv(output_path, index=False)

# Report that the file was saved
print("修改后的DataFrame已保存为 modified_dataframe.csv")

修改后的DataFrame已保存为 modified_dataframe.csv
