In [1]:
import pandas as pd
import numpy as np

# Load the OCD patient dataset from a CSV file (path is relative to the
# working directory).
file_path = 'ocd_patient_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Display the loaded frame as the cell output.
df

Unnamed: 0,Patient ID,Age,Gender,Ethnicity,Marital Status,Education Level,OCD Diagnosis Date,Duration of Symptoms (months),Previous Diagnoses,Family History of OCD,Obsession Type,Compulsion Type,Y-BOCS Score (Obsessions),Y-BOCS Score (Compulsions),Depression Diagnosis,Anxiety Diagnosis,Medications
0,1018,32,Female,African,Single,Some College,2016-07-15,203,MDD,No,Harm-related,Checking,17,10,Yes,Yes,SNRI
1,2406,69,Male,African,Divorced,Some College,2017-04-28,180,,Yes,Harm-related,Washing,21,25,Yes,Yes,SSRI
2,1188,57,Male,Hispanic,Divorced,College Degree,2018-02-02,173,MDD,No,Contamination,Checking,3,4,No,No,Benzodiazepine
3,6200,27,Female,Hispanic,Married,College Degree,2014-08-25,126,PTSD,Yes,Symmetry,Washing,14,28,Yes,Yes,SSRI
4,5824,56,Female,Hispanic,Married,High School,2022-02-20,168,PTSD,Yes,Hoarding,Ordering,39,18,No,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,5374,38,Male,Hispanic,Divorced,College Degree,2019-01-10,53,MDD,No,Contamination,Washing,21,33,Yes,Yes,SSRI
1496,5013,19,Female,Hispanic,Divorced,Graduate Degree,2022-09-14,160,GAD,Yes,Hoarding,Praying,25,16,Yes,Yes,SSRI
1497,6089,40,Male,Asian,Married,Some College,2018-03-13,100,,Yes,Contamination,Counting,2,15,Yes,Yes,Benzodiazepine
1498,3808,37,Female,Caucasian,Married,Some College,2018-04-14,210,GAD,Yes,Contamination,Washing,16,7,Yes,No,Benzodiazepine


In [2]:
# Integer codes for each categorical column that is kept.
# NOTE(review): the codes are arbitrary labels rather than an ordinal scale
# (e.g. 'Some College' = 1 comes before 'High School' = 2) — confirm that
# downstream consumers do not rely on their order.
gender_mapping = {'Female': 1, 'Male': 0}
ethnicity_mapping = {'African': 1, 'Hispanic': 2, 'Asian': 3, 'Caucasian': 4}
marital_status_mapping = {'Single': 1, 'Divorced': 2, 'Married': 3}
education_mapping = {'Some College': 1, 'High School': 2, 'College Degree': 3, 'Graduate Degree': 4}

# Replace every label with its code; labels missing from a mapping become NaN.
for column, codes in (
    ('Gender', gender_mapping),
    ('Ethnicity', ethnicity_mapping),
    ('Marital Status', marital_status_mapping),
    ('Education Level', education_mapping),
):
    df[column] = df[column].map(codes)

# Drop the identifier, diagnosis-date, clinical-history, symptom-type, score,
# comorbidity and medication columns, keeping only the demographic columns
# and the symptom duration.
df = df.drop(columns=[
    'Patient ID',
    'OCD Diagnosis Date',
    'Previous Diagnoses',
    'Family History of OCD',
    'Obsession Type',
    'Compulsion Type',
    'Y-BOCS Score (Obsessions)',
    'Y-BOCS Score (Compulsions)',
    'Depression Diagnosis',
    'Anxiety Diagnosis',
    'Medications',
])

In [3]:
# Round numeric values to two decimals and shorten the symptom-duration
# column name to 'time'.
df = df.round(2).rename(columns={'Duration of Symptoms (months)': 'time'})
df

Unnamed: 0,Age,Gender,Ethnicity,Marital Status,Education Level,time
0,32,1,1,1,1,203
1,69,0,1,2,1,180
2,57,0,2,2,3,173
3,27,1,2,3,3,126
4,56,1,2,3,2,168
...,...,...,...,...,...,...
1495,38,0,2,2,3,53
1496,19,1,2,2,4,160
1497,40,0,3,3,1,100
1498,37,1,4,3,1,210


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the 'time' column.
time_summary = df['time'].describe()

# Show the summary.
print(time_summary)

count    1500.000000
mean      121.745333
std        67.404610
min         6.000000
25%        64.000000
50%       121.000000
75%       178.000000
max       240.000000
Name: time, dtype: float64


In [5]:
# 定义压缩函数
def compress_age(x, lower=6, upper=240):
    """Linearly rescale ``x`` from ``[lower, upper]`` onto ``[-1, 1]``.

    Despite the name, this notebook applies the function to the ``time``
    column; the default bounds 6 and 240 are the minimum and maximum of that
    column as reported by ``describe()``.

    Parameters
    ----------
    x : number or array-like supporting arithmetic
        Value(s) to rescale. Values outside ``[lower, upper]`` map outside
        ``[-1, 1]``; no clipping is performed.
    lower : number, default 6
        Input value that maps to -1.
    upper : number, default 240
        Input value that maps to 1.

    Returns
    -------
    Same kind as ``x``: ``2 * (x - lower) / (upper - lower) - 1``.

    Raises
    ------
    ValueError
        If ``lower`` equals ``upper`` (the range has zero width).
    """
    if upper == lower:
        raise ValueError("lower and upper must differ to define a scaling range")
    # Normalise to [0, 1] first, then stretch and shift onto [-1, 1].
    return 2 * ((x - lower) / (upper - lower)) - 1

# Derive the 'condensedtime' column by rescaling every 'time' value with
# compress_age.
df['condensedtime'] = df['time'].map(compress_age)

# Preview the first rows of the updated DataFrame.
df.head()

Unnamed: 0,Age,Gender,Ethnicity,Marital Status,Education Level,time,condensedtime
0,32,1,1,1,1,203,0.683761
1,69,0,1,2,1,180,0.487179
2,57,0,2,2,3,173,0.42735
3,27,1,2,3,3,126,0.025641
4,56,1,2,3,2,168,0.384615


In [6]:
# Build 'safeZoneLow': the lower edge of the half-unit band that contains
# each 'condensedtime' value. Bands are closed on the left:
#   (-inf, -0.5) -> -1,  [-0.5, 0) -> -0.5,  [0, 0.5) -> 0,  [0.5, +inf) -> 0.5
ct = df['condensedtime']
conditions = [
    ct.lt(-0.5),
    ct.ge(-0.5) & ct.lt(0),
    ct.ge(0) & ct.lt(0.5),
    ct.ge(0.5),
]
choices = [-1, -0.5, 0, 0.5]
# np.select takes the choice of the first matching condition; rows matching
# none of them receive np.select's default value of 0.
df['safeZoneLow'] = np.select(conditions, choices)

# Show the source values next to their lower band edge.
print(df[['condensedtime', 'safeZoneLow']])

      condensedtime  safeZoneLow
0          0.683761          0.5
1          0.487179          0.0
2          0.427350          0.0
3          0.025641          0.0
4          0.384615          0.0
...             ...          ...
1495      -0.598291         -1.0
1496       0.316239          0.0
1497      -0.196581         -0.5
1498       0.743590          0.5
1499      -0.273504         -0.5

[1500 rows x 2 columns]


In [7]:
# Build 'safeZoneUp': the upper edge of the half-unit band that contains
# each 'condensedtime' value. Bands are closed on the right:
#   (-inf, -0.5] -> -0.5,  (-0.5, 0] -> 0,  (0, 0.5] -> 0.5,  (0.5, +inf) -> 1
ct_up = df['condensedtime']
conditions_up = [
    ct_up.le(-0.5),
    ct_up.gt(-0.5) & ct_up.le(0),
    ct_up.gt(0) & ct_up.le(0.5),
    ct_up.gt(0.5),
]
choices_up = [-0.5, 0, 0.5, 1]
# np.select takes the choice of the first matching condition; rows matching
# none of them receive np.select's default value of 0.
df['safeZoneUp'] = np.select(conditions_up, choices_up)

# Show the source values next to their upper band edge.
print(df[['condensedtime', 'safeZoneUp']])

      condensedtime  safeZoneUp
0          0.683761         1.0
1          0.487179         0.5
2          0.427350         0.5
3          0.025641         0.5
4          0.384615         0.5
...             ...         ...
1495      -0.598291        -0.5
1496       0.316239         0.5
1497      -0.196581         0.0
1498       0.743590         1.0
1499      -0.273504         0.0

[1500 rows x 2 columns]


In [8]:
# 'safeZone' is the width of each row's band: upper edge minus lower edge.
df['safeZone'] = df['safeZoneUp'].sub(df['safeZoneLow'])

# Discard rows whose band has zero width. Because safeZoneLow uses
# left-closed bands and safeZoneUp uses right-closed bands, this happens
# when 'condensedtime' lies exactly on a band boundary (-0.5, 0 or 0.5).
df = df.loc[df['safeZone'].ne(0)]

# Count how many rows take each remaining 'safeZone' value.
value_counts = df['safeZone'].value_counts()

# Show the counts.
print("safeZone 列各个值的取值个数:")
print(value_counts)

safeZone 列各个值的取值个数:
0.5    1497
Name: safeZone, dtype: int64


In [9]:
# Write the processed DataFrame to CSV, omitting the row index.
df.to_csv(path_or_buf='modified_dataframe.csv', index=False)

# Confirm that the file was written.
print("修改后的DataFrame已保存为 modified_dataframe.csv")

修改后的DataFrame已保存为 modified_dataframe.csv
