In [2]:
import pandas as pd
import numpy as np

# Load the student performance records from a CSV file in the working directory.
file_path = 'StudentsPerformance.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the loaded frame (pandas abbreviates long frames when rendering).
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [3]:
# Integer codes for the 'race/ethnicity' groups.
race_mapping = {'group A': 0, 'group B': 1, 'group C': 2, 'group D': 3, 'group E': 4}

# Education levels are coded from 'some high school' (0) up to "master's degree" (5).
edu_mapping = {
    'some high school': 0,
    'high school': 1,
    'some college': 2,
    "associate's degree": 3,
    "bachelor's degree": 4,
    "master's degree": 5,
}

# One mapping per categorical column: column name -> {raw label: integer code}.
category_encodings = {
    'gender': {'female': 1, 'male': 0},
    'race/ethnicity': race_mapping,
    'parental level of education': edu_mapping,
    'lunch': {'standard': 1, 'free/reduced': 0},
    'test preparation course': {'completed': 1, 'none': 0},
}

# Replace the text labels of every categorical column with its integer codes.
for column, mapping in category_encodings.items():
    encoded = df[column].map(mapping)
    # Series.map yields NaN for every value that is not a key of the mapping
    # (including values that were already encoded by a previous run of this
    # cell, and missing values). Fail loudly instead of silently writing NaN
    # into the frame; the column is only overwritten once the check passes.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, column].astype(str).unique())
        raise ValueError(
            f"Column {column!r} has values without an encoding: {unexpected}. "
            "If this cell was already run, reload the CSV before running it again."
        )
    df[column] = encoded

In [4]:
# Remove the reading and writing scores; the later cells in this notebook
# only work with 'math score'.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run once the columns are already gone (the in-place
# version raised KeyError on a second run).
df = df.drop(columns=['reading score', 'writing score'], errors='ignore')
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score
0,1,1,4,1,0,72
1,1,2,2,1,1,69
2,1,1,5,1,0,90
3,0,0,3,0,0,47
4,0,2,2,1,0,76
...,...,...,...,...,...,...
995,1,4,5,1,1,88
996,0,2,1,0,0,62
997,1,2,1,0,1,59
998,1,3,2,1,1,68


In [5]:
# Round every numeric column to two decimal places.
# NOTE(review): the frame shown by the previous cell contains only whole
# numbers, so the displayed values are the same before and after this step.
df = df.round(decimals=2)
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score
0,1,1,4,1,0,72
1,1,2,2,1,1,69
2,1,1,5,1,0,90
3,0,0,3,0,0,47
4,0,2,2,1,0,76
...,...,...,...,...,...,...
995,1,4,5,1,1,88
996,0,2,1,0,0,62
997,1,2,1,0,1,59
998,1,3,2,1,1,68


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of 'math score'.
# NOTE(review): despite its name, `income_distribution` holds the math-score
# summary; the name is kept so that any code relying on it keeps working.
income_distribution = df.loc[:, 'math score'].describe()

# Print the summary.
print(income_distribution)

count    1000.00000
mean       66.08900
std        15.16308
min         0.00000
25%        57.00000
50%        66.00000
75%        77.00000
max       100.00000
Name: math score, dtype: float64


In [7]:
# Rescaling helper for the math score.
def compress_score(x, low=0, high=100):
    """Linearly rescale ``x`` from the range ``[low, high]`` to ``[-1, 1]``.

    ``low`` maps to -1, the midpoint of the range maps to 0 and ``high`` maps
    to 1. Values outside ``[low, high]`` are not clipped and therefore map
    outside ``[-1, 1]``.

    Args:
        x: The score to rescale.
        low: Lower bound of the input range (defaults to 0).
        high: Upper bound of the input range (defaults to 100).

    Returns:
        The rescaled value ``2 * (x - low) / (high - low) - 1``.

    Raises:
        ValueError: If ``low`` and ``high`` are equal, because the input range
            would then have zero width.
    """
    if high == low:
        raise ValueError("low and high must be different")
    return 2 * ((x - low) / (high - low)) - 1

# Add the rescaled math score as the 'condensedscore' column.
math_scores = df['math score']
df['condensedscore'] = math_scores.map(compress_score)

# Preview the first rows of the updated frame.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,condensedscore
0,1,1,4,1,0,72,0.44
1,1,2,2,1,1,69,0.38
2,1,1,5,1,0,90,0.8
3,0,0,3,0,0,47,-0.06
4,0,2,2,1,0,76,0.52


In [8]:
# Build 'safeZoneLow': the lower edge assigned to each rescaled score.
# Bands are closed on the left: [-0.5, 0) -> -0.5, [0, 0.5) -> 0,
# everything below -0.5 -> -1 and everything from 0.5 upwards -> 0.5.
condensed = df['condensedscore']
choices = [-1, -0.5, 0, 0.5]
conditions = [
    condensed.lt(-0.5),
    condensed.ge(-0.5) & condensed.lt(0),
    condensed.ge(0) & condensed.lt(0.5),
    condensed.ge(0.5),
]
# np.select takes, per row, the choice of the first condition that holds.
df['safeZoneLow'] = np.select(conditions, choices)

# Print the score next to its lower edge.
print(df[['condensedscore', 'safeZoneLow']])

     condensedscore  safeZoneLow
0              0.44          0.0
1              0.38          0.0
2              0.80          0.5
3             -0.06         -0.5
4              0.52          0.5
..              ...          ...
995            0.76          0.5
996            0.24          0.0
997            0.18          0.0
998            0.36          0.0
999            0.54          0.5

[1000 rows x 2 columns]


In [9]:
# Build 'safeZoneUp': the upper edge assigned to each rescaled score.
# Bands are closed on the right: (-0.5, 0] -> 0, (0, 0.5] -> 0.5,
# everything up to and including -0.5 -> -0.5 and everything above 0.5 -> 1.
condensed = df['condensedscore']
choices_up = [-0.5, 0, 0.5, 1]
conditions_up = [
    condensed.le(-0.5),
    condensed.gt(-0.5) & condensed.le(0),
    condensed.gt(0) & condensed.le(0.5),
    condensed.gt(0.5),
]
# np.select takes, per row, the choice of the first condition that holds.
df['safeZoneUp'] = np.select(conditions_up, choices_up)

# Print the score next to its upper edge.
print(df[['condensedscore', 'safeZoneUp']])

     condensedscore  safeZoneUp
0              0.44         0.5
1              0.38         0.5
2              0.80         1.0
3             -0.06         0.0
4              0.52         1.0
..              ...         ...
995            0.76         1.0
996            0.24         0.5
997            0.18         0.5
998            0.36         0.5
999            0.54         1.0

[1000 rows x 2 columns]


In [10]:
# 'safeZone' is the width of the band: upper edge minus lower edge.
df['safeZone'] = df['safeZoneUp'].sub(df['safeZoneLow'])

# Keep only the rows whose band has a non-zero width.
has_width = df['safeZone'].ne(0)
df = df[has_width]

# Count how often each band width occurs among the remaining rows.
value_counts = df['safeZone'].value_counts()

# Print the counts.
print("safeZone 列各个值的取值个数:")
print(value_counts)

safeZone 列各个值的取值个数:
0.5    964
Name: safeZone, dtype: int64


In [11]:
# Write the processed frame to CSV, leaving out the row index.
df.to_csv(path_or_buf='modified_dataframe.csv', index=False)

# Report that the file was written.
print("修改后的DataFrame已保存为 modified_dataframe.csv")

修改后的DataFrame已保存为 modified_dataframe.csv
