In [1]:
import pandas as pd
import numpy as np

# 读取CSV文件
file_path = 'ds_salaries.csv'
df = pd.read_csv(file_path)

# 显示数据集的前几行
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L


In [2]:
# 编码 'experience_level' 列
experience_mapping = {'SE': 1, 'MI': 2, 'EN': 3, 'EX': 4}
df['experience_level'] = df['experience_level'].map(experience_mapping)

# 将 "employment_type" 列映射为数字
employment_mapping = {'CT': 1, 'FL': 2, 'FT': 3, 'PT': 4}
df['employment_type'] = df['employment_type'].map(employment_mapping)

# 将 "company_size" 列映射为数字
company_size_mapping = {'L': 1, 'S': 2, 'M': 3}
df['company_size'] = df['company_size'].map(company_size_mapping)

# 删除指定列
columns_to_drop = ['salary_currency', 'salary_in_usd', 'employee_residence', 'company_location']
df.drop(columns=columns_to_drop, inplace=True)

In [3]:
# 假设 df 是你的 DataFrame
df.drop(['job_title'], axis=1, inplace=True)
df

Unnamed: 0,work_year,experience_level,employment_type,salary,remote_ratio,company_size
0,2023,1,3,80000,100,1
1,2023,2,1,30000,100,2
2,2023,2,1,25500,100,2
3,2023,1,3,175000,100,3
4,2023,1,3,120000,100,3
...,...,...,...,...,...,...
3750,2020,1,3,412000,100,1
3751,2021,2,3,151000,100,1
3752,2020,3,3,105000,100,2
3753,2020,3,1,100000,100,1


In [4]:
df = df.round(2)
df

Unnamed: 0,work_year,experience_level,employment_type,salary,remote_ratio,company_size
0,2023,1,3,80000,100,1
1,2023,2,1,30000,100,2
2,2023,2,1,25500,100,2
3,2023,1,3,175000,100,3
4,2023,1,3,120000,100,3
...,...,...,...,...,...,...
3750,2020,1,3,412000,100,1
3751,2021,2,3,151000,100,1
3752,2020,3,3,105000,100,2
3753,2020,3,1,100000,100,1


In [5]:
# 删除salary大于180000或小于100000的行
df = df[(df['salary'] <= 180000) & (df['salary'] >= 100000)]

# 查看Age列的分布情况
income_distribution = df['salary'].describe()

# 显示结果
print(income_distribution)

count      1902.000000
mean     137653.628812
std       22786.727775
min      100000.000000
25%      120000.000000
50%      137500.000000
75%      155000.000000
max      180000.000000
Name: salary, dtype: float64


In [6]:
# 定义压缩函数
def compress_income(x):
    # 将Msalary映射到-1到1的范围
    return 2 * ((x - 100000) / (180000 - 100000)) - 1

# 生成condensedMonthlyIncomeData列
df['condensedsalary'] = df['salary'].apply(compress_income)

# 显示更新后的DataFrame
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['condensedsalary'] = df['salary'].apply(compress_income)


Unnamed: 0,work_year,experience_level,employment_type,salary,remote_ratio,company_size,condensedsalary
3,2023,1,3,175000,100,3,0.875
4,2023,1,3,120000,100,3,-0.5
6,2023,1,3,136000,0,1,-0.1
8,2023,1,3,141000,0,3,0.025
9,2023,1,3,147100,0,3,0.1775


In [7]:
# 创建 safeZoneLow 列
conditions = [
    df['condensedsalary'] < -0.5,
    (df['condensedsalary'] >= -0.5) & (df['condensedsalary'] < 0),
    (df['condensedsalary'] >= 0) & (df['condensedsalary'] < 0.5),
    df['condensedsalary'] >= 0.5
]
choices = [
    -1,
    -0.5,
    0,
    0.5
]
df['safeZoneLow'] = np.select(conditions, choices)

# 显示结果
print(df[['condensedsalary', 'safeZoneLow']])

      condensedsalary  safeZoneLow
3              0.8750          0.5
4             -0.5000         -0.5
6             -0.1000         -0.5
8              0.0250          0.0
9              0.1775          0.0
...               ...          ...
3746           0.5000          0.5
3749           0.6250          0.5
3751           0.2750          0.0
3752          -0.8750         -1.0
3753          -1.0000         -1.0

[1902 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['safeZoneLow'] = np.select(conditions, choices)


In [8]:
# 创建 safeZoneUp 列
conditions_up = [
    df['condensedsalary'] <= -0.5,
    (df['condensedsalary'] > -0.5) & (df['condensedsalary'] <= 0),
    (df['condensedsalary'] > 0) & (df['condensedsalary'] <= 0.5),
    df['condensedsalary'] > 0.5
]
choices_up = [
    -0.5,
    0,
    0.5,
    1
]
df['safeZoneUp'] = np.select(conditions_up, choices_up)

# 显示结果
print(df[['condensedsalary', 'safeZoneUp']])

      condensedsalary  safeZoneUp
3              0.8750         1.0
4             -0.5000        -0.5
6             -0.1000         0.0
8              0.0250         0.5
9              0.1775         0.5
...               ...         ...
3746           0.5000         0.5
3749           0.6250         1.0
3751           0.2750         0.5
3752          -0.8750        -0.5
3753          -1.0000        -0.5

[1902 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['safeZoneUp'] = np.select(conditions_up, choices_up)


In [9]:
# 创建 safeZone 列
df['safeZone'] = df['safeZoneUp'] - df['safeZoneLow']

df = df[(df['safeZone'] != 0)]

# 输出各个值的取值个数
value_counts = df['safeZone'].value_counts()

# 显示结果
print("safeZone 列各个值的取值个数:")
print(value_counts)

safeZone 列各个值的取值个数:
0.5    1654
Name: safeZone, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['safeZone'] = df['safeZoneUp'] - df['safeZoneLow']


In [10]:
# 将修改后的DataFrame保存为CSV文件
df.to_csv('modified_dataframe.csv', index=False)

# 打印成功保存的消息
print("修改后的DataFrame已保存为 modified_dataframe.csv")

修改后的DataFrame已保存为 modified_dataframe.csv
