In [240]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [242]:
file_path = './input/train.csv'
home_data = pd.read_csv(file_path)

print(home_data.head())

   id  Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0   0               0.0         No                      6.0            4.0   
1   1               1.0         No                      7.0            3.0   
2   2               6.0        Yes                      1.0            0.0   
3   3               3.0         No                      7.0            3.0   
4   4               1.0         No                      4.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                        No                 15.0             5.0   Extrovert  
1                        No                 10.0             8.0   Extrovert  
2                       NaN                  3.0             0.0   Introvert  
3                        No                 11.0             5.0   Extrovert  
4                        No                 13.0             NaN   Extrovert  


In [244]:
# 这场关键就是数据处理

def process_data(data):
    # 数据转换
    data['Stage_fear'] = data['Stage_fear'].map({'No': '0', 'Yes': '1'})
    data['Drained_after_socializing'] = data['Drained_after_socializing'].map({'No': '0', 'Yes': '1'})
    if 'Personality' in data.columns : data['Personality'] = data['Personality'].map({'Introvert': '0', 'Extrovert': '1'})

    # 数据描述
    print(data.head())
    print(data.describe())

    # 填充空值 依据其他非空值的均值
    data['Time_spent_Alone'] = data['Time_spent_Alone'].fillna(3.137764)
    data['Social_event_attendance'] = data['Social_event_attendance'].fillna(5.265106)
    data['Going_outside'] = data['Going_outside'].fillna(4.044319)
    data['Friends_circle_size'] = data['Friends_circle_size'].fillna(7.996737)
    data['Post_frequency'] = data['Post_frequency'].fillna(4.982097)
    
    data['Stage_fear'] = data['Stage_fear'].fillna(0.217124)
    data['Drained_after_socializing'] = data['Drained_after_socializing'].fillna(0.233784)

    # 数据描述
    print(data.head())
    
    return data

In [246]:
# 对数据进行处理
home_data = process_data(home_data)

   id  Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0   0               0.0          0                      6.0            4.0   
1   1               1.0          0                      7.0            3.0   
2   2               6.0          1                      1.0            0.0   
3   3               3.0          0                      7.0            3.0   
4   4               1.0          0                      4.0            4.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency Personality  
0                         0                 15.0             5.0           1  
1                         0                 10.0             8.0           1  
2                       NaN                  3.0             0.0           0  
3                         0                 11.0             5.0           1  
4                         0                 13.0             NaN           1  
                 id  Time_spent_Alone  Social_event_atten

In [248]:
# 查看列的信息
print(home_data.columns.to_list())

# 找 target
y = home_data.Personality

# 找 features 和 对应数据集 X
features = ['Time_spent_Alone', 
            'Stage_fear',                
            'Social_event_attendance', 
            'Going_outside', 
            'Drained_after_socializing', 
            'Friends_circle_size', 
            'Post_frequency']
X = home_data[features]

# 划分数据 训练集 + 验证集
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

# 定义随机森林模型
rf_model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
rf_model.fit(train_X, train_y)

# 通过验证集求解 MAE平均绝对误差
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print(rf_val_predictions)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality']
['1' '1' '1' ... '1' '1' '1']
Validation MAE for Random Forest Model: 0


In [250]:
# 全训练数据上的模型

rf_model_on_full_data = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
rf_model_on_full_data.fit(X, y)

In [252]:
# 测试数据路径
test_data_path = './input/test.csv'

# 测试数据
test_data = pd.read_csv(test_data_path)

# 数据处理
test_data = process_data(test_data)

# 预测的输入集
test_X = test_data[features]

#预测
test_preds = rf_model_on_full_data.predict(test_X)

      id  Time_spent_Alone Stage_fear  Social_event_attendance  Going_outside  \
0  18524               3.0          0                      7.0            4.0   
1  18525               NaN          1                      0.0            0.0   
2  18526               3.0          0                      5.0            6.0   
3  18527               3.0          0                      4.0            4.0   
4  18528               9.0          1                      1.0            2.0   

  Drained_after_socializing  Friends_circle_size  Post_frequency  
0                         0                  6.0             NaN  
1                         1                  5.0             1.0  
2                         0                 15.0             9.0  
3                         0                  5.0             6.0  
4                         1                  1.0             1.0  
                 id  Time_spent_Alone  Social_event_attendance  Going_outside  \
count   6175.000000       5750

In [254]:
# 创建映射字典
personality_map = {'0': 'Introvert', '1': 'Extrovert'}

# 创建输出DataFrame
output = pd.DataFrame({
    'id': test_data['id'],
    'Personality': [personality_map[p] for p in test_preds]
})

# 保存结果
output.to_csv('submission.csv', index=False)
print("提交文件已生成！")

提交文件已生成！
