## Bigfive traits rescoring with reverse-coded items
To correctly reverse-score negatively keyed items and recompute average trait scores.

In [3]:
import pandas as pd

df = pd.read_csv('../data/cleaned/cleaned_bigfive_v1.csv')
df.shape

(602587, 126)

In [4]:
df.head()

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,NRT2,NRT3,NRT4,NRT5,NRT6,NRT7,NRT8,NRT9,NRT10,Neuroticism
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,2.0,2.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.6
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,3.0,2.0,5.0,3.0,5.0,4.0,5.0,3.0,5.0,3.9
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,2.0,2.0,4.0,4.0,4.0,4.0,4.0,5.0,3.0,3.4
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,3.0,3.0,4.0,3.0,4.0,4.0,4.0,2.0,3.0,3.3
4,3.0,3.0,4.0,2.0,4.0,2.0,2.0,3.0,3.0,4.0,...,2.0,3.0,4.0,4.0,5.0,4.0,5.0,4.0,4.0,3.8


In [5]:
# Reverse-coded items (from 03_reliability_analysis; checked also with https://ipip.ori.org/new_ipip-50-item-scale.htm)

reverse_items = ['OPN2', 'OPN4', 'OPN6',
                 'CSN2', 'CSN4', 'CSN6', 'CSN8',
                 'EXT2', 'EXT4', 'EXT6', 'EXT8', 'EXT10',
                 'AGR1', 'AGR3', 'AGR5', 'AGR7',
                 'NRT2', 'NRT4']

print('Reverse-coded items:', len(reverse_items))

Reverse-coded items: 18


In [6]:
df_before = df[reverse_items].copy()

In [7]:
# Reverse score: Lickert 1-5 becomes 6 - x

for item in reverse_items:
    if item in df.columns: 
        df[item] = 6 - df[item]


In [8]:
# Compare before and after for first 5 items
for item in reverse_items[:5]:
    print(f"\n{item} (before vs after):")
    print(pd.DataFrame({
        "before": df_before[item].head(),
        "after": df[item].head(),
        "sum(before+after)": (df_before[item] + df[item]).head()
    }))



OPN2 (before vs after):
   before  after  sum(before+after)
0     1.0    5.0                6.0
1     2.0    4.0                6.0
2     1.0    5.0                6.0
3     2.0    4.0                6.0
4     1.0    5.0                6.0

OPN4 (before vs after):
   before  after  sum(before+after)
0     1.0    5.0                6.0
1     2.0    4.0                6.0
2     1.0    5.0                6.0
3     2.0    4.0                6.0
4     1.0    5.0                6.0

OPN6 (before vs after):
   before  after  sum(before+after)
0     1.0    5.0                6.0
1     1.0    5.0                6.0
2     2.0    4.0                6.0
3     1.0    5.0                6.0
4     1.0    5.0                6.0

CSN2 (before vs after):
   before  after  sum(before+after)
0     4.0    2.0                6.0
1     2.0    4.0                6.0
2     2.0    4.0                6.0
3     4.0    2.0                6.0
4     2.0    4.0                6.0

CSN4 (before vs after):
   before  

In [9]:
# Ensure all pairs add up to 6
for item in reverse_items:
    if item in df.columns:
        assert ((df_before[item] + df[item]) == 6).all(), f"Reverse scoring failed for {item}"

print("✅ All reverse-coded items confirmed (before + after = 6)")


✅ All reverse-coded items confirmed (before + after = 6)


In [10]:
df["Extraversion"] = df[[f"EXT{i}" for i in range(1, 11)]].mean(axis=1)
df["Agreeableness"] = df[[f"AGR{i}" for i in range(1, 11)]].mean(axis=1)
df["Conscientiousness"] = df[[f"CSN{i}" for i in range(1, 11)]].mean(axis=1)
df["Openness"] = df[[f"OPN{i}" for i in range(1, 11)]].mean(axis=1)
df["Neuroticism"] = df[[f"NRT{i}" for i in range(1, 11)]].mean(axis=1)

df[["Extraversion", "Agreeableness", "Conscientiousness", "Openness", "Neuroticism"]].describe()


Unnamed: 0,Extraversion,Agreeableness,Conscientiousness,Openness,Neuroticism
count,602587.0,602587.0,602587.0,602587.0,602587.0
mean,2.913734,3.758781,3.342563,3.93902,2.918623
std,0.911304,0.736429,0.739131,0.618234,0.860687
min,1.0,1.0,1.0,1.0,1.0
25%,2.2,3.3,2.8,3.5,2.3
50%,2.9,3.9,3.3,4.0,2.9
75%,3.6,4.3,3.9,4.4,3.5
max,5.0,5.0,5.0,5.0,5.0


In [11]:
# Compute trait means from original (before rescoring)
df_before_traits = pd.DataFrame({
    "Extraversion": df_before[[f"EXT{i}" for i in range(1, 11) if f"EXT{i}" in df_before.columns]].mean(axis=1),
    "Agreeableness": df_before[[f"AGR{i}" for i in range(1, 11) if f"AGR{i}" in df_before.columns]].mean(axis=1),
    "Conscientiousness": df_before[[f"CSN{i}" for i in range(1, 11) if f"CSN{i}" in df_before.columns]].mean(axis=1),
    "Openness": df_before[[f"OPN{i}" for i in range(1, 11) if f"OPN{i}" in df_before.columns]].mean(axis=1),
})

# Old Neuroticism was already correct in v1, so just copy from v1
df_before_traits["Neuroticism"] = df["Neuroticism"]

# Create comparison table
before_means = df_before_traits.mean().round(3)
after_means = df[["Extraversion","Agreeableness","Conscientiousness","Openness","Neuroticism"]].mean().round(3)

comparison_df = pd.DataFrame({"Before": before_means, "After": after_means})
comparison_df["Difference"] = (comparison_df["After"] - comparison_df["Before"]).round(3)

print("Trait Mean Comparison (Before vs After Rescoring):")
comparison_df


Trait Mean Comparison (Before vs After Rescoring):


Unnamed: 0,Before,After,Difference
Extraversion,3.117,2.914,-0.203
Agreeableness,2.261,3.759,1.498
Conscientiousness,2.751,3.343,0.592
Openness,1.95,3.939,1.989
Neuroticism,2.919,2.919,0.0


In [12]:
output_path = "../data/cleaned/cleaned_bigfive_v2.csv"
df.to_csv(output_path, index=False)
print(f"✅ New dataset saved: {output_path}")


✅ New dataset saved: ../data/cleaned/cleaned_bigfive_v2.csv
