# Exploring the BIG5 dataset from the [Open-Source Psychometrics Project](https://openpsychometrics.org/)



In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

In [2]:
big5_df = pd.read_csv('data/openpsych_data.csv', sep='\t')

### `race`
Chosen from a drop down menu. 
```
1=Mixed Race, 2=Arctic (Siberian, Eskimo), 3=Caucasian (European), 4=Caucasian (Indian), 5=Caucasian (Middle East), 6=Caucasian (North African, Other), 7=Indigenous Australian, 8=Native American, 9=North East Asian (Mongol, Tibetan, Korean Japanese, etc), 10=Pacific (Polynesian, Micronesian, etc), 11=South East Asian (Chinese, Thai, Malay, Filipino, etc), 12=West African, Bushmen, Ethiopian, 13=Other (0=missed)
```

In [3]:
race_values = '''1=Mixed Race, 2=Arctic (Siberian, Eskimo), 3=Caucasian (European), 
4=Caucasian (Indian), 5=Caucasian (Middle East), 6=Caucasian (North African, Other), 
7=Indigenous Australian, 8=Native American, 
9=North East Asian (Mongol, Tibetan, Korean Japanese, etc), 
10=Pacific (Polynesian, Micronesian, etc), 
11=South East Asian (Chinese, Thai, Malay, Filipino, etc), 12=West African, Bushmen, Ethiopian, 13=Other, 0=missed
'''

parts = re.split(r'(?:,\s+)?([0-9]+)=', race_values.strip())

race_keys = [int(key_val) for key_val in parts[1::2]]
race_values = parts[2::2]

In [4]:
race_mapping = dict(zip(race_keys,race_values))

In [5]:
big5_df=big5_df.assign(race_cat=big5_df['race'].map(race_mapping))

### `age`

age	entered as text (individuals reporting age < 13 were not recorded)


In [6]:
big5_df['age'].max()

np.int64(999999999)

In [7]:
big5_df['age'].value_counts().tail(50)

age
68           20
67           19
69           15
70           12
71           11
1992          9
72            8
1994          8
1996          7
1995          5
1993          5
75            5
1989          5
1997          4
1982          4
1998          4
1991          3
1990          3
77            3
188           2
1976          2
1984          2
74            2
76            2
73            2
1986          2
79            2
1985          2
1999          1
1988          1
100           1
208           1
999999999     1
1961          1
1977          1
412434        1
92            1
2000          1
80            1
1974          1
97            1
1968          1
211           1
223           1
99            1
266           1
191           1
78            1
1964          1
118           1
Name: count, dtype: int64

In [8]:
big5_df[big5_df['age']>100]['age'].value_counts()

age
1992         9
1994         8
1996         7
1995         5
1993         5
1989         5
1997         4
1982         4
1998         4
1991         3
1990         3
1984         2
1985         2
188          2
1976         2
1986         2
999999999    1
208          1
1988         1
1999         1
1977         1
2000         1
1961         1
412434       1
1974         1
1968         1
211          1
223          1
266          1
191          1
1964         1
118          1
Name: count, dtype: int64

### `gender`
gender	Chosen from a drop down menu. 1=Male, 2=Female, 3=Other (0=missed)



In [9]:
gender_map = {
    0: pd.NA,
    1: 'male',
    2: 'female',
    3: 'other'
}

In [10]:
big5_df['gender'].value_counts()

gender
2    11985
1     7608
3      102
0       24
Name: count, dtype: int64

In [11]:
big5_df=big5_df.assign(gender_cat=big5_df['gender'].map(gender_map))

In [12]:
big5_df['gender_cat'].value_counts()

gender_cat
female    11985
male       7608
other       102
Name: count, dtype: int64

### `hand`

hand	"What hand do you use to write with?". 1=Right, 2=Left, 3=Both (0=missed)


In [13]:
big5_df['hand'].value_counts()

hand
1    17424
2     1724
3      471
0      100
Name: count, dtype: int64

### `source`

How the participant came to the test. Based on HTTP Referer. 1=from another page on the test website, 2=from google, 3=from facebook, 4=from any url with ".edu" in its domain name (e.g. xxx.edu, xxx.edu.au), 6=other source, or HTTP Referer not provided.

In [14]:
big5_df['source'].value_counts()

source
1    12099
2     3653
5     3527
3      303
4      137
Name: count, dtype: int64

### `country`

The participant's technical location. ISO country code.

In [15]:
big5_df['country'].value_counts().head(20)

country
US     8753
GB     1531
IN     1464
AU      974
CA      924
PH      649
(nu     369
IT      277
MY      247
PK      222
DE      191
ZA      179
BR      175
ID      172
SE      169
NZ      157
NO      147
RO      135
SG      133
NL      133
Name: count, dtype: int64

In [16]:
big5_df['country'].nunique()

158

In [17]:
questions = '''
E1	I am the life of the party.
E2	I don't talk a lot.
E3	I feel comfortable around people.
E4	I keep in the background.
E5	I start conversations.
E6	I have little to say.
E7	I talk to a lot of different people at parties.
E8	I don't like to draw attention to myself.
E9	I don't mind being the center of attention.
E10	I am quiet around strangers.
N1	I get stressed out easily.
N2	I am relaxed most of the time.
N3	I worry about things.
N4	I seldom feel blue.
N5	I am easily disturbed.
N6	I get upset easily.
N7	I change my mood a lot.
N8	I have frequent mood swings.
N9	I get irritated easily.
N10	I often feel blue.
A1	I feel little concern for others.
A2	I am interested in people.
A3	I insult people.
A4	I sympathize with others' feelings.
A5	I am not interested in other people's problems.
A6	I have a soft heart.
A7	I am not really interested in others.
A8	I take time out for others.
A9	I feel others' emotions.
A10	I make people feel at ease.
C1	I am always prepared.
C2	I leave my belongings around.
C3	I pay attention to details.
C4	I make a mess of things.
C5	I get chores done right away.
C6	I often forget to put things back in their proper place.
C7	I like order.
C8	I shirk my duties.
C9	I follow a schedule.
C10	I am exacting in my work.
O1	I have a rich vocabulary.
O2	I have difficulty understanding abstract ideas.
O3	I have a vivid imagination.
O4	I am not interested in abstract ideas.
O5	I have excellent ideas.
O6	I do not have a good imagination.
O7	I am quick to understand things.
O8	I use difficult words.
O9	I spend time reflecting on things.
O10	I am full of ideas.
'''


In [18]:
big5_questions_df = pd.DataFrame([item.split('\t') for item in questions.splitlines() if item>''])

---

### Questions and direction key

https://ipip.ori.org/new_ipip-50-item-scale.htm

In [19]:
factor_map = { 1: 'E', 
               2: 'A',
               3: 'C',
               4: 'N',
               5: 'O' }

In [20]:
ipip_df = pd.read_html('big5_questions.html', header=0)[0]
ipip_df = ipip_df.rename(columns={'Unnamed: 1': 'text', 'Unnamed: 7': 'factor_and_direction'})[['text','factor_and_direction']]
ipip_df[['factor','direction']]=ipip_df['factor_and_direction'].str.extract(r'([1-5])(.)')
ipip_df['category']=ipip_df['factor'].astype(int).map(factor_map)

FileNotFoundError: [Errno 2] No such file or directory: big5_questions.html

In [None]:
ipip_df = ipip_df.assign(number=np.repeat(np.arange(1,11),5))
ipip_df = ipip_df.assign(qcode=ipip_df['category'].str.cat(ipip_df['number'].astype(str)))   

In [None]:
neg_items = ipip_df.query('direction=="-"')['qcode']

* Make a copy of the original dataframe to keep available in case of mistakes

In [None]:
big5_scored_df = big5_df.copy()

* Reverse code the `negatively` keyed items

In [None]:
big5_scored_df[neg_items] = 6-big5_df[neg_items]

In [None]:
E_cols = [f'E{n+1}' for n in range(10)]
E_cols

In [None]:
cat_cols = {
    cat : [f'{cat}{n+1}' for n in range(10)] 
    for cat in ('O','C','E','A','N') 
}

In [None]:
for cat, cols in cat_cols.items():
    big5_scored_df[cat]=big5_scored_df[cols].sum(axis=1)

# Working

In [None]:
big5_scored_df

1. Correlation between extraversion and agreeableness?
2. Does a person's race affect their extraversion?
3. Does extraversion decrease with age?
4. Does high agreeableness impact emotional stability?