In [1]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.offline as pyo

In [3]:
# Load the prepared dataset and preview its first rows.
csv_path = '../output_data/data_complete.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,gender,E1,E2,E3,E4,E5,E6,E7,E8,E9,...,O6,O7,O8,O9,O10,Extraversion,Neuroticism,Agreeableness,Conscientiousness,Openness
0,Male,4,2,5,2,5,1,4,3,5,...,1,4,2,5,5,14,1,22,23,25
1,Female,2,2,3,3,3,3,1,5,1,...,3,3,1,3,2,-8,-19,11,18,8
2,Female,5,1,1,4,5,1,1,5,5,...,1,5,5,5,5,5,-34,14,25,27
3,Female,2,5,2,4,3,4,3,4,4,...,2,5,2,5,5,-8,-31,13,2,23
4,Female,3,1,3,3,3,1,3,1,3,...,1,3,1,5,3,4,-18,20,10,16


# Gender Analysis

In [4]:
# Bar chart of the number of rows per value of the 'gender' column.

gender_count = df['gender'].value_counts()

fig = go.Figure(
    data=[go.Bar(x=gender_count.index.tolist(), y=gender_count.tolist())],
    layout=go.Layout(
        title='Gender',
        xaxis=dict(title='Gender'),
        yaxis=dict(title='Count'),
    ),
)

# Write the figure to a standalone HTML file.
pyo.plot(fig, filename='gender.html')

'gender.html'

# Traits Analysis

In [5]:
# Boolean row masks, one per gender, used to split each trait's scores.
filt_m = df['gender'].eq('Male')
filt_f = df['gender'].eq('Female')

## Extraversion

In [6]:
# Extraversion scores split by gender mask.
male, female = (df.loc[mask, 'Extraversion'] for mask in (filt_m, filt_f))

In [7]:
# One box per gender for the Extraversion scores; only outlier points are drawn.
box_style = dict(boxpoints='outliers', jitter=0.3, pointpos=0)
data = [
    go.Box(y=scores, name=label, **box_style)
    for label, scores in (('Male', male), ('Female', female))
]

layout = go.Layout(
    title='Extraversion scores by gender',
    xaxis=dict(title='Gender'),
    yaxis=dict(title='Score'),
)

fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='Extraversion.html')

'Extraversion.html'

In [8]:
# Distribution plot of the Extraversion scores, one series per gender.
hist_data = [male, female]
group_labels = ['Male', 'Female']

fig = ff.create_distplot(hist_data,
                         group_labels,
                         bin_size=[1, 1])  # one bin width per group

# Use a trait-specific file name: a name shared across traits would be
# overwritten by every other trait's distribution plot.
pyo.plot(fig, filename='Extraversion_distplot.html')

'distplot1.html'

In [9]:
# Hypothesis testing (two-sample t-test; equal_var=False selects Welch's test,
# which does not assume equal variances)

# H0: The two populations have the same mean score for Extraversion
# H1: The two populations have different mean scores for Extraversion

# ttest_ind is imported in the notebook's import cell.
statistic, pvalue = ttest_ind(male, female, equal_var=False)
print(pvalue)

3.139671090133996e-13


## Neuroticism

In [10]:
# Neuroticism scores split by gender mask.
male, female = (df.loc[mask, 'Neuroticism'] for mask in (filt_m, filt_f))

In [11]:
# One box per gender for the Neuroticism scores; only outlier points are drawn.
box_style = dict(boxpoints='outliers', jitter=0.3, pointpos=0)
data = [
    go.Box(y=scores, name=label, **box_style)
    for label, scores in (('Male', male), ('Female', female))
]

layout = go.Layout(
    title='Neuroticism',
    xaxis=dict(title='Gender'),
    yaxis=dict(title='Score'),
)

fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='Neuroticism.html')

'Neuroticism.html'

In [12]:
# Distribution plot of the Neuroticism scores, one series per gender.
hist_data = [male, female]
group_labels = ['Male', 'Female']

fig = ff.create_distplot(hist_data,
                         group_labels,
                         bin_size=[1, 1])  # one bin width per group

# Use a trait-specific file name: a name shared across traits would be
# overwritten by every other trait's distribution plot.
pyo.plot(fig, filename='Neuroticism_distplot.html')

'distplot1.html'

In [13]:
# Hypothesis testing (two-sample t-test; equal_var=False selects Welch's test,
# which does not assume equal variances)

# H0: The two populations have the same mean score for Neuroticism
# H1: The two populations have different mean scores for Neuroticism

# ttest_ind is imported in the notebook's import cell.
statistic, pvalue = ttest_ind(male, female, equal_var=False)
print(pvalue)

1.6556685312882452e-121


## Agreeableness

In [14]:
# Agreeableness scores split by gender mask.
male, female = (df.loc[mask, 'Agreeableness'] for mask in (filt_m, filt_f))

In [15]:
# One box per gender for the Agreeableness scores; only outlier points are drawn.
box_style = dict(boxpoints='outliers', jitter=0.3, pointpos=0)
data = [
    go.Box(y=scores, name=label, **box_style)
    for label, scores in (('Male', male), ('Female', female))
]

layout = go.Layout(
    title='Agreeableness',
    xaxis=dict(title='Gender'),
    yaxis=dict(title='Score'),
)

fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='Agreeableness.html')

'Agreeableness.html'

In [16]:
# Distribution plot of the Agreeableness scores, one series per gender.
# The scores are re-selected here, so this cell does not rely on `male` /
# `female` still holding the Agreeableness data.
male = df.loc[filt_m, 'Agreeableness']
female = df.loc[filt_f, 'Agreeableness']

hist_data = [male, female]
group_labels = ['Male', 'Female']

fig = ff.create_distplot(hist_data,
                         group_labels,
                         bin_size=[1, 1])  # one bin width per group

# Use a trait-specific file name: a name shared across traits would be
# overwritten by every other trait's distribution plot.
pyo.plot(fig, filename='Agreeableness_distplot.html')

'distplot1.html'

In [17]:
# Hypothesis testing (two-sample t-test; equal_var=False selects Welch's test,
# which does not assume equal variances)

# H0: The two populations have the same mean score for Agreeableness
# H1: The two populations have different mean scores for Agreeableness

# ttest_ind is imported in the notebook's import cell.
statistic, pvalue = ttest_ind(male, female, equal_var=False)
print(pvalue)

1.037992430785508e-192


## Conscientiousness

In [18]:
# Conscientiousness scores split by gender mask.
male, female = (df.loc[mask, 'Conscientiousness'] for mask in (filt_m, filt_f))

In [19]:
# One box per gender for the Conscientiousness scores; only outlier points are drawn.
box_style = dict(boxpoints='outliers', jitter=0.3, pointpos=0)
data = [
    go.Box(y=scores, name=label, **box_style)
    for label, scores in (('Male', male), ('Female', female))
]

layout = go.Layout(
    title='Conscientiousness',
    xaxis=dict(title='Gender'),
    yaxis=dict(title='Score'),
)

fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='Conscientiousness.html')

'Conscientiousness.html'

In [20]:
# Distribution plot of the Conscientiousness scores, one series per gender.
hist_data = [male, female]
group_labels = ['Male', 'Female']

fig = ff.create_distplot(hist_data,
                         group_labels,
                         bin_size=[1, 1])  # one bin width per group

# Use a trait-specific file name: a name shared across traits would be
# overwritten by every other trait's distribution plot.
pyo.plot(fig, filename='Conscientiousness_distplot.html')

'distplot1.html'

In [21]:
# Hypothesis testing (two-sample t-test; equal_var=False selects Welch's test,
# which does not assume equal variances)

# H0: The two populations have the same mean score for Conscientiousness
# H1: The two populations have different mean scores for Conscientiousness

# ttest_ind is imported in the notebook's import cell.
statistic, pvalue = ttest_ind(male, female, equal_var=False)
print(pvalue)

6.515902281646965e-05


## Openness

In [22]:
# Openness scores split by gender mask.
male, female = (df.loc[mask, 'Openness'] for mask in (filt_m, filt_f))

In [23]:
# One box per gender for the Openness scores; only outlier points are drawn.
box_style = dict(boxpoints='outliers', jitter=0.3, pointpos=0)
data = [
    go.Box(y=scores, name=label, **box_style)
    for label, scores in (('Male', male), ('Female', female))
]

layout = go.Layout(
    title='Openness',
    xaxis=dict(title='Gender'),
    yaxis=dict(title='Score'),
)

fig = go.Figure(data=data, layout=layout)
pyo.plot(fig, filename='Openness.html')

'Openness.html'

In [24]:
# Distribution plot of the Openness scores, one series per gender.
# The scores are re-selected here, so this cell does not rely on `male` /
# `female` still holding the Openness data.
male = df.loc[filt_m, 'Openness']
female = df.loc[filt_f, 'Openness']

hist_data = [male, female]
group_labels = ['Male', 'Female']

fig = ff.create_distplot(hist_data,
                         group_labels,
                         bin_size=[1, 1])  # one bin width per group

# Use a trait-specific file name: a name shared across traits would be
# overwritten by every other trait's distribution plot.
pyo.plot(fig, filename='Openness_distplot.html')

'distplot1.html'

In [25]:
# Hypothesis testing (two-sample t-test; equal_var=False selects Welch's test,
# which does not assume equal variances)

# H0: The two populations have the same mean score for Openness
# H1: The two populations have different mean scores for Openness

# ttest_ind is imported in the notebook's import cell.
statistic, pvalue = ttest_ind(male, female, equal_var=False)
print(pvalue)

2.3809714812274794e-46
