# Imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sys

In [2]:
pd.set_option("display.precision", 3)

In [3]:
sys.path.append('.')

In [4]:
import tools as dfg_tools

In [5]:
from importlib import reload
dfg_tools = reload(dfg_tools)

# Load data

In [6]:
eleves = pd.read_csv('../data/DIM_ELEVE.csv')

In [7]:
print(eleves.columns)
print(eleves.shape)
print(eleves.GENRE.unique())

Index(['ID_ELEVE', 'GENRE', 'AGE', 'NIVEAU', 'ID_PARENT'], dtype='object')
(10000, 5)
['m' 'f']


In [8]:
reponses = pd.read_csv('../data/FACT_REPONSE.csv', parse_dates=['DATE_TIME'])

In [9]:
print(reponses.columns)
print(reponses.shape)

Index(['ID_LOG', 'KEY_REPONSE', 'ID_REPONDANT', 'DATE_TIME'], dtype='object')
(100000, 4)


# Data preparation

In [10]:
merged = reponses.merge(
    eleves, left_on = "ID_REPONDANT", right_on = "ID_ELEVE"
    )

In [11]:
merged['age_group'] = dfg_tools.discretize_age(merged)

# Comparative data analysis

## Cyberhate exposure - `M1_2` *vs.* `B_2_1_d`

In [12]:
response = 'B_2_1_d'

### Marginal exposure (Percentage of respondents)

In [13]:
n_repondant = merged.ID_REPONDANT.unique().shape[0]

In [14]:
tot_pct = reponses.KEY_REPONSE.value_counts()['B_2_1_d'] / n_repondant
tot_pct

0.1222

In [15]:
std_tot_pct = np.sqrt((tot_pct * (1 - tot_pct)) / n_repondant)
std_tot_pct

0.004631784969102085

In [16]:
ll, ul = tot_pct - 1.96 * std_tot_pct, tot_pct + 1.96 * std_tot_pct

In [17]:
print(f"{round(tot_pct*100, 3)} [{round(ll*100, 3)}, {round(ul*100, 3)}]")

12.22 [11.312, 13.128]


In [18]:
import matplotlib.pyplot as plt

In [19]:
import seaborn as sns

In [20]:
sns.set()

In [21]:
# plt.scatter(tot_pct, 1)
# plt.scatter(0.24, 1)
# plt.errorbar(tot_pct, 1, xerr=[(std_tot_pct,), (std_tot_pct,)])
# plt.title("EUKO 2020 vs. STOPCyber")

### Stratified analysis

#### By gender

In [22]:
hate_per_gender = dfg_tools.estimate_proportion(
    merged,
    [response, 'B_2_1_c'],
    ['GENRE'], variance_est = 'clt'
)
hate_per_gender

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,23.047,0.834,21.411,24.682
m,21.239,0.826,19.621,22.858


In [23]:
# plt.scatter(
#     hate_per_gender.phat, [0.25, 0.75],
#     c = ["darkblue", "salmon"], s = 100, zorder = 10
#     )

# plt.errorbar(
#     hate_per_gender.phat,
#     [.25, .75],
#     color = "black",
#     xerr=[hate_per_gender['std']*1.96]*2,
#     linestyle = 'none'
# )

# plt.title("EUKO 2020 vs. STOPCyber")

#### By age

In [24]:
hate_per_age = dfg_tools.estimate_proportion(
    merged,
    response,
    ['age_group'],
    variance_est = 'bootstrap'
).loc[['11-12', '13-14', '15-17']]
hate_per_age

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-12,11.245,0.885,9.558,12.932
13-14,12.15,0.882,10.472,13.909
15-17,11.146,1.205,8.948,13.501


In [25]:
# y = [i*0.5*(1/3) for i in range(3)]
# plt.scatter(
#     hate_per_age.phat, y,
#     c = ["darkblue", "salmon", "chartreuse"], s = 100,
#     zorder = 10
#     )

# plt.errorbar(
#     hate_per_age.phat,
#     y,
#     color = "black",
#     xerr=[hate_per_age['std']*1.96]*2,
#     linestyle = 'none', capsize = 4
# )

# plt.title("EUKO 2020 vs. STOPCyber")
# plt.yticks(y,
#            hate_per_age.index)
# # plt.ylabel([])
# plt.grid([])

#### By gender and age

In [26]:
tmp = merged.copy()

# discretize age with different bins
bins =  [0, 11, 15, 17, 99]
labels = ['0-11', '11-14', '15-17', '>17']

tmp['age_group'] = dfg_tools.discretize_age(
    tmp, bins = bins, labels = labels
)

In [27]:
dfg_tools.estimate_proportion(
    tmp,
    response,
    ['age_group', 'GENRE'], variance_est = "bootstrap"
).loc[['11-14', '15-17']]

  return data \
  return data \


Unnamed: 0_level_0,Unnamed: 1_level_0,phat,std,LL,UL
age_group,GENRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11-14,f,11.937,0.91,10.198,13.755
11-14,m,11.454,0.927,9.667,13.241
15-17,f,13.084,1.997,9.338,17.134
15-17,m,9.177,1.606,6.013,12.342


## Cyberhate victimization

Definition cyberhate victimization: Avoir repondu au moins un des items suivants:

- `C_2_1_a`
- `C_2_1_b`
- `C_2_1_d`
- `C_2_1_e`

In [28]:
responses = [
    'C_2_1_a',
    'C_2_1_b',
    'C_2_1_d',
    'C_2_1_e' 
]

### Marginal proportion

In [29]:
merged.query("KEY_REPONSE in @responses").ID_REPONDANT.unique().shape[0]

1952

In [30]:
merged.query("KEY_REPONSE in @responses").ID_REPONDANT.unique().shape[0] / n_repondant

0.3904

### Stratified analysis

In [31]:
tmp = merged.copy()

# discretize age with different bins
bins =  [0, 11, 15, 17, 99]
labels = ['0-11', '11-14', '15-17', '>17']

tmp['age_group'] = dfg_tools.discretize_age(
    tmp, bins = bins, labels = labels
)

#### By gender

In [32]:
dfg_tools.estimate_proportion(
    tmp,
    responses,
    ["GENRE"]
)

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,38.045,0.962,36.159,39.93
m,40.073,0.989,38.134,42.013


### By age

In [33]:
dfg_tools.estimate_proportion(
    tmp,
    responses,
    ['age_group']
).loc[['11-14', '15-17']]

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-14,39.503,0.978,37.585,41.421
15-17,40.502,1.945,36.69,44.315


## Cyberaggression

Definition cyberhate victimization: Avoir repondu au moins un des items suivants:

- `C_1_2_a`
- `C_1_2_b`

In [34]:
responses = [
    'C_1_2_a',
    'C_1_2_b' 
]

### Marginal proportion

In [35]:
merged.query("KEY_REPONSE in @responses").ID_REPONDANT.unique().shape[0]

1041

In [36]:
merged.query("KEY_REPONSE in @responses").ID_REPONDANT.unique().shape[0] / n_repondant

0.2082

### Stratified analysis

In [37]:
tmp = merged.copy()

# discretize age with different bins
bins =  [0, 11, 15, 17, 99]
labels = ['0-11', '11-14', '15-17', '>17']

tmp['age_group'] = dfg_tools.discretize_age(
    tmp, bins = bins, labels = labels
)

#### By gender

In [38]:
dfg_tools.estimate_proportion(
    tmp,
    responses,
    ["GENRE"]
)

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,20.927,0.806,19.347,22.506
m,20.709,0.818,19.106,22.313


### By age

In [39]:
dfg_tools.estimate_proportion(
    tmp,
    responses,
    ['age_group']
).loc[['11-14', '15-17']]

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-14,20.553,0.809,18.968,22.138
15-17,20.251,1.592,17.13,23.372
