# Imports

In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import sys

In [3]:
# Show floating-point values with 3 decimals in rendered tables.
pd.options.display.precision = 3

In [4]:
# Put the current working directory on the import path (presumably so the
# local `tools` module imported afterwards can be found).
# Guarded so that re-running this cell does not pile up duplicate entries.
if '.' not in sys.path:
    sys.path.append('.')

In [5]:
import tools as dfg_tools

In [6]:
# Re-import the local helper module so that edits made to it after the first
# import are picked up without restarting the kernel.
from importlib import reload
dfg_tools = reload(dfg_tools)

# Load data

In [7]:
# Pupil dimension table; DATE_NAISSANCE is parsed as a datetime column.
eleves = pd.read_csv(
    '../data/DIM_ELEVE.csv',
    parse_dates=['DATE_NAISSANCE'],
)

In [8]:
# Quick sanity check: column names, table shape and the GENRE codes present.
print(
    eleves.columns,
    eleves.shape,
    eleves.GENRE.unique(),
    sep="\n",
)

Index(['ID_ELEVE', 'GENRE', 'DATE_NAISSANCE'], dtype='object')
(10000, 3)
['f' 'm']


In [9]:
# Response fact table (one row per logged answer); DATE_TIME is parsed as datetime.
reponses = pd.read_csv(
    '../data/FACT_REPONSE.csv',
    parse_dates=['DATE_TIME'],
)

In [10]:
# Quick sanity check: column names and table shape.
print(reponses.columns, reponses.shape, sep="\n")

Index(['ID_LOG', 'KEY_REPONSE', 'ID_REPONDANT', 'DATE_TIME'], dtype='object')
(100000, 4)


# Data preparation

In [11]:
# Attach each pupil's attributes (GENRE, DATE_NAISSANCE) to every response row.
# merge() defaults to an inner join, so only responses whose ID_REPONDANT
# exists in `eleves` are kept.
# validate="many_to_one" makes pandas raise a MergeError if ID_ELEVE is
# duplicated in `eleves`; such duplicates would silently multiply response
# rows and inflate every count derived from `merged`.
merged = reponses.merge(
    eleves,
    left_on="ID_REPONDANT",
    right_on="ID_ELEVE",
    validate="many_to_one",
)

In [12]:
# Add the age band of each respondent, using the helper's default bins.
merged = merged.assign(age_group=dfg_tools.discretize_age(merged))

# Comparative data analysis

## Cyberhate exposure - `M1_2` *vs.* `B_2_1_d`

In [13]:
# Questionnaire item used as the cyberhate-exposure indicator in this section.
response = "B_2_1_d"

### Marginal exposure (Percentage of respondents)

In [14]:
# Number of distinct respondents; denominator for all marginal proportions.
n_repondant = len(merged["ID_REPONDANT"].unique())

In [15]:
# Share of respondents who selected the exposure item `response`.
# - Uses the `response` variable instead of repeating the item code.
# - Counts each respondent once (distinct ID_REPONDANT) rather than counting
#   rows, so repeated log entries for the same item cannot inflate the rate.
# - Takes the numerator from `merged`, the same table that defines the
#   denominator `n_repondant`, mirroring the later marginal-proportion cells.
tot_pct = (
    merged.query("KEY_REPONSE == @response").ID_REPONDANT.unique().shape[0]
    / n_repondant
)
tot_pct

0.1298

In [16]:
# Standard error of the marginal proportion: sqrt(p * (1 - p) / n).
std_tot_pct = np.sqrt(tot_pct * (1 - tot_pct) / n_repondant)
std_tot_pct

0.004752935093181896

In [17]:
# Lower / upper interval limits at 1.96 standard errors around the estimate.
ll = tot_pct - 1.96 * std_tot_pct
ul = tot_pct + 1.96 * std_tot_pct

In [18]:
# Report "estimate [lower, upper]" in percent, rounded to 3 decimals.
print(
    "{} [{}, {}]".format(
        *(round(value * 100, 3) for value in (tot_pct, ll, ul))
    )
)

12.98 [12.048, 13.912]


In [19]:
import matplotlib.pyplot as plt

In [20]:
import seaborn as sns

In [21]:
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()

In [22]:
# Disabled plot: the marginal estimate (tot_pct) next to a reference point at
# 0.24, with a horizontal error bar of one std_tot_pct on each side.
# NOTE(review): 0.24 is presumably the comparison figure named in the title,
# and the bar uses 1 std while the interval above uses 1.96 std — confirm both.
# plt.scatter(tot_pct, 1)
# plt.scatter(0.24, 1)
# plt.errorbar(tot_pct, 1, xerr=[(std_tot_pct,), (std_tot_pct,)])
# plt.title("EUKO 2020 vs. STOPCyber")

### Stratified analysis

#### By gender

In [23]:
# Proportion estimate per GENRE with CLT-based variance.
# NOTE(review): this call passes two items (`response` and 'B_2_1_c'), whereas
# the marginal, by-age and by-gender-and-age estimates of this section use
# `response` alone, so the figures are not directly comparable — confirm that
# including 'B_2_1_c' here is intentional.
hate_per_gender = dfg_tools.estimate_proportion(
    merged, [response, 'B_2_1_c'], ['GENRE'], variance_est='clt'
)
hate_per_gender

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,22.245,0.842,20.595,23.895
m,22.118,0.82,20.51,23.726


In [24]:
# Disabled plot: per-gender point estimates (hate_per_gender.phat) with
# horizontal error bars of 1.96 * std on each side.
# plt.scatter(
#     hate_per_gender.phat, [0.25, 0.75],
#     c = ["darkblue", "salmon"], s = 100, zorder = 10
#     )

# plt.errorbar(
#     hate_per_gender.phat,
#     [.25, .75],
#     color = "black",
#     xerr=[hate_per_gender['std']*1.96]*2,
#     linestyle = 'none'
# )

# plt.title("EUKO 2020 vs. STOPCyber")

#### By age

In [25]:
# Exposure rate per age band, with bootstrap-based variance.
# NOTE(review): no random seed is set anywhere in this notebook; confirm
# whether dfg_tools seeds its own generator, otherwise std/LL/UL will vary
# slightly from run to run.
hate_per_age = dfg_tools.estimate_proportion(
    merged, response, ['age_group'], variance_est='bootstrap'
)
# Keep only the three age bands of interest, in this order.
hate_per_age = hate_per_age.loc[['11-12', '13-14', '15-17']]
hate_per_age

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-12,12.313,0.948,10.554,14.248
13-14,11.06,1.003,9.124,13.18
15-17,11.797,0.994,9.862,13.825


In [26]:
# Disabled plot: per-age-band point estimates (hate_per_age.phat) with
# horizontal error bars of 1.96 * std on each side, one row per age band.
# y = [i*0.5*(1/3) for i in range(3)]
# plt.scatter(
#     hate_per_age.phat, y,
#     c = ["darkblue", "salmon", "chartreuse"], s = 100,
#     zorder = 10
#     )

# plt.errorbar(
#     hate_per_age.phat,
#     y,
#     color = "black",
#     xerr=[hate_per_age['std']*1.96]*2,
#     linestyle = 'none', capsize = 4
# )

# plt.title("EUKO 2020 vs. STOPCyber")
# plt.yticks(y,
#            hate_per_age.index)
# # plt.ylabel([])
# plt.grid([])

#### By gender and age

In [27]:
# Coarser age bands for the gender-by-age breakdown.
# NOTE(review): the edges contain 15 while the labels read '11-14' / '15-17';
# confirm inside discretize_age which band a 15-year-old falls into.
bins = [0, 11, 15, 17, 99]
labels = ['0-11', '11-14', '15-17', '>17']

# Work on a copy so that the age_group column of `merged` is left untouched.
tmp = merged.copy()
tmp['age_group'] = dfg_tools.discretize_age(tmp, bins=bins, labels=labels)

In [28]:
# Exposure rate per (age band, gender) with bootstrap-based variance,
# restricted to the '11-14' and '15-17' bands.
dfg_tools.estimate_proportion(
    tmp, response, ['age_group', 'GENRE'], variance_est="bootstrap"
).loc[['11-14', '15-17']]

  return data \
  return data \


Unnamed: 0_level_0,Unnamed: 1_level_0,phat,std,LL,UL
age_group,GENRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11-14,f,12.071,0.999,10.211,14.02
11-14,m,11.354,0.918,9.605,13.275
15-17,f,11.324,1.373,8.637,14.203
15-17,m,12.234,1.357,9.752,14.894


## Cyberhate victimization

Definition of cyberhate victimization: the respondent selected at least one of the following items:

- `C_2_1_a`
- `C_2_1_b`
- `C_2_1_d`
- `C_2_1_e`

In [29]:
# Items that count as cyberhate victimization (any one of them is enough).
responses = ['C_2_1_a', 'C_2_1_b', 'C_2_1_d', 'C_2_1_e']

### Marginal proportion

In [30]:
# Number of distinct respondents who selected at least one item in `responses`.
len(merged.query("KEY_REPONSE in @responses")["ID_REPONDANT"].unique())

1871

In [31]:
# Same count expressed as a share of all distinct respondents.
len(merged.query("KEY_REPONSE in @responses")["ID_REPONDANT"].unique()) / n_repondant

0.3742

### Stratified analysis

In [32]:
# Coarser age bands for the stratified victimization estimates.
# NOTE(review): the edges contain 15 while the labels read '11-14' / '15-17';
# confirm inside discretize_age which band a 15-year-old falls into.
bins = [0, 11, 15, 17, 99]
labels = ['0-11', '11-14', '15-17', '>17']

# Work on a copy so that the age_group column of `merged` is left untouched.
tmp = merged.copy()
tmp['age_group'] = dfg_tools.discretize_age(tmp, bins=bins, labels=labels)

#### By gender

In [33]:
# Victimization rate per gender; the variance method is left at the helper's default.
dfg_tools.estimate_proportion(tmp, responses, ["GENRE"])

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,39.041,0.987,37.106,40.977
m,35.873,0.948,34.015,37.732


#### By age

In [34]:
# Victimization rate per age band (helper's default variance method),
# restricted to the '11-14' and '15-17' bands.
dfg_tools.estimate_proportion(
    tmp, responses, ['age_group']
).loc[['11-14', '15-17']]

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-14,37.714,1.028,35.699,39.729
15-17,35.3,1.451,32.456,38.143


## Cyberaggression

Definition of cyberaggression: the respondent selected at least one of the following items:

- `C_1_2_a`
- `C_1_2_b`

In [35]:
# Items that count as cyberaggression (any one of them is enough).
# Rebinds `responses`, replacing the victimization item list used earlier.
responses = ['C_1_2_a', 'C_1_2_b']

### Marginal proportion

In [36]:
# Number of distinct respondents who selected at least one item in `responses`.
len(merged.query("KEY_REPONSE in @responses")["ID_REPONDANT"].unique())

1048

In [37]:
# Same count expressed as a share of all distinct respondents.
len(merged.query("KEY_REPONSE in @responses")["ID_REPONDANT"].unique()) / n_repondant

0.2096

### Stratified analysis

In [38]:
# Coarser age bands for the stratified cyberaggression estimates.
# NOTE(review): the edges contain 15 while the labels read '11-14' / '15-17';
# confirm inside discretize_age which band a 15-year-old falls into.
bins = [0, 11, 15, 17, 99]
labels = ['0-11', '11-14', '15-17', '>17']

# Work on a copy so that the age_group column of `merged` is left untouched.
tmp = merged.copy()
tmp['age_group'] = dfg_tools.discretize_age(tmp, bins=bins, labels=labels)

#### By gender

In [39]:
# Cyberaggression rate per gender; the variance method is left at the helper's default.
dfg_tools.estimate_proportion(tmp, responses, ["GENRE"])

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,21.59,0.833,19.957,23.222
m,20.36,0.796,18.799,21.92


#### By age

In [40]:
# Cyberaggression rate per age band (helper's default variance method),
# restricted to the '11-14' and '15-17' bands.
dfg_tools.estimate_proportion(
    tmp, responses, ['age_group']
).loc[['11-14', '15-17']]

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-14,20.612,0.858,18.93,22.294
15-17,22.396,1.266,19.916,24.877
