# Imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sys

In [2]:
# Put the current working directory on the import path so the local
# `tools` module imported in the next cell can be found.
sys.path.append('.')

In [3]:
import tools as dfg_tools

In [4]:
# Re-import the helper module so that edits made to it during the session
# are picked up without restarting the kernel.
from importlib import reload
dfg_tools = reload(dfg_tools)

# Load data

In [5]:
# Pupil dimension table; the birth date is parsed to datetime at load time.
eleves = pd.read_csv(
    "../data/DIM_ELEVE.csv",
    parse_dates=["DATE_NAISSANCE"],
)

In [6]:
# Sanity check of the pupil table: column names, (rows, columns), and the
# distinct gender codes -- one item per line.
print(eleves.columns, eleves.shape, eleves["GENRE"].unique(), sep="\n")

Index(['ID_ELEVE', 'GENRE', 'DATE_NAISSANCE'], dtype='object')
(10000, 3)
['m' 'f']


In [7]:
# Response fact table; the log timestamp is parsed to datetime at load time.
reponses = pd.read_csv(
    "../data/FACT_REPONSE.csv",
    parse_dates=["DATE_TIME"],
)

In [8]:
# Sanity check of the response table: column names and (rows, columns).
print(reponses.columns, reponses.shape, sep="\n")

Index(['ID_LOG', 'KEY_REPONSE', 'ID_REPONDANT', 'DATE_TIME'], dtype='object')
(100000, 4)


# Data preparation

In [9]:
# Attach pupil attributes (gender, birth date) to every response row.
# The join type is left at the pandas default (inner), so responses whose
# ID_REPONDANT has no matching ID_ELEVE are dropped.
merged = pd.merge(
    reponses,
    eleves,
    left_on="ID_REPONDANT",
    right_on="ID_ELEVE",
)

In [10]:
# Add an `age_group` column computed by the project helper with its default
# bins and labels (the by-age table below selects '11-12', '13-14', '15-17').
# NOTE(review): presumably the age is derived from DATE_NAISSANCE -- confirm
# in tools.discretize_age.
merged['age_group'] = dfg_tools.discretize_age(merged)

# Comparative data analysis

## Cyberhate exposure - `M1_2` *vs.* `B_2_1_d`

In [11]:
# Response key analysed in this section (the `B_2_1_d` item named in the
# section heading); reused by the stratified cells below.
response = 'B_2_1_d'

### Marginal exposure (Percentage of respondents)

In [12]:
# Denominator for the marginal proportions: number of distinct respondents
# present in the merged table.
n_repondant = len(merged["ID_REPONDANT"].unique())

In [13]:
# Share of respondents who gave `response` at least once.
# Distinct respondents are counted on `merged` -- the same frame and the same
# counting rule as `n_repondant` -- so repeated log rows for one respondent, or
# responses dropped by the merge, cannot inflate the ratio. Reusing `response`
# instead of repeating the key keeps this cell in sync with the stratified
# cells below, and a key that never occurs yields 0.0 rather than a KeyError.
tot_pct = (
    merged.query("KEY_REPONSE == @response").ID_REPONDANT.unique().shape[0]
    / n_repondant
)
tot_pct

0.1286

### Stratified analysis

#### By gender

In [28]:
# Exposure proportion (with interval bounds) per gender.
# NOTE(review): this cell pools `response` with 'B_2_1_c', whereas the marginal
# figure and the by-age breakdown in this section use `response` alone --
# confirm the extra item is intended before comparing the tables.
dfg_tools.estimate_proportion(merged, [response, "B_2_1_c"], ["GENRE"])

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,22.981117,0.84328,21.328288,24.633945
m,23.178017,0.842088,21.527524,24.828509


#### By age

In [17]:
# Exposure proportion per default age group, restricted to the three groups
# of interest.
(
    dfg_tools.estimate_proportion(merged, response, ["age_group"])
    .loc[["11-12", "13-14", "15-17"]]
)

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-12,12.301957,1.002726,10.336615,14.2673
13-14,12.642669,0.984707,10.712643,14.572696
15-17,12.197309,0.980051,10.27641,14.118209


#### By gender and age

In [18]:
# Work on a copy so the age groups already stored on `merged` are not
# overwritten by the coarser bands used for this breakdown.
tmp = merged.copy()

# NOTE(review): the edges [0, 11, 15, 17, 99] and labels such as '11-14' and
# '15-17' do not obviously line up; which side of each edge is inclusive
# depends on discretize_age -- confirm there.
bins = [0, 11, 15, 17, 99]
labels = ["0-11", "11-14", "15-17", ">17"]
tmp["age_group"] = dfg_tools.discretize_age(tmp, bins=bins, labels=labels)

In [19]:
# Exposure proportion per (age band, gender), keeping only the 11-14 and
# 15-17 bands.
(
    dfg_tools.estimate_proportion(tmp, response, ["age_group", "GENRE"])
    .loc[["11-14", "15-17"]]
)

  return data \
  return data \


Unnamed: 0_level_0,Unnamed: 1_level_0,phat,std,LL,UL
age_group,GENRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11-14,f,13.416816,1.019344,11.418902,15.414729
11-14,m,11.517367,0.965155,9.625664,13.409071
15-17,f,13.120567,1.421658,10.334117,15.907018
15-17,m,11.252269,1.346241,8.613636,13.890901


## Cyberhate victimization

Definition of cyberhate victimization: having given at least one of the following responses:

- `C_2_1_a`
- `C_2_1_b`
- `C_2_1_d`
- `C_2_1_e`

In [20]:
# Response keys that count as cyberhate victimization (any one suffices).
responses = ["C_2_1_a", "C_2_1_b", "C_2_1_d", "C_2_1_e"]

### Marginal proportion

In [21]:
# Number of distinct respondents who gave at least one of the listed keys.
len(merged.loc[merged["KEY_REPONSE"].isin(responses), "ID_REPONDANT"].unique())

1932

In [22]:
# Same count expressed as a share of all distinct respondents.
len(merged.loc[merged["KEY_REPONSE"].isin(responses), "ID_REPONDANT"].unique()) / n_repondant

0.3864

### Stratified analysis

In [23]:
# Fresh copy of the merged table carrying the coarser age bands used by the
# stratified victimization tables.
tmp = merged.copy()

# NOTE(review): confirm in discretize_age how the edges map onto the labels
# (which side of each edge is inclusive).
bins = [0, 11, 15, 17, 99]
labels = ["0-11", "11-14", "15-17", ">17"]
tmp["age_group"] = dfg_tools.discretize_age(tmp, bins=bins, labels=labels)

#### By gender

In [24]:
# Victimization proportion (any of `responses`) per gender.
dfg_tools.estimate_proportion(tmp, responses, ["GENRE"])

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,38.047409,0.97315,36.140034,39.954783
m,39.227399,0.974373,37.317628,41.137171


#### By age

In [27]:
# Victimization proportion per age band, keeping only the 11-14 and 15-17
# bands.
(
    dfg_tools.estimate_proportion(tmp, responses, ["age_group"])
    .loc[["11-14", "15-17"]]
)

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-14,38.426763,1.034238,36.399657,40.453869
15-17,39.192825,1.461986,36.327332,42.058318


## Cyberaggression

Definition of cyberaggression: having given at least one of the following responses:

- `C_1_2_a`
- `C_1_2_b`

In [29]:
# Response keys that count as cyberaggression (any one suffices).
# NOTE(review): the marginal count computed below is 0 for these keys --
# confirm they exist in FACT_REPONSE.KEY_REPONSE with this exact spelling.
responses = ["C_1_2_a", "C_1_2_b"]

### Marginal proportion

In [30]:
# Number of distinct respondents who gave at least one of the listed keys.
len(merged.loc[merged["KEY_REPONSE"].isin(responses), "ID_REPONDANT"].unique())

0

In [31]:
# Same count expressed as a share of all distinct respondents.
len(merged.loc[merged["KEY_REPONSE"].isin(responses), "ID_REPONDANT"].unique()) / n_repondant

0.0

### Stratified analysis

In [32]:
# Fresh copy of the merged table carrying the coarser age bands used by the
# stratified cyberaggression tables.
tmp = merged.copy()

# NOTE(review): confirm in discretize_age how the edges map onto the labels
# (which side of each edge is inclusive).
bins = [0, 11, 15, 17, 99]
labels = ["0-11", "11-14", "15-17", ">17"]
tmp["age_group"] = dfg_tools.discretize_age(tmp, bins=bins, labels=labels)

#### By gender

In [33]:
# Cyberaggression proportion (any of `responses`) per gender.
dfg_tools.estimate_proportion(tmp, responses, ["GENRE"])

Unnamed: 0_level_0,phat,std,LL,UL
GENRE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
f,,,,
m,,,,


#### By age

In [None]:
# Cyberaggression proportion per age band, keeping only the 11-14 and 15-17
# bands.
# NOTE(review): this cell has no execution count, and the table displayed
# under it is identical to the victimization by-age table even though the
# marginal count for these keys is 0 -- re-run before relying on it.
(
    dfg_tools.estimate_proportion(tmp, responses, ["age_group"])
    .loc[["11-14", "15-17"]]
)

  return data \
  return data \


Unnamed: 0_level_0,phat,std,LL,UL
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11-14,38.426763,1.034238,36.399657,40.453869
15-17,39.192825,1.461986,36.327332,42.058318
