In [1]:
import json
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

## Analysis of the Environmental Factors that influence our model

#### First we will read the excel file containing the environmental datasheet.

#### taking only the reversed variables (factors) in the dataframe, they are needed for the analysis

In [33]:
env = pd.read_csv('./data/environment.csv', sep=';', header=0)

In [34]:
env[['GEN_FAS_computer', 'GEN_FAS_computer_R']][~pd.isna(env.GEN_FAS_computer_R)].head()

Unnamed: 0,GEN_FAS_computer,GEN_FAS_computer_R
20,3.0,0.0
21,3.0,0.0
22,3.0,0.0
23,3.0,0.0
52,2.0,1.0


In [4]:
env[['GEN_FAS_car', 'GEN_FAS_car_R']][~pd.isna(env.GEN_FAS_computer_R)].head()

Unnamed: 0,GEN_FAS_car,GEN_FAS_car_R
20,1.0,1.0
21,1.0,1.0
22,1.0,1.0
23,1.0,1.0
52,0.0,2.0


In [5]:
env[['GEN_FAS_vacation', 'GEN_FAS_vacation_R']][~pd.isna(env.GEN_FAS_computer_R)].head()

Unnamed: 0,GEN_FAS_vacation,GEN_FAS_vacation_R
20,3.0,0.0
21,3.0,0.0
22,3.0,0.0
23,3.0,0.0
52,1.0,2.0


In [6]:
env[['GEN_FAS_ownroom', 'GEN_FAS_ownroom_R']][~pd.isna(env.GEN_FAS_computer_R)].head()

Unnamed: 0,GEN_FAS_ownroom,GEN_FAS_ownroom_R
20,1.0,0.0
21,1.0,0.0
22,1.0,0.0
23,1.0,0.0
52,0.0,1.0


In [7]:
env = env[['Child_Bosse','School', 'Class','Wave','GEN_FAS_computer_R','GEN_FAS_car_R',
           'GEN_FAS_vacation_R','GEN_FAS_ownroom_R']]

In [8]:
env.head()

Unnamed: 0,Child_Bosse,School,Class,Wave,GEN_FAS_computer_R,GEN_FAS_car_R,GEN_FAS_vacation_R,GEN_FAS_ownroom_R
0,643,22,52.0,2,,,,
1,643,22,52.0,4,,,,
2,643,22,52.0,3,,,,
3,643,22,52.0,1,,,,
4,645,22,52.0,2,,,,


In [9]:
env.GEN_FAS_car_R.min(), env.GEN_FAS_car_R.max(), env.GEN_FAS_car_R.mean(), env.GEN_FAS_car_R.std()

(0.0, 2.0, 0.6118421052631579, 0.6645451822152039)

In [10]:
env.GEN_FAS_computer_R.min(), env.GEN_FAS_computer_R.max(), env.GEN_FAS_computer_R.mean(), env.GEN_FAS_computer_R.std()

(0.0, 3.0, 0.28289473684210525, 0.6007565575204298)

In [11]:
env.GEN_FAS_vacation_R.min(), env.GEN_FAS_vacation_R.max(), env.GEN_FAS_vacation_R.mean(), env.GEN_FAS_vacation_R.std()

(0.0, 3.0, 1.613486842105263, 1.0656519126826036)

In [12]:
env.GEN_FAS_ownroom_R.min(), env.GEN_FAS_ownroom_R.max(), env.GEN_FAS_ownroom_R.mean(), env.GEN_FAS_ownroom_R.std()

(0.0, 1.0, 0.0756578947368421, 0.26450433870684115)

In [13]:
classes=[67, 71, 72, 74, 77, 78, 79, 81, 83, 86, 100, 101, 103, 121, 122, 125, 126, 127, 129, 130, 131, 133, 135, 136, 138, 139]

#### take only the rows with the particular classes

In [14]:
env = env[env['Class'].isin(classes)]

In [15]:
env.head()

Unnamed: 0,Child_Bosse,School,Class,Wave,GEN_FAS_computer_R,GEN_FAS_car_R,GEN_FAS_vacation_R,GEN_FAS_ownroom_R
420,963,25,67.0,1,,,,
421,963,25,67.0,3,,,,
422,963,25,67.0,4,,,,
423,963,25,67.0,2,,,,
424,965,25,67.0,1,0.0,1.0,3.0,0.0


### looking at the missing values

In [16]:
env.isnull().sum()

Child_Bosse             0
School                  0
Class                   0
Wave                    0
GEN_FAS_computer_R    400
GEN_FAS_car_R         400
GEN_FAS_vacation_R    400
GEN_FAS_ownroom_R     400
dtype: int64

In [17]:
env.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1840 entries, 420 to 3707
Data columns (total 8 columns):
Child_Bosse           1840 non-null int64
School                1840 non-null int64
Class                 1840 non-null float64
Wave                  1840 non-null int64
GEN_FAS_computer_R    1440 non-null float64
GEN_FAS_car_R         1440 non-null float64
GEN_FAS_vacation_R    1440 non-null float64
GEN_FAS_ownroom_R     1440 non-null float64
dtypes: float64(5), int64(3)
memory usage: 129.4 KB


### missing values in percentage

In [18]:
(1 - (len(env)-env.isnull().sum()) / len(env))*100

Child_Bosse            0.00000
School                 0.00000
Class                  0.00000
Wave                   0.00000
GEN_FAS_computer_R    21.73913
GEN_FAS_car_R         21.73913
GEN_FAS_vacation_R    21.73913
GEN_FAS_ownroom_R     21.73913
dtype: float64

### number of kids that have no data input, at all.

In [19]:
all_missing_e = env

In [20]:
all_missing_e = all_missing_e.groupby('Child_Bosse')

In [21]:
df_help_missing_e = all_missing_e.sum()

In [22]:
df_help_missing_e

Unnamed: 0_level_0,School,Class,Wave,GEN_FAS_computer_R,GEN_FAS_car_R,GEN_FAS_vacation_R,GEN_FAS_ownroom_R
Child_Bosse,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
963,100,268.0,10,0.0,0.0,0.0,0.0
965,100,268.0,10,0.0,4.0,12.0,0.0
966,100,268.0,10,0.0,0.0,0.0,0.0
967,100,268.0,10,0.0,4.0,4.0,0.0
968,100,268.0,10,0.0,4.0,12.0,0.0
971,100,268.0,10,0.0,4.0,4.0,0.0
972,100,268.0,10,4.0,0.0,4.0,0.0
973,100,268.0,10,0.0,0.0,0.0,0.0
974,100,268.0,10,0.0,0.0,0.0,0.0
975,100,268.0,10,0.0,8.0,12.0,0.0


In [23]:
df_help_missing_e=df_help_missing_e.drop(['School', 'Class','Wave'], axis=1)

In [24]:
df_help_missing_e

Unnamed: 0_level_0,GEN_FAS_computer_R,GEN_FAS_car_R,GEN_FAS_vacation_R,GEN_FAS_ownroom_R
Child_Bosse,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
963,0.0,0.0,0.0,0.0
965,0.0,4.0,12.0,0.0
966,0.0,0.0,0.0,0.0
967,0.0,4.0,4.0,0.0
968,0.0,4.0,12.0,0.0
971,0.0,4.0,4.0,0.0
972,4.0,0.0,4.0,0.0
973,0.0,0.0,0.0,0.0
974,0.0,0.0,0.0,0.0
975,0.0,8.0,12.0,0.0


In [25]:
df_only_missing=df_help_missing_e[df_help_missing_e.isnull().sum(1)==4]

In [26]:
df_only_missing

Unnamed: 0_level_0,GEN_FAS_computer_R,GEN_FAS_car_R,GEN_FAS_vacation_R,GEN_FAS_ownroom_R
Child_Bosse,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [27]:
(1-(df_help_missing_e.shape[0]-df_only_missing.shape[0]) / df_help_missing_e.shape[0]) * 100

0.0

In [28]:
df_help_missing_e.shape[0]

460

In [29]:
df_only_missing.shape[0]

0

### conclusion: we have no data whatsoever for 100 out of 460 kids. 21.73% of completely missing data

In [30]:
missing_users_index=df_only_missing.index.values

In [31]:
missing_users_index

array([], dtype=int64)

In [32]:
users_with_data_e = e[~e['Child_Bosse'].isin(missing_users_index.tolist())]

NameError: name 'e' is not defined

In [None]:
users_with_data_e

### we got the kids that have some data in the 'users_with_data_e' dataframe!

### Step1 let's check if there is difference in answers per kid, comparing the 4 waves

In [None]:
index_kids=users_with_data_e.Child_Bosse.unique().tolist()
len(index_kids)

we are working with 360 kids total

This 'for' cycle checks if there is a difference at waves, for each user. We want to check if the column values for the environmental factors changed or not.

In [None]:
user_list_same_waves_val = [] 
user_list_dif_waves_val = [] 
for user in index_kids:
    user_df=users_with_data_e[users_with_data_e.Child_Bosse == user]
    if(user_df.GEN_FAS_computer_R.nunique()==1 and user_df.GEN_FAS_car_R.nunique()==1 and user_df.GEN_FAS_vacation_R.nunique()==1 and user_df.GEN_FAS_ownroom_R.nunique()==1):
        user_list_same_waves_val.append(user)
    else:
        user_list_diff_waves_val.append(user)

In [None]:
len(user_list_same_waves_val)

In [None]:
len(user_list_dif_waves_val)

### all our users have same inputs in every wave, we can simply take wave one for all of them for the analysis!

In [None]:
final_e=users_with_data_e[users_with_data_e.Wave==1]

In [None]:
final_e.shape[0]

In [None]:
final_e

In [None]:
final_e.min(axis=0)

In [None]:
final_e.max(axis=0)

**min max ranges for the environmental factors of interest**

**GEN_FAS_computer_R **      0,3

**GEN_FAS_car_R **           0,2

**GEN_FAS_vacation_R**       0,3

**GEN_FAS_ownroom_R**        0,1


In [None]:
col_list= list(['GEN_FAS_computer_R',
       'GEN_FAS_car_R', 'GEN_FAS_vacation_R',
       'GEN_FAS_ownroom_R'])

Append new FAS_Score column to the final_e dataframe, summing up the selected columns per row.

In [None]:
#final_e['FAS_Score']=final_e[col_list].sum(axis=1)

In [None]:
#final_e

In [None]:
#final_e=final_e[['Child_Bosse','FAS_Score']]

In [None]:
#final_e.set_index('Child_Bosse')

In [None]:
#final_e.FAS_Score.astype('int')

In [None]:
#final_e.FAS_Score.hist(bins=12,figsize=((16,8)))

In [None]:
#final_e.FAS_Score.describe()

In [None]:
ncol=final_e[col_list].sum(axis=1).to_frame()

In [None]:
ncol.columns=['FAS_Score_R']

In [None]:
ncol

In [None]:
final_e = pd.concat([final_e, ncol], axis=1)

In [None]:
 final_e=final_e[['Child_Bosse','FAS_Score_R']]

In [None]:
final_e