# Imports

In [1]:
import pandas as pd
import sys
from pathlib import Path
import plotly.express as px
import os

# Add the parent folder of the working directory to sys.path so that the
# `my_data` package imported just below can be resolved.
chemin_dossier_parent = Path.cwd().parent
# Guard the append so that re-running this cell does not pile up duplicates.
if str(chemin_dossier_parent) not in sys.path:
    sys.path.append(str(chemin_dossier_parent))
from my_data.db_connect import get_session
from my_data.datasets import get_environment_data, get_lichen_data, get_lichen_species_data, get_tree_data, get_observation_data, get_table_data

In [2]:
# NOTE(review): `session` is not referenced by the loader calls in this cell.
# Presumably get_session() sets up the database access they rely on, or the
# session is consumed further down — confirm before removing it.
session = get_session()

# Load one DataFrame per dataset through the my_data.datasets helpers
environment_df = get_environment_data()
lichen_df = get_lichen_data()
lichen_species_df = get_lichen_species_data()
observation_df = get_observation_data()
table_df = get_table_data()
tree_df = get_tree_data()

# Exploratory Data Analysis

In [3]:
# Display the environment table for a first visual inspection.
environment_df

Unnamed: 0,id,name,name_en,name_fr
0,1,Parc,,Parc


In [4]:
def summary(df):
    """Build a one-row-per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. Its cells must be hashable, because
        ``duplicated()`` and ``nunique()`` hash the values; a column holding
        Python lists, for instance, is expected to raise ``TypeError``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with, in this order, the
        columns ``data type``, ``#missing``, ``Duplicate``, ``#unique``,
        ``min``, ``max``, ``avg``, ``std dev``, ``top value`` and ``Freq``.
        A statistic that does not apply to a column is NaN.
    """
    summry = pd.DataFrame(df.dtypes, columns=['data type'])
    summry['#missing'] = df.isnull().sum().values
    # Frame-wide count of fully duplicated rows, repeated on every line.
    summry['Duplicate'] = df.duplicated().sum()
    summry['#unique'] = df.nunique().values

    # describe() only emits the statistics that apply to the dtypes present:
    # an all-numeric frame has no 'top'/'freq', an all-text frame has no
    # 'min'/'max'/'mean'/'std'. Reindexing guarantees all six exist (as NaN
    # when absent) instead of failing with KeyError.
    stat_labels = {
        'min': 'min',
        'max': 'max',
        'mean': 'avg',
        'std': 'std dev',
        'top': 'top value',
        'freq': 'Freq',
    }
    desc = (
        df.describe(include='all')
        .transpose()
        .reindex(columns=list(stat_labels))
    )
    # describe() keeps the column order of df, so positional assignment lines
    # up with the index of summry.
    for describe_name, label in stat_labels.items():
        summry[label] = desc[describe_name].values

    return summry

## Lichen

In [5]:
# Display the lichen table for a first visual inspection.
lichen_df

Unnamed: 0,id,species_id,picture,certitude,observation_id
0,4753,17,,False,472
1,4754,24,,False,472
2,4755,15,,False,472
3,4756,20,,False,472
4,4757,20,,False,472
...,...,...,...,...,...
1959,4748,10,,False,472
1960,4749,12,,False,472
1961,4750,13,,False,472
1962,4751,1,,False,472


In [6]:
# Per-column summary of the lichen table, rendered through the pandas Styler
# with a pastel gradient and dotted cell borders.
(
    summary(lichen_df)
    .style
    .background_gradient(cmap='Pastel2_r', axis=0)
    .set_properties(**{'border': '1.3px dotted', 'color': '', 'caption-side': 'left'})
)

Unnamed: 0,data type,#missing,Duplicate,#unique,min,max,avg,std dev,top value,Freq
id,int64,0,0,1964,3930.0,5893.0,4911.5,567.102284,,
species_id,int64,0,0,38,1.0,38.0,13.529022,8.791678,,
picture,object,0,0,1,,,,,,1964.0
certitude,bool,0,0,1,,,,,False,1964.0
observation_id,int64,0,0,203,408.0,610.0,493.422098,56.054214,,


## Lichen species

In [7]:
# Display the lichen species table for a first visual inspection.
lichen_species_df

Unnamed: 0,id,name,unique,name_en,name_fr
0,1,Xanthoria parietina,True,,Xanthoria parietina
1,2,Candelaria concolor,True,,Candelaria concolor
2,3,Flavoparmelia caperata/soredians,True,,Flavoparmelia caperata/soredians
3,4,Hypotrachyna afrorevoluta/revoluta,True,,Hypotrachyna afrorevoluta/revoluta
4,5,Lichen crustacé à aspect poudreux,True,,Lichen crustacé à aspect poudreux
5,6,Ramalina farinacea,True,,Ramalina farinacea
6,7,Amandinea punctata/Lecidella elaeochroma,True,,Amandinea punctata/Lecidella elaeochroma
7,8,Evernia prunastri,True,,Evernia prunastri
8,9,Lecanora sp.,True,,Lecanora sp.
9,10,Melanelixia glabratula/Melanohalea exasperatula,True,,Melanelixia glabratula/Melanohalea exasperatula


In [8]:
# Per-column summary of the lichen species table, rendered through the pandas
# Styler with a pastel gradient and dotted cell borders.
(
    summary(lichen_species_df)
    .style
    .background_gradient(cmap='Pastel2_r', axis=0)
    .set_properties(**{'border': '1.3px dotted', 'color': '', 'caption-side': 'left'})
)

Unnamed: 0,data type,#missing,Duplicate,#unique,min,max,avg,std dev,top value,Freq
id,int64,0,0,38,1.0,38.0,19.5,11.113055,,
name,object,0,0,38,,,,,Xanthoria parietina,1.0
unique,bool,0,0,2,,,,,True,35.0
name_en,object,38,0,0,,,,,,
name_fr,object,0,0,38,,,,,Xanthoria parietina,1.0


## Observation

In [9]:
# Display the observation table for a first visual inspection.
observation_df

Unnamed: 0,id,date_obs,weather_cond,school_obs,localisation_lat,localisation_long,comment,user_id,validation,env_type_link_id
0,473,2022-03-22,,False,45.779110,3.012887,,3,invalidable,
1,474,2020-09-02,,False,45.817643,1.252730,,4,invalidable,
2,475,2022-03-09,,False,45.822240,1.228763,,2,invalidable,
3,476,2022-01-20,,False,45.822677,1.242670,,2,invalidable,
4,477,2022-01-20,,False,45.822843,1.241103,,2,invalidable,
...,...,...,...,...,...,...,...,...,...,...
198,468,2020-02-26,,False,45.763293,3.114170,,3,invalidable,
199,469,2021-08-17,,False,45.765070,3.041980,,3,invalidable,
200,470,2022-11-24,,False,45.769623,3.115553,,3,invalidable,
201,471,2021-07-27,,False,45.769787,3.136480,,3,invalidable,


In [10]:
# Per-column summary of the observation table, rendered through the pandas
# Styler with a pastel gradient and dotted cell borders.
(
    summary(observation_df)
    .style
    .background_gradient(cmap='Pastel2_r', axis=0)
    .set_properties(**{'border': '1.3px dotted', 'color': '', 'caption-side': 'left'})
)

Unnamed: 0,data type,#missing,Duplicate,#unique,min,max,avg,std dev,top value,Freq
id,int64,0,0,203,408.0,610.0,509.0,58.745213,,
date_obs,object,0,0,137,,,,,2023-02-17,8.0
weather_cond,object,0,0,2,,,,,,202.0
school_obs,bool,0,0,1,,,,,False,203.0
localisation_lat,float64,0,0,203,43.00665,51.00926,47.316397,1.783083,,
localisation_long,float64,0,0,203,-1.649343,7.063973,2.305958,1.451067,,
comment,object,202,0,1,,,,,,1.0
user_id,int64,0,0,5,2.0,6.0,2.768473,0.980297,,
validation,object,0,0,1,,,,,invalidable,203.0
env_type_link_id,float64,202,0,1,1.0,1.0,1.0,,,


In [11]:
# List the distinct values recorded for the weather condition column.
observation_df['weather_cond'].unique()

array(['', 'SUN'], dtype=object)

## Tree

In [12]:
# Display the tree table for a first visual inspection.
tree_df

Unnamed: 0,id,species_name,circonference,observation_id
0,1250,Frêne (Fraxinus sp.),144,417
1,1222,Peuplier noir (Populus nigra),160,408
2,1262,Érable (Acer sp.),118,421
3,1241,Tilleul (Tilia sp.),84,414
4,1246,Érable (Acer sp.),79,416
...,...,...,...,...
604,1825,Tilleul (Tilia sp.),40,609
605,1827,Tilleul (Tilia sp.),42,609
606,1828,Je ne sais pas,120,610
607,1829,Je ne sais pas,245,610


In [13]:
# Distinct tree species names, wrapped in a DataFrame so the notebook renders
# them as a table instead of a raw array.
pd.DataFrame(tree_df['species_name'].unique())

Unnamed: 0,0
0,Frêne (Fraxinus sp.)
1,Peuplier noir (Populus nigra)
2,Érable (Acer sp.)
3,Tilleul (Tilia sp.)
4,Érable plane (Acer platanoides)
5,Érable argenté (Acer saccharinum)
6,Tilleul à grandes feuilles (Tilia platyphyllos)
7,Érable sycomore (Acer pseudoplatanus)
8,Févier d'Amérique (Gleditsia triacanthos)
9,Peuplier (Populus sp.)


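**Warning: `species_name` contains the placeholder value 'Je ne sais pas' ("I don't know"), i.e. trees whose species was not identified.**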

In [14]:
# Per-column summary of the tree table, rendered through the pandas Styler
# with a pastel gradient and dotted cell borders.
(
    summary(tree_df)
    .style
    .background_gradient(cmap='Pastel2_r', axis=0)
    .set_properties(**{'border': '1.3px dotted', 'color': '', 'caption-side': 'left'})
)

Unnamed: 0,data type,#missing,Duplicate,#unique,min,max,avg,std dev,top value,Freq
id,int64,0,0,609,1222.0,1830.0,1526.0,175.947435,,
species_name,object,0,0,47,,,,,Je ne sais pas,125.0
circonference,int64,0,0,145,0.0,450.0,94.6289,47.39726,,
observation_id,int64,0,0,203,408.0,610.0,509.0,58.648513,,


## Table

In [15]:
# Display the table dataset for a first visual inspection.
table_df

Unnamed: 0,id,sq1,sq2,sq3,sq4,sq5,lichen_id,tree_id
0,11789,[],[],[],[],[],3931,1223
1,11790,[],[],[],[],[],3931,1224
2,11800,"[E, S]","[E, S]","[E, S]","[E, S]","[E, S]",3935,1225
3,11785,[E],"[E, N]","[E, N]",[E],[E],3930,1222
4,11795,"[N, O]","[N, O]","[N, O]","[N, O]","[N, O]",3933,1226
...,...,...,...,...,...,...,...,...
5887,17674,[],[],[N],[],[],5893,1828
5888,17671,"[E, N, O, S]","[E, N, O, S]","[E, N, O, S]","[E, N, O]","[E, N, O]",5892,1828
5889,17662,"[E, N, O, S]","[E, N, O, S]","[E, N, O]","[E, N, O, S]","[E, N]",5889,1828
5890,17667,"[E, N, O, S]","[E, N, O]","[E, N, O, S]","[E, N, S]","[E, N, S]",5890,1830
