# Analyse et visualisation des données

## Initialisation

In [None]:
# std
import json
import os
import re
# external
import numpy as np
import pandas as pd
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
# local
from utils import showsrc
from utils import correlation_table
from utils import print_results_table
from utils import plot_one_params_based_categories
from utils import plot_corr
from utils import plot_corr_saison_variable

In [None]:
# Enable automatic module reloading in the Jupyter notebook.
%load_ext autoreload
# %autoreload 0: disable automatic reloading.
# %autoreload 1: automatically reload only the modules imported with %aimport.
# %autoreload 2: reload all modules (except those excluded by %aimport) every time code is executed.
%autoreload 2

### Paramètres

In [None]:
# Identifier of the dataset release; the notebook uses it as the name of the
# sub-directory of data/ from which the CSV is read.
dataversion = "data-2310-Yewan-MissingValues-20231106"  # Data version.

In [None]:
# Directory holding the selected data release: <cwd>/data/<dataversion>.
# os.path.join inserts the separators itself and avoids doubled separators.
data_directory = os.path.join(os.getcwd(), "data", dataversion)
# Path to the extended dataset inside that directory.
path_data_ext = os.path.join(data_directory, "tenerife2020_extended.csv")

### Fonctions pour faciliter le traitement

In [None]:
# Presumably displays the source code of correlation_table (showsrc comes from utils) — confirm.
showsrc(correlation_table)

In [None]:
# Presumably displays the source code of print_results_table (showsrc comes from utils) — confirm.
showsrc(print_results_table)

In [None]:
# Presumably displays the source code of plot_one_params_based_categories (showsrc comes from utils) — confirm.
showsrc(plot_one_params_based_categories)

## Lecture des données

In [None]:
# Load the extended dataset; the "time" column becomes the index and is
# converted to datetimes (numeric values are read as nanoseconds).
df = pd.read_csv(path_data_ext, low_memory=False, index_col="time")
df.index = pd.to_datetime(df.index, unit='ns')

# Code -> label mappings are read from JSON files in the working directory.
# JSON is UTF-8 by specification, so the encoding is stated explicitly rather
# than left to the platform default.
with open('dimm.json', 'r', encoding='utf-8') as f:
    dimm_n2s = json.load(f)
# Missing DIMM codes are replaced by 0 before being turned into string keys.
df['dimm_status'] = df.dimm_numeric.fillna(0).map(int).map(str).replace(dimm_n2s).replace(np.nan, "Unknown")
with open('season.json', 'r', encoding='utf-8') as f:
    season_n2s = json.load(f)
df['season_name'] = df.season.map(str).replace(season_n2s)

# For every row, keep the name of the "cam_*" column holding the largest value.
cols = [name for name in df.columns if name.startswith("cam_")]
df['cam_name'] = df[cols].idxmax(axis=1)

In [None]:
# Quick look at the d_ext_temp column plotted against the datetime index.
ax = df['d_ext_temp'].plot()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded dataframe.
df.info()

In [None]:
# Show five random rows, transposed so that each variable is on its own line.
df.sample(5).T

## Création de sous-dataframe par saison

In [None]:
# One sub-dataframe per season code: 1 = winter, 2 = spring, 3 = summer, 4 = autumn.
# Months noted by the original author: 12/1/2, 3/4/5, 6/7/8 and 9/10/11 respectively.
hiver, printemps, ete, automne = (df[df['season'] == code] for code in (1, 2, 3, 4))

In [None]:
# Missing-value matrix for each season, in the order spring, summer, autumn, winter.
for season_frame in (printemps, ete, automne, hiver):
    ax = msno.matrix(season_frame)

## Catégorisation des variables

In [None]:
# Daytime turbulence: scintillation solar sensor.
features_turbulence_day = ['day_r0', 'day_see', 'day_see_stddev']
# Night-time turbulence: DIMM.
features_turbulence_night = ['night_r0', 'night_see', 'scint', 'transp']
# Weather station.
features_weather = ['d_wind_dir', 'humid', 'd_humid', 'press', 'irrad']
# Items 1,2: zenith sensor, 10° FoV; items 3,4: 52° sensor, 40° FoV.
features_thermal = ['sky_temp', 'wat_col_hei', 'pyr_temp', 'down_ir']

# Every feature of interest, category after category.
features_all = [
    *features_turbulence_day,
    *features_turbulence_night,
    *features_weather,
    *features_thermal,
]

## Visualisation et statistique de base

### Evolution temporelle

In [None]:
# Plot the daytime r0 values of a single month as points.
# NOTE(review): the original comment announced July, but the filter below keeps
# month 9 (September) — confirm which month is intended.
# NOTE(review): relies on a `month` column already existing in df; the notebook
# only assigns df['month'] in a later cell — confirm the CSV provides it.
df_month = df[(df.month == 9)]
ax = df_month.day_r0.plot(style='.', title="$r_0$ de jour")

In [None]:
# Plot sun_alt and sky_temp together on the same axes against the datetime index.
ax = df[['sun_alt', 'sky_temp']].plot(title="Altitude du soleil et température")

In [None]:
# Autumn subset only. Presumably plots sky_temp split by dimm_status category —
# confirm against plot_one_params_based_categories in utils.
plot_one_params_based_categories(automne, 'sky_temp', 'dimm_status')

In [None]:
# Whole dataset. Presumably plots humid split by season_name category —
# confirm against plot_one_params_based_categories in utils.
plot_one_params_based_categories(df, 'humid', 'season_name')

### Statistique descriptive de base

In [None]:
# Summary statistics of the dataframe, transposed so each variable is a row.
df.describe().T

### box-plot 

In [None]:
# Side-by-side box plots of day_r0 and night_r0.
ax = df[['day_r0','night_r0']].boxplot(figsize=(8, 8))

### Histogrammes

In [None]:
# Histogram of night_r0 for the whole dataset, then for spring, summer, autumn
# and winter; plt.show() is called after each one.
for frame in (df, printemps, ete, automne, hiver):
    sns.histplot(frame.night_r0)
    plt.show()

### Camembert

In [None]:
# Pie chart of the relative frequency of each cam_name value, one season at a time.
for heading, frame in (
    ("L'hiver", hiver),
    ("L'automne", automne),
    ("Le printemps", printemps),
    ("L'été", ete),
):
    print(heading)
    t = pd.crosstab(frame.cam_name, "freq", normalize=True)
    t.plot.pie(subplots=True, figsize=(4, 4))
    plt.show()

In [None]:
# Pie chart of the relative frequency of each dimm_status value, one season at a time.
for heading, frame in (
    ("L'hiver", hiver),
    ("L'automne", automne),
    ("Le printemps", printemps),
    ("L'été", ete),
):
    print(heading)
    t = pd.crosstab(frame.dimm_status, "freq", normalize=True)
    t.plot.pie(subplots=True, figsize=(4, 4))
    plt.show()

### Comptage des statuts du DIMM

In [None]:
# Raw count of each DIMM status per season.
# Available frames: df, hiver, automne, printemps, ete.
# The bare `<frame>.dimm_status.unique()` statements were dropped: their results
# were discarded, so they had no effect.
print("L'hiver \n", pd.crosstab(hiver.dimm_status, "freq"), "\n")

print("L'automne \n", pd.crosstab(automne.dimm_status, "freq"), "\n")

print("Le printemps \n", pd.crosstab(printemps.dimm_status, "freq"), "\n")

print("L'été \n", pd.crosstab(ete.dimm_status, "freq"))

## Corrélations

In [None]:
# Build a correlation table over the selected features and print three of its columns.
# NOTE(review): 0.7 is presumably the correlation threshold applied by
# correlation_table — confirm in utils.
corr_table = correlation_table(df, 0.7, features_all)
print_results_table(corr_table, ['Param1', 'Param2', 'Spearman'])

In [None]:
# Run plot_corr on the selected features for each season, then for the whole year.
for label, frame in (
    ("hiver", hiver),
    ("automne", automne),
    ("printemps", printemps),
    ("ete", ete),
    ("sur l'année", df),
):
    print(label)
    plot_corr(frame[features_all])

### Résumé des corrélations 

Corrélations de coefficient $\geq 0.84$ (soit 84 %) :

Variables corrélées pour toute saison : `sky_temp ~ wat_col_hei` et `pyr_temp ~ down_ir`

- Hiver :  `transp ~ scint`
- Automne :  `transp ~ scint`, `d_ext_temp ~ sky_temp`,`d_ext_temp ~ wat_col_hei`
- Printemps : 
- Eté : `transp ~ scint`, `d_ext_temp ~ sky_temp`,`d_ext_temp ~ wat_col_hei`, 


In [None]:
# Pair each seasonal dataframe with its display label.
Hiver = [hiver, 'hiver']
Automne = [automne, 'automne']
Printemps = [printemps, 'printemps']
Ete = [ete, 'été']
showsrc(plot_corr_saison_variable)

### Dépendance entre 2 variables

In [None]:
# Spring pair ([dataframe, label]) only. Presumably plots the relation between
# the two named variables for that season — confirm against
# plot_corr_saison_variable in utils.
plot_corr_saison_variable('wat_col_hei', 'sky_temp', Printemps)
plot_corr_saison_variable('down_ir', 'pyr_temp', Printemps)

## Dépendances au statut du DIMM

Description conjointe d’un caractère quantitatif et d’un caractère qualitatif

In [None]:
# Mean of every numeric column per DIMM status, transposed so that each
# variable is a row. numeric_only=True is needed because df also holds text
# columns (season_name, cam_name): since pandas 2.0 GroupBy.mean raises on
# them instead of silently dropping them.
df.groupby("dimm_status").mean(numeric_only=True).T

### On se concentre sur une variable : sky_temp

In [None]:
# Mean, standard deviation, median, minimum and maximum of sky_temp per DIMM status.
# String names are used instead of numpy callables: pandas deprecated passing
# np.mean/np.std/... to agg, and the strings keep the current pandas behaviour.
df.groupby("dimm_status")["sky_temp"].agg(["mean", "std", "median", "min", "max"])

In [None]:
# Stacked histogram of sky_temp, one colour per DIMM status.
sns.histplot(data = df, x="sky_temp", hue = "dimm_status", multiple = "stack")

### Localisation des statuts dans l'espace des variables

Description conjointe de deux caractères quantitatifs et d’un caractère qualitatif

In [None]:
# Scatter of pyr_temp against down_ir, coloured by DIMM status.
sns.relplot(x = "down_ir", y = "pyr_temp", hue = "dimm_status", data = df, height = 6, s = 30)

In [None]:
# wat_col_hei against sky_temp with a regression fit, one panel (and colour) per DIMM status.
sns.lmplot(data = df, x="sky_temp", y = "wat_col_hei", hue = "dimm_status", col = "dimm_status")

## Tendance et saisonnalité

### Affichage de la tendance

In [None]:
def plot_df(df, x, y, ylabel=None, title=None, color=None):
    """Draw ``y`` against ``x`` as a line on a new figure and show it.

    Args:
        df: Not used by the function; kept so existing calls keep working.
        x: Values for the horizontal axis (always labelled "time").
        y: Values for the vertical axis.
        ylabel: Optional label for the vertical axis.
        title: Optional title for the plot.
        color: Optional matplotlib colour for the line. ``None`` lets
            matplotlib pick the next colour of its default cycle.
    """
    fig, ax = plt.subplots()
    # Forward the requested colour to the line; color=None keeps the default.
    ax.plot(x, y, color=color)
    ax.set_xlabel('time')
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    # Slanted, right-aligned tick labels on the horizontal axis.
    plt.xticks(
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='medium',
    )
    ax.grid(True)
    plt.show()

In [None]:
# Calendar helpers derived from the datetime index.
df['day'] = df.index.day
df['month'] = df.index.month
month = df['month'].unique()

# One trend plot per variable: (column, axis label, requested colour).
# The title is the axis label followed by " plot"; None means no colour requested.
for column, label, colour in (
    ("sky_temp", "Sky temperature", 'green'),
    ("pyr_temp", "Pyr temperature", 'blue'),
    ("day_r0", "Day r0", 'purple'),
    ("night_r0", "Night r0", None),
):
    plot_df(df, x=df.index, y=df[column], ylabel=label, title=label + " plot", color=colour)


### Affichage de la saisonnalité (ex : en juillet)

In [None]:
# Restrict the data to July (month == 7), as announced by the section heading.
# The mask used to be negated (~isin([7])), which selected every month
# *except* July.
df_july = df.loc[df.month.isin([7]), :]

plt.plot('day_r0', data=df_july)
plt.title('Month-wise plot day r0 \n(The Seasonality)', fontsize=12)
plt.show()
plt.plot('night_r0', data=df_july)
plt.title('Month-wise plot night r0 \n(The Seasonality)', fontsize=12)
plt.show()