## Simple Demo Main

In [1]:
####################################
##   Import packages and Modules
####################################
import sys
# Make the notebook's working directory importable so the project-local
# modules below resolve. "." is understood on every OS, unlike ".\\",
# which on POSIX systems names a directory literally called ".\".
sys.path.append(".")

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

import data_loader
import DataAnalyser.statistics as stats
import DataAnalyser.data_viz as viz


In [2]:
# Print the built-in help page of the statistics module (its functions and metadata).
help(stats)

Help on module DataAnalyser.statistics in DataAnalyser:

NAME
    DataAnalyser.statistics - Module de statistiques descriptives.

DESCRIPTION
    Usage:
        interne

FUNCTIONS
    aggregation(df, var_by)
        # aggregation stats
    
    desc_cat(df, var_cat=['mfr', 'type', 'shelf'])
        # var_cat stats
    
    desc_num(df, var_num=['calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins', 'rating'])
        # var_num stats

DATA
    __authors__ = 'Romeo NOUDOFININ'
    __contact__ = ('email@gmail.com', 'email@company.com')
    __copyright__ = 'copyleft'

VERSION
    1.0.0

DATE
    2021/01

FILE
    c:\users\rnd\desktop\romeo-noud\training\1_python\pgms\python_approfondissement\04_projet_python\tp_simple_python_project\cerealanalysis-repo\cerealanalysis\dataanalyser\statistics.py




In [3]:
####################################
##   Data Paths
####################################

# Relative to the notebook's working directory. Forward slashes are accepted
# by Python's file APIs on Windows as well as on Linux/macOS, whereas the
# backslash form only works on Windows.
# The trailing separator is required: file names are appended to these
# strings by plain concatenation further down.
# Alternative input location previously used: "./cereal_analysis/data/input/"
input_data = "./data/input/"
output_data = "./data/output/"


In [4]:
####################################
##   Loading Checking Data
####################################
# Build both input locations up front so the calls below read cleanly.
cereal_csv_path = f"{input_data}cereal_data.csv"
mandatory_cols_path = f"{input_data}relevant_col.txt"

# Load the cereal table, then check it against the list of mandatory columns.
df = data_loader.load_data(cereal_csv_path)
variable_ok = data_loader.check_data(df, mandatory_cols_path)

Data columns: ['name', 'mfr', 'type', 'calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins', 'shelf', 'weight', 'cups', 'rating', 'cereal']
Mandatory columns: ['calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins', 'rating', 'mfr', 'type', 'shelf']
data_columns contains all mandatory columns


In [5]:
# Preview the first three rows of the loaded data.
df.head(3)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,cereal
0,100% Bran,Nabisco,Cold,70,4,1,130,10.0,5.0,6,280,25,Top,1.0,0.33,68.4,1
1,100% Natural Bran,Quaker Oats,Cold,120,3,5,15,2.0,8.0,8,135,0,Top,1.0,1.0,33.98,1
2,All-Bran,Kellogs,Cold,70,4,1,260,9.0,7.0,5,320,25,Top,1.0,0.33,59.43,1


In [6]:
####################################
##   Stats & Viz
####################################

# Fail fast when the column check did not pass. Silently skipping this cell
# would leave every result below undefined, and the save cell would then fail
# with an unrelated NameError.
if not variable_ok:
    raise ValueError(
        "Input data failed the mandatory-column check; "
        "statistics and figures cannot be computed."
    )

# --- Descriptive statistics -------------------------------------------------

# Statistics on the numeric variables (module default column list)
df_desc_num = stats.desc_num(df)

# Statistics on the categorical variables (module default column list)
df_desc_cat = stats.desc_cat(df)

# Aggregated statistics, grouped by fat
agg_fat = stats.aggregation(df, "fat")

# Aggregated statistics, grouped by manufacturer (mfr)
agg_mfr = stats.aggregation(df, "mfr")

# --- Figures ----------------------------------------------------------------

# Distribution of the rating
distribution_rating = viz.var_distribution(df, var_name='rating')

# Scatter plot of rating against sugars
scatter_rating_sugars = viz.scatter(df, var_name1='sugars', var_name2='rating', var_hover='name')

# Scatter plot of rating against sugars, plus box plots for sugars and violins for rating
scatter2_rating_sugars = viz.scatter_plus(df, var_name1="sugars", var_name2="rating", var_color="shelf")

# Box plots of the rating, per manufacturer
box_rating = viz.box_plot(df, var_name="rating", var_color="mfr")

# Treemap illustrating the hierarchical nature of the data
treemap = viz.treemap(df, hierarchy=['shelf', 'mfr'], var_value='cereal', the_title = 'Cereals by shelf location')

# Sunburst illustrating the hierarchical nature of the data
sunburst = viz.sunburst(df, hierarchy=['shelf', 'mfr'], var_value='cereal', the_title = 'Cereals by shelf location')

# Bubble chart of rating against sugars, one facet per shelf
Bubble_rating_sugars = viz.bubble(df, var_name1='sugars', var_name2='rating', var_color='mfr', var_size='calories',
                                var_facet='shelf', var_hover='name', orders={'shelf': ['Top', 'Middle', 'Bottom']})


In [7]:
####################################
##   Save Results
####################################

# Save the tables as semicolon-separated CSV files.
# NOTE(review): index=False drops each frame's index. If the stats helpers
# return their labels (e.g. statistic names) in the index, those labels are
# lost in the export - confirm against DataAnalyser.statistics.
tables_to_save = {
    'cereal_desc_num.csv': df_desc_num,
    'cereal_desc_cat.csv': df_desc_cat,
    'agg_fat.csv': agg_fat,
    'agg_mfr.csv': agg_mfr,
}
for file_name, table in tables_to_save.items():
    table.to_csv(output_data+file_name, sep=';', index=False, encoding='utf-8')


# Save all figures into a single HTML report.
figures_to_save = [
    distribution_rating,
    scatter_rating_sugars,
    scatter2_rating_sugars,
    box_rating,
    treemap,
    sunburst,
    Bubble_rating_sugars,
]
# Open in write mode so that re-running the notebook regenerates the report.
# Append mode stacked a new copy of every figure onto the previous run's
# output. The encoding is set explicitly because the platform default
# may not be able to encode every character in the generated HTML.
with open(output_data+'cereal_reporting.html', 'w', encoding='utf-8') as f:
    for figure in figures_to_save:
        f.write(figure.to_html(full_html=False, include_plotlyjs='cdn'))