In [None]:
!pip install matplotlib --quiet
!pip install seaborn --quiet
!pip install plotly --quiet

In [112]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import datetime
import pandas as pd
import numpy as np

# Relative path of the metadata CSV that is passed to load_metadata_for_vis
filename = "../raw_data/20220602.csv"

def load_metadata_for_vis(filename):
    '''
    Load the publication metadata used by the exploration/visualization helpers.

    Parameters
    ----------
    filename : path or file-like object accepted by pandas.read_csv.
        The CSV must contain at least the columns 'date' and 'dir_1'.

    Returns
    -------
    pandas.DataFrame with 'date' parsed to datetimes, all rows without a
    'dir_1' value removed and a fresh 0..n-1 index.
    '''
    data = pd.read_csv(filename)
    data['date'] = pd.to_datetime(data['date'])
    # reset_index(drop=True) discards the old row labels directly. The former
    # reset_index().drop(columns="index") pair removed a genuine "index" data
    # column (and left a stray "level_0") whenever the CSV already had one.
    return data.dropna(subset=['dir_1']).reset_index(drop=True)
    
def exploration_histogram(data):
    '''
    histogram (50 bins) of the 'date' column, drawn on a new 12x7 figure;
    returns the object produced by seaborn's histplot
    '''
    # open a fresh figure so the histogram is not drawn onto an existing one
    plt.figure(figsize=(12, 7))
    publication_dates = data['date']
    histogram = sns.histplot(publication_dates, bins=50)
    return histogram

def exploration_valcounts(data):
    '''
    number of publications per category (dir_1), most frequent category first
    '''
    categories = data['dir_1']
    publications_per_category = categories.value_counts()
    return publications_per_category

def subset_data(data, start_date="2011-01-01", end_date="2021-12-31", timesampling="Y"):
    '''
    Count publications per category (dir_1) and time period.

    Rows whose 'date' lies between start_date and end_date (both inclusive)
    are binned with the pandas offset alias `timesampling`, and the non-null
    'title' values are counted per bin and category.

    Parameters
    ----------
    data : DataFrame with the columns 'date' (datetime), 'dir_1' and 'title'.
    start_date, end_date : str in '%Y-%m-%d' format.
    timesampling : pandas offset alias used for binning, e.g. one datapoint
        per year or per month.

    Returns
    -------
    DataFrame with a 'date' column (bin label) followed by one Int64 column
    per category. Columns are ordered by descending category frequency in the
    full, unfiltered `data`. Every period between the first and the last
    publication inside the window is present; missing combinations are 0.

    Raises
    ------
    ValueError if start_date or end_date does not match '%Y-%m-%d'.
    '''
    start = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    data_subset = data[(data['date'] >= start) & (data['date'] <= end)]

    # categories (dir_1) in descending frequencies, taken from the full data
    categories = data['dir_1'].value_counts().index

    if data_subset.empty:
        # nothing inside the window: return the expected columns without rows
        empty = pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]')})
        for category in categories:
            empty[category] = pd.Series(dtype='Int64')
        return empty

    # Bin all categories in ONE pass. Left-merging every category onto the
    # most frequent one dropped the periods (and the other categories' counts)
    # lying outside that category's own first/last publication.
    counts = (
        data_subset
        .groupby([pd.Grouper(key='date', freq=timesampling), 'dir_1'])['title']
        .count()
        .unstack('dir_1')
    )
    # full, gap-free list of period labels covered by the subset
    periods = data_subset.resample(timesampling, on='date')['title'].count().index

    counts = (
        counts
        .reindex(index=periods, columns=categories)
        .fillna(0)
        .astype('Int64')
    )
    # drop the columns' axis name so that downstream reset_index() calls still
    # produce a column literally called 'index'
    data_publications = counts.rename_axis(index='date', columns=None).reset_index()
    return data_publications

def visualization_piechart(data_publications):
    '''
    Pie chart comparing how often each directory occurs
    (column totals of everything except 'date')
    '''
    totals_per_directory = data_publications.drop(columns='date').sum()
    # reset_index() yields the columns 'index' (directory name) and 0 (total)
    piedata = totals_per_directory.reset_index()
    chart = px.pie(
        piedata,
        names='index',
        values=0,
        title='Directories of published documents',
    )
    return chart.show()

def visualization_stackedarea(data_publications, plottype="plotly"):
    '''
    stacked area plot of the publications per category over time

    plottype "plotly" draws an interactive figure, "matplotlib" a static one;
    any other value returns a hint string instead of plotting
    '''
    # split the frame into the time axis and one value series per category
    counts = data_publications.drop(columns="date")
    dates = data_publications['date'].tolist()
    series = counts.T.values.tolist()
    category_names = list(counts.columns)

    if plottype == "plotly":
        # the traces created below are named "wide_variable_<i>";
        # map those names back to the category names
        newnames = {
            f"wide_variable_{position}": name
            for position, name in enumerate(category_names)
        }
        fig = px.area(
            x=dates,
            y=series,
            labels={
                "x": "Date of Publication",
                "value": "Number of Publications",
                "variable": "Category",
            },
            title='Publication of EU-Regulations per Directory (stacked)',
        )

        def relabel(trace):
            category = newnames[trace.name]
            trace.update(
                name=category,
                legendgroup=category,
                hovertemplate=trace.hovertemplate.replace(trace.name, category),
            )

        fig.for_each_trace(relabel)
        return fig.show()

    if plottype == "matplotlib":
        plt.figure(figsize=(12, 7))
        plt.stackplot(dates, series, labels=category_names)
        plt.legend()
        plt.xlabel("Date of Publication")
        plt.ylabel("Number of Publications")
        plt.title("Publication of EU-Regulations per Directory (stacked)")
        return plt.show()

    return "please select either 'matplotlib' or 'plotly' as plottype"
    
def visualization_stackedarea_normalized(data_publications, plottype="plotly"):
    '''
    normalized stacked area plot: share of each category per period

    Parameters
    ----------
    data_publications : DataFrame with a 'date' column plus one numeric count
        column per category.
    plottype : "plotly" (interactive) or "matplotlib" (static).

    Returns
    -------
    The result of the library's show() call, or a hint string when plottype
    is neither "plotly" nor "matplotlib".

    Periods without any publication are drawn with a share of 0 for every
    category instead of producing NaN/NA values from a 0/0 division.
    '''
    # normalize in plain float64 so no nullable-dtype NA objects reach the
    # plotting libraries
    counts = data_publications.drop(columns="date").astype("float64")
    totals = counts.sum(axis=1)
    # mask zero totals before dividing, then map the undefined shares to 0
    shares = counts.div(totals.where(totals != 0), axis=0).fillna(0.0)

    # prepare data
    x_norm = data_publications['date'].tolist()
    y_norm = shares.T.values.tolist()
    category_names = list(counts.columns)

    # matplotlib
    if plottype == "matplotlib":
        plt.figure(figsize=(12, 7))
        plt.stackplot(x_norm, y_norm, labels=category_names)
        plt.legend()
        plt.xlabel("Date of Publication")
        plt.ylabel("Share of Publications in this Directory")
        plt.title("Publication of EU-Regulations per Directory (stacked and normalized)")
        return plt.show()
    # plotly
    elif plottype == "plotly":
        # the traces created below are named "wide_variable_<i>";
        # map those names back to the category names
        newnames = {
            f"wide_variable_{position}": name
            for position, name in enumerate(category_names)
        }
        fig = px.area(x=x_norm, y=y_norm,
                      labels={"x": "Date of Publication",
                              "value": "Share of Publications in this Directory",
                              "variable": "Category"},
                      title='Publication of EU-Regulations per Directory (stacked and normalized)')
        fig.for_each_trace(lambda t: t.update(name=newnames[t.name],
                                              legendgroup=newnames[t.name],
                                              hovertemplate=t.hovertemplate.replace(t.name, newnames[t.name])))
        return fig.show()
    else:
        return "please select either 'matplotlib' or 'plotly' as plottype"

In [113]:
# Idea: add a parameter such as directory="dir_1" (or "dir_2") to load_metadata_for_vis, then select sub-directories

data_d2 = load_metadata_for_vis(filename)

In [114]:
# Display the loaded metadata; the loader's result is bound to data_d2
# (no variable called `data` is defined by the cells shown here)
data_d2

Unnamed: 0.1,Unnamed: 0,title,cellar,date,dir_code,dir_1,dir_2,dir_3,dir_4,dir_5,dir_6
0,0,Commission Regulation (EU) No 965/2012 of 5 Oc...,3cb645b2-2a23-43c9-b842-b03665a6733a,2013-06-26,74030.0,Transport policy,Air transport,Air safety,,,
1,1,Regulation (EU) No 600/2014 of the European Pa...,3b729ddf-f1f7-11e3-8cd4-01aa75ed71a1,2014-06-12,6202025.0,Right of establishment and freedom to provide ...,Sectoral application,Service activities,Stock exchanges and other securities markets,,
2,2,Regulation (EU) No 1151/2012 of the European P...,8917d52e-4432-11e2-9b3b-01aa75ed71a1,2014-10-28,152020.0,"Environment, consumers and health protection",Consumers,"Consumer information, education and representa...",,,
3,3,Commission Regulation (EU) No 1178/2011 of 3 N...,ff333821-8325-4fcd-aad0-0a6f3913a242,2011-11-25,74030.0,Transport policy,Air transport,Air safety,,,
4,4,Regulation (EU) 2019/1009 of the European Parl...,e351eb07-9713-11e9-9369-01aa75ed71a1,2019-06-25,133019.0,Industrial policy and internal market,Internal market: approximation of laws,Fertilisers,,,
...,...,...,...,...,...,...,...,...,...,...,...
15274,6,Commission Regulation (EU) No 1181/2010 of 13 ...,a76c2e16-da57-4fa5-bfd4-27bb57f7d0d3,2010-12-15,4103010.0,Fisheries,Common fisheries policy,Conservation of resources,Catch quotas and management of stocks,,
15275,10,Commission Regulation (EU) No 606/2011 of 20 J...,d86d6c99-90ca-4ca7-8b37-14c6b17e04dc,2011-06-23,4103010.0,Fisheries,Common fisheries policy,Conservation of resources,Catch quotas and management of stocks,,
15276,16,Commission Implementing Regulation (EU) No 357...,ad175ddd-31fe-477f-9b8c-3af30ac242e1,2013-07-16,36059.0,Agriculture,Products subject to market organisation,Oils and fats,,,
15277,18,Commission Regulation (EU) No 790/2010 of 7 Se...,a1844b60-f893-424c-adb4-8fb28487b3bb,2010-09-08,35030.0,Agriculture,Approximation of laws and health measures,Animal health and zootechnics,,,


In [None]:
def choose_subdir: