In [182]:
import datetime
import os
import random

import matplotlib.pyplot as plt
import nbformat
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from colour import Color
from dateutil.parser import parse
#from pandas_profiling import ProfileReport

##how to put graphs into powerpoints
#https://evidencen.com/how-to-embed-plotly-graphs-in-powerpoint/

In [183]:
def set_indexes(dfs, index_list):
    """Set the index of several DataFrames at once, modifying each in place.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to re-index.
    index_list : iterable
        Index key(s) for each frame, paired positionally with ``dfs``.
        Pairing uses ``zip``, so surplus entries on either side are ignored.

    Returns
    -------
    None
        The frames are mutated; the indexed columns are dropped from the data.
    """
    pairs = zip(dfs, index_list)
    for frame, keys in pairs:
        frame.set_index(keys=keys, drop=True, inplace=True)
def _cumulative_counts_by_date(column_name, df):
    """Build a date-indexed table of running row counts per value of ``column_name``.

    Returns a tuple ``(table, labels)``: ``table`` has a ``'Date'`` column plus
    one column per distinct non-null value (named with ``str(value)``), and
    ``labels`` lists those column names in order of first appearance.
    """
    values = list(df[column_name].dropna().unique())
    if not values:
        raise ValueError(
            "Column '{}' has no non-null values to analyse".format(column_name)
        )
    labels = [str(value) for value in values]

    # One small [Date, <label>] table per value: rows per date, accumulated.
    per_value_tables = [
        df.loc[df[column_name] == value]
        .groupby('Date')
        .size()
        .cumsum()
        .rename(label)
        .reset_index()
        for value, label in zip(values, labels)
    ]

    # Outer-merge every table exactly once so all dates are kept.
    merged_df = per_value_tables[0]
    for table in per_value_tables[1:]:
        merged_df = merged_df.merge(table, how="outer", on="Date", sort=True)

    # A date with no new rows keeps the previous running total; dates before
    # the first occurrence of a value count as 0.
    merged_df = merged_df.ffill().fillna(0)
    return merged_df, labels


def size_date_cumulative_creator(column_name, df):
    """Export and plot the cumulative number of rows over time per category.

    For each distinct non-null value in ``df[column_name]`` the rows are
    counted per ``'Date'`` and accumulated. The combined table is written to
    ``OUTPUT_DATA_FOLDER/cumulative_data_<column_name>.csv``, and a line chart
    is written to ``REPORT_FOLDER/cumulative_analysis_<column_name>.html`` and
    displayed.

    Parameters
    ----------
    column_name : str
        Name of the categorical column in ``df`` to break the counts down by.
    df : pandas.DataFrame
        Data containing ``column_name`` and a ``'Date'`` column.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``column_name`` contains no non-null values.
    """
    result_df, labels = _cumulative_counts_by_date(column_name, df)

    #output CSV
    os.makedirs(OUTPUT_DATA_FOLDER, exist_ok=True)
    result_df.to_csv(os.path.join(OUTPUT_DATA_FOLDER, 'cumulative_data_{}.csv'.format(column_name)))

    #get color scheme: a red-to-black gradient with at least as many steps as
    #there are traces, so sampling without replacement cannot run out
    palette = list(Color("red").range_to(Color("black"), max(len(labels), 10)))
    trace_colors = random.sample(palette, len(labels))

    #create the figure and add one trace per category
    fig = go.Figure()
    for label, color in zip(labels, trace_colors):
        fig.add_trace(
            go.Scatter(
                x=result_df['Date'],
                y=result_df[label],
                name=label,
                marker_color=str(color),
            )
        )

    #update layout
    fig.update_layout(
        title_text='Cumulative Analysis - {}'.format(column_name),
        title_x=0.5,
        title_y=0.90,
        xaxis_title="Date",
        xaxis_title_standoff=0.6,
        yaxis_title_standoff=0.6,
        yaxis_title=column_name,
    )

    #output HTML
    os.makedirs(REPORT_FOLDER, exist_ok=True)
    fig.write_html(os.path.join(REPORT_FOLDER, 'cumulative_analysis_{}.html'.format(column_name)))
    fig.show()

In [184]:
# Folder locations, relative to the notebook's working directory.
RAW_DATA_FOLDER = 'data/raw/'
OUTPUT_DATA_FOLDER = 'data/output/'
REPORT_FOLDER = 'reports/'

# Countries to keep under their own name; every other country is relabelled
# 'Other'. None disables the relabelling. This used to read `country_list`,
# which is only defined in a later cell (NameError on a fresh kernel) and is
# built from this very column, so it could never relabel anything.
COUNTRIES_OF_INTEREST = None

#load the raw data and get rid of irrelevant rows (no Sector)
pipeline = (
    pd.read_csv(os.path.join(RAW_DATA_FOLDER, 'pipeline.csv'))
    .rename(columns={'Day Created': 'Date'})
    .dropna(subset=['Sector'])
)

if COUNTRIES_OF_INTEREST is not None:
    pipeline.loc[~pipeline['Country'].isin(COUNTRIES_OF_INTEREST), 'Country'] = 'Other'

#standardize the date input: '.' and '/' separators become '-'.
#regex=False makes '.' a literal dot regardless of the pandas version.
date_text = (
    pipeline['Date']
    .str.replace('.', '-', regex=False)
    .str.replace('/', '-', regex=False)
)

#adjust time format: parse to datetime and truncate to midnight. Missing dates
#stay NaT instead of raising.
# NOTE(review): pd.to_datetime assumes month-first for ambiguous dates; confirm
# against the source file whether dayfirst=True is needed.
pipeline["Date"] = pd.to_datetime(date_text).dt.normalize()

In [185]:
# Every distinct value present in `pipeline` for the three categorical
# columns, in order of first appearance.
sector_list, country_list, origination_list = (
    list(pipeline[column].unique())
    for column in ('Sector', 'Country', 'Origination Type')
)

In [186]:
# Cumulative row count over time, broken down by Sector.
size_date_cumulative_creator(column_name='Sector', df=pipeline)