* [Import Packages and Read Data](#0.1)
* [Define Visualisation Functions](#0.2)
* [Plot Time Series](#0.3)
* [Plot Forecast Values using ARIMA](#0.4)


# Import Packages and Read Data <a class="anchor" id="0.1"></a>

In [None]:
import pandas as pd
from tqdm import tqdm
from pandas.api.types import is_numeric_dtype, is_string_dtype
from matplotlib import pyplot as plt
import matplotlib
import numpy as np
import os

# install forecast tool and pmdarima
!pip install -q forecast-tool
!pip install -q pmdarima
from forecast_tool import forecast_plot as fp

def shape(df, df_name):
    """Print a status line with the ``.shape`` of ``df`` labelled as ``df_name``."""
    dimension = df.shape
    print(f'STATUS: Dimension of "{df_name}" = {dimension}')
def w(text):
    """Print ``text`` prefixed with ``STATUS: `` and return what ``print`` returns (``None``)."""
    message = f'STATUS: {text}'
    return print(message)

# Collect the path of every CSV file under the Kaggle input directory.
# Only '.csv' files are kept: every path gathered here is later passed to
# pd.read_csv, so any other file type in the tree would break the read step.
all_files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if filename.lower().endswith('.csv'):
            all_files.append(os.path.join(dirname, filename))

# Read every collected file into a DataFrame, parse its date column and
# drop incomplete rows.
w('Reading all files...')
df_list = []
for filename in tqdm(all_files):
    df = pd.read_csv(filename, index_col=None, header=0)
    # Files spell the date column either 'Date' or 'DATE'; anything without
    # 'Date' is handled through the 'DATE' spelling.
    date_col = 'Date' if 'Date' in df.columns else 'DATE'
    # Unparseable dates become NaT (errors='coerce') ...
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    # ... and every row holding any missing value is then removed.
    df = df.dropna()
    df_list.append(df)
w('Reading all files - Done')

# Derive a display title for every file path.
w('Get all files name ...')
titles = []
for file_path in all_files:
    # Preferred title: the text after the '1990-2022/' folder marker.
    _, marker, remainder = file_path.partition('1990-2022/')
    if not marker:
        # Marker absent from this path: fall back to the bare file name so
        # the title is not left as an empty string.
        remainder = os.path.basename(file_path)
    # Keep only the part before the '.csv' extension.
    stem, _, _ = remainder.partition('.csv')
    titles.append(stem)
w('Get all files name - Done')

# Pick the target column of every dataset.
w('Get all targets ...')
# Candidate target columns in priority order; the first one present wins.
TARGET_CANDIDATES = ('INR', 'Div Yield %', 'Open', 'OPEN')
target_list = []
for position, df in enumerate(df_list):
    target = next((col for col in TARGET_CANDIDATES if col in df.columns), None)
    if target is None:
        # Appending nothing here would make target_list shorter than df_list
        # and silently pair later datasets with the wrong target in the
        # zip-based loops, so fail loudly instead.
        raise ValueError(
            f'No target column found in dataset #{position}; expected one of '
            f'{TARGET_CANDIDATES}, got columns {list(df.columns)}'
        )
    target_list.append(target)
w('Get all targets - Done')

# Record the name of the date column of every dataset.
w('Get all dates ...')
date_list = []
for df in df_list:
    # 'Date' takes precedence over 'DATE'; a frame with neither adds nothing.
    date_name = next((name for name in ('Date', 'DATE') if name in df.columns), None)
    if date_name is not None:
        date_list.append(date_name)
w('Get all dates - Done')

# Make sure all targets are numeric
w('Make sure all targets are numeric ...')
for df, target in zip(df_list, target_list):
    # Convert every target that is not already numeric. Testing for
    # "not numeric" (rather than "is string") also covers object columns
    # that mix strings with other values.
    if not is_numeric_dtype(df[target]):
        # Strip thousands separators literally (regex=False), e.g. "1,234.5".
        cleaned = df[target].astype(str).str.replace(',', '', regex=False)
        # Anything that still cannot be parsed becomes NaN.
        df[target] = pd.to_numeric(cleaned, errors='coerce')
w('Make sure all targets are numeric -Done')
    
# Report the sizes of the parallel lists; they have to match for the
# zip-based loops to pair each file with its own title, target and date.
# Adjacent f-strings are used instead of a backslash continuation, which
# dropped the space before the second "==".
print(
    f"STATUS: {len(titles)} == number of FILES: {len(all_files)} "
    f"== number of TARGETS: {len(target_list)} == Date list: {len(date_list)}"
)

# Tabulate the (rows, columns) shape of every dataset next to its title.
shape_list = [df.shape for df in df_list]
dimension_df = pd.DataFrame({'files': titles, 'dimension': shape_list})
display(dimension_df)

# Show the first rows of the first dataset as a sample.
df_list[0].head()

# Define Visualisation Functions <a class="anchor" id="0.2"></a>

In [None]:
def aggregate_period(df, period, date, target):
    """Sum ``df[target]`` per calendar period of ``df[date]``.

    Parameters
    ----------
    df : DataFrame holding a datetime column ``date`` and a column ``target``.
    period : frequency alias handed to ``Series.dt.to_period``.
    date : name of the datetime column.
    target : name of the column to sum.

    Returns
    -------
    DataFrame with two columns: ``date`` (the period rendered as a string)
    and ``'metric'`` (the per-period sum).
    """
    buckets = df[date].dt.to_period(period)
    totals = df.groupby(buckets)[target].sum()
    result = totals.rename('metric').reset_index()
    # Periods are turned into plain strings before being returned.
    result[date] = result[date].astype(str)
    return result

def visual(df, DATE, target, title, n, l, numElems, scientific_annotation):
    """Plot the trailing ``n`` rows of ``df['metric']`` against ``df[DATE]``.

    The series is drawn as a black line with point markers. The area between
    the line and its mean is shaded green where the value is above the mean
    and red otherwise, and up to ``numElems`` evenly spaced points are
    labelled with their value.

    Parameters
    ----------
    df : DataFrame with a ``DATE`` column and a ``'metric'`` column.
    DATE : name of the column used for the x axis.
    target : text used for the y-axis label and the line's legend entry.
    title : plot title.
    n : number of trailing rows to plot (the slice is ``[-n:]``, so ``0``
        selects every row).
    l : legend location, passed to ``plt.legend(loc=...)``.
    numElems : number of points to annotate; ``0`` or less disables
        annotations.
    scientific_annotation : when truthy, values whose text is longer than
        five characters are labelled in ``0.2e`` notation.

    Returns
    -------
    The result of ``plt.show()``.
    """
    # Only the trailing window is plotted.
    date = df[DATE][-n:]
    count_period = df['metric'][-n:]

    plt.plot(date, count_period, linestyle='solid', color='black', marker='o')
    plt.title(f'{title}')
    plt.xlabel('Time')
    plt.ylabel(target)
    fig = plt.gcf()
    fig.set_size_inches(12.5, 5.5)

    # Annotate evenly spaced points of the plotted window. Positions are
    # computed over the window itself and applied positionally (.iloc), so
    # they stay valid when the frame is longer than ``n`` rows or its index
    # is not 0..len-1; an empty window is skipped.
    if numElems > 0 and len(date) > 0:
        positions = np.round(np.linspace(0, len(date) - 1, numElems)).astype(int)
        for x, y in zip(date.iloc[positions], count_period.iloc[positions]):
            # Long numbers are shortened to scientific notation on request.
            if len(str(y)) > 5 and scientific_annotation:
                label = '{:0.2e}'.format(y)
            else:
                label = str(y)
            plt.annotate(
                label,
                (x, y),                      # point being labelled
                textcoords="offset points",  # xytext is an offset from (x, y)
                xytext=(2, 10),
                ha='center',
                fontsize=9,
            )

    # Shade the gap between the series and its mean: green above, red below.
    window_mean = count_period.mean()
    plt.fill_between(date, count_period, window_mean,
                     where=(count_period > window_mean),
                     alpha=0.10, interpolate=True, color='Green')
    plt.fill_between(date, count_period, window_mean,
                     where=(count_period <= window_mean),
                     alpha=0.10, interpolate=True, color='Red')

    # Legend entries follow drawing order: the line, then the two fills.
    plt.legend([target, 'above average', 'below average'], loc=l, prop={'size': 8})
    plt.xticks(rotation=90)

    # Pad the y axis by one inter-quartile range on each side. The limits
    # are computed from the whole frame, not only the plotted window.
    IQR = df.metric.quantile(0.75) - df.metric.quantile(0.25)
    min_y = df.metric.min() - IQR
    max_y = df.metric.max() + IQR
    # Empty data yields NaN limits, which matplotlib rejects; keep the
    # automatic limits in that case.
    if np.isfinite(min_y) and np.isfinite(max_y):
        plt.ylim([min_y, max_y])

    return plt.show()

# Plot Time Series <a class="anchor" id="0.3"></a>

In [None]:
matplotlib.style.use('ggplot')
counter =0
# Titles that this loop leaves out of the yearly plots.
skipped_titles = (
    "NIFTY INDIA MANUFACTURING - HistoricalPE_PBDIV_Data",
    'NIFTY HOUSING_Data',
)
for df, target, date, title in zip(df_list, target_list, date_list, titles):
    if title in skipped_titles:
        continue
    print(title)
    # Aggregate with the 'y' period alias, then plot the result.
    y = aggregate_period(df, 'y', date, target)
    my_title = f"{target} - {title}"
    visual(y, date, target, my_title, 100, 1, 10, scientific_annotation=False)

# Plot Forecast Values using ARIMA <a class="anchor" id="0.4"></a>

In [None]:
# Forecast every dataset; a failure on one dataset is reported and the loop
# moves on to the next one.
for df, target, date, title in zip(df_list, target_list, date_list, titles):
    try:
        fp.overall_vis(df, date, target, 'y', my_title =f"Forecasted {target} - {title}", matplotlib_style ='ggplot')
    except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # still stop the run; the error itself is shown so it can be diagnosed.
        print(f"STATUS - An error occurred for {title}: {err!r}")