# manage utils dir

In [None]:
# Print the docstring of the enclosing module / interactive session.
print(__doc__)

from itertools import islice;
from pprint import pprint;
from sklearn import preprocessing;

import copy; import os;
import sys; import shutil;
import time;

import numpy as np; import pandas as pd;

%matplotlib inline
# Matplotlib pyplot provides plotting API
import matplotlib as mpl
from matplotlib import pyplot as plt
import chart_studio.plotly.plotly as py
import seaborn as sns; sns.set()

In [None]:
def dir_traversal_by_os_walk(root_dir_path: str, verbose: int = 0) -> list:
    """Walk `root_dir_path` with `os.walk` and collect every visited entry.

    Args:
        root_dir_path: directory from which the traversal starts.
        verbose: when equal to 1, print each visited root together with its
            sub-directories and files; any other value keeps the call silent.

    Returns:
        A list of `(root, dirs, files)` tuples, one per visited directory,
        in the order produced by `os.walk`.
    """
    walked: list = list(os.walk(root_dir_path))

    # Silent mode: nothing to report, hand the listing back right away.
    if verbose != 1:
        return walked

    print("List of all sub-directories and files:")
    for current_root, sub_dirs, file_names in walked:
        print('Root:', current_root)
        print('Directories:', sub_dirs)
        print('Files:', file_names)
    return walked

def get_df_from_list_of_os_walk_numeric(resources_list: list, columns="root,dirs,files", verbose: int = 1) -> pd.DataFrame:
    """Summarise an `os.walk` listing as per-directory entry counts.

    Args:
        resources_list: sequence of `(root, dirs, files)` records, as produced
            by `os.walk`.
        columns: names of the three output columns, either as a list/tuple or
            as a comma-separated string. Any other value falls back to
            `["root", "dirs", "files"]`.
        verbose: accepted for signature compatibility; not used.

    Returns:
        A DataFrame with one row per record holding the root path, the number
        of sub-directories and the number of files.
    """
    if isinstance(columns, str):
        # A string used to be discarded in favour of the hard-coded default;
        # split it so that caller-supplied names are actually honoured.
        columns = [name.strip() for name in columns.split(",")]
    elif isinstance(columns, (list, tuple)):
        columns = list(columns)
    else:
        columns = ["root", "dirs", "files"]

    stats_list: list = [
        (record[0], len(record[1]), len(record[2])) for record in resources_list
    ]
    return pd.DataFrame(data=stats_list, columns=columns)

def get_df_from_list_of_os_walk(resources_list: list, columns="root,dirs,files", verbose: int = 1) -> pd.DataFrame:
    """Flatten an `os.walk` listing into one row per file.

    Args:
        resources_list: sequence of `(root, dirs, files)` records, as produced
            by `os.walk`.
        columns: names of the three output columns, either as a list/tuple or
            as a comma-separated string. Any other value falls back to
            `["root", "dirs", "files"]`.
        verbose: accepted for signature compatibility; not used.

    Returns:
        A DataFrame with one row per file: the record's root, the directory
        part of the file entry and the file's base name.
    """
    if isinstance(columns, str):
        # A string used to be discarded in favour of the hard-coded default;
        # split it so that caller-supplied names are actually honoured.
        columns = [name.strip() for name in columns.split(",")]
    elif isinstance(columns, (list, tuple)):
        columns = list(columns)
    else:
        columns = ["root", "dirs", "files"]

    data: list = []
    for root, _dirs, files in resources_list:
        for a_file in files:
            # NOTE(review): a file entry without a path separator (presumably
            # what os.walk yields) has an empty dirname, so the second column
            # is then "" -- confirm this is intended.
            data.append([root, os.path.dirname(a_file), os.path.basename(a_file)])
    return pd.DataFrame(data=data, columns=columns)

def get_df_from_list_of_os_walk_numeric_indexed(resources_list: list, columns="dirs,files", verbose: int = 1) -> pd.DataFrame:
    """Summarise an `os.walk` listing as entry counts indexed by root path.

    Args:
        resources_list: sequence of `(root, dirs, files)` records, as produced
            by `os.walk`.
        columns: names of the two count columns, either as a list/tuple or as
            a comma-separated string. Any other value falls back to
            `["dirs", "files"]`.
        verbose: accepted for signature compatibility; not used.

    Returns:
        A DataFrame indexed by root path with the number of sub-directories
        and the number of files of each record.
    """
    if isinstance(columns, str):
        # The former fallback injected three names ("root,dirs,files") while
        # each data row has only two fields, so a call with the default
        # `columns` raised ValueError. Parse the string that was passed.
        columns = [name.strip() for name in columns.split(",")]
    elif isinstance(columns, (list, tuple)):
        columns = list(columns)
    else:
        columns = ["dirs", "files"]

    stats_list: list = [(len(record[1]), len(record[2])) for record in resources_list]
    index_list: list = [record[0] for record in resources_list]

    return pd.DataFrame(data=stats_list, columns=columns, index=index_list)

In [None]:
# Root folder whose tree is analysed by the cells below. The value is an
# absolute, machine-specific path; the trailing comment keeps "." as an
# alternative value.
ROOT_DIR_PATH = "C:\\Users\\Francesco\\Desktop" # "."
# Collect the (root, dirs, files) records under the root, without printing them.
resources_list = dir_traversal_by_os_walk(root_dir_path=ROOT_DIR_PATH, verbose=0)

In [None]:
# Per-directory counts of sub-directories and files, indexed by root path.
df_indexed = get_df_from_list_of_os_walk_numeric_indexed(resources_list=resources_list, columns="dirs,files".split(","))
# Sanity check before the cells below start using the frame.
assert df_indexed is not None, "df_indexed is None"

In [None]:
# Column dtypes, non-null counts and memory usage of the indexed frame.
df_indexed.info()

In [None]:
# Preview the first five rows of the indexed frame.
df_indexed.head(5)

In [None]:
# Keep only the directories that contain both sub-directories and files.
criteria = (df_indexed["dirs"] != 0) & (df_indexed["files"] != 0)
df_indexed[criteria]

In [None]:
# Same counts as above, but with the root path kept as a regular column.
df = get_df_from_list_of_os_walk_numeric(resources_list=resources_list)
# Sanity check before the cells below start using the frame.
assert df is not None, "df is None"

In [None]:
# Column dtypes, non-null counts and memory usage of the counts frame.
df.info()

In [None]:
# print(df.head(df.shape[0]))
# Preview the first five rows only (the line above would print every row).
df.head(5)

In [None]:
# Sum, extremes, mean and standard deviation of the two count columns.
df[["dirs", "files"]].agg(['sum', 'max', 'min', 'mean', 'std'])

In [None]:
# Descriptive statistics (pandas `describe`) of the two count columns.
df[["dirs", "files"]].describe(include='all')

In [None]:
# Descriptive statistics of the two count columns, kept in `res_stats` for reuse.
res_stats = df[["dirs", "files"]].describe(include='all')
loc_max: int = res_stats.index.get_loc('max')
loc_min: int = res_stats.index.get_loc('min')
# Extremes taken across both columns so that the two histograms share one set of bins.
max_val = int(res_stats.iloc[loc_max].max())
min_val = int(res_stats.iloc[loc_min].min())
# One unit-wide bin per integer count. Samples beyond the last bin edge are
# ignored and `range` excludes its stop value, so the former
# range(min_val, max_val) dropped the largest counts and produced no usable
# bins when max_val - min_val < 2; the edges therefore run up to max_val + 1.
bins = range(min_val, max_val + 2)
df[["dirs", "files"]].hist(bins=bins)

In [None]:
# Inspect which pandas index class labels the rows of the statistics table.
type(res_stats.index)

In [None]:
def file2ext(file_name: str) -> str:
    """Return the extension of `file_name`, or "-" when it has none.

    Only the final path component is considered. The extension is the one
    reported by `os.path.splitext`, leading dot included.
    """
    _, extension = os.path.splitext(os.path.basename(file_name))
    return extension if extension else "-"
# Rebind `df` to the one-row-per-file frame used by the extension analysis below.
df = get_df_from_list_of_os_walk(resources_list=resources_list, columns="root,dirs,files", verbose=0)

In [None]:
# Column dtypes, non-null counts and memory usage of the per-file frame.
df.info()

In [None]:
# type(df[["files"]].applymap(file2ext))
# Distinct file extensions found in the tree. `Series.map` replaces
# `DataFrame.applymap`, which is deprecated in recent pandas releases.
pd.unique(df["files"].map(file2ext).values)

In [None]:
# type(df[["files"]].applymap(file2ext)["files"].value_counts())
# Number of files per extension, shown as a single wide row. `Series.map`
# replaces `DataFrame.applymap`, which is deprecated in recent pandas releases.
df["files"].map(file2ext).value_counts().to_frame().T

In [None]:
# df[["files"]].applymap(file2ext)["files"].value_counts().to_frame().hist()
# One-column frame of extensions. `Series.map` replaces `DataFrame.applymap`,
# which is deprecated in recent pandas releases.
ext_df: pd.DataFrame = df["files"].map(file2ext).to_frame()
# Files per extension, most frequent first.
predictor = ext_df["files"].value_counts()
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
sns.barplot(x=predictor.index, y=predictor.values, alpha=0.9)

In [None]:
# Inspect the type returned by `value_counts()`.
type(predictor)

In [None]:
# Wrap the per-extension counts in a one-column frame so they can be drawn as a pie chart.
tmp_df = pd.DataFrame(data=predictor.values, columns=["File Ext"], index=predictor.index)
tmp_df.plot.pie(y='File Ext', figsize=(5, 5))

In [None]:
# Rescale the counts with scikit-learn's `normalize`, applied along axis 0 of
# the column vector (i.e. over all counts at once), then flatten back to 1-D.
pred_rescaled = preprocessing.normalize(predictor.values[:,np.newaxis], axis=0).ravel()
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
sns.barplot(x=predictor.index, y=pred_rescaled, alpha=0.9)

In [None]:
# Pie chart of the per-extension counts.
# NOTE(review): this is built from the raw `predictor.values`, not from
# `pred_rescaled` -- confirm whether the rescaled values were meant here.
tmp_df = pd.DataFrame(data=predictor.values, columns=["File Ext"], index=predictor.index)
tmp_df.plot.pie(y='File Ext', figsize=(5, 5))

In [None]:
# Rescale the counts with scikit-learn's MinMaxScaler (default settings).
min_max_scaler = preprocessing.MinMaxScaler()
a_scaler = min_max_scaler
# Scalers expect a 2-D input, hence the temporary column vector.
pred_rescaled = a_scaler.fit_transform(predictor.values[:,np.newaxis]).ravel()
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
sns.barplot(x=predictor.index, y=pred_rescaled, alpha=0.9)

In [None]:
# Pie chart of the per-extension counts.
# NOTE(review): this is built from the raw `predictor.values`, not from
# `pred_rescaled` -- confirm whether the rescaled values were meant here.
tmp_df = pd.DataFrame(data=predictor.values, columns=["File Ext"], index=predictor.index)
tmp_df.plot.pie(y='File Ext', figsize=(5, 5))

In [None]:
# Rescale the counts with scikit-learn's StandardScaler (default settings).
standard_scaler = preprocessing.StandardScaler()
a_scaler = standard_scaler
# Scalers expect a 2-D input, hence the temporary column vector.
pred_rescaled = a_scaler.fit_transform(predictor.values[:,np.newaxis]).ravel()
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
sns.barplot(x=predictor.index, y=pred_rescaled, alpha=0.9)

In [None]:
# Pie chart of the per-extension counts.
# NOTE(review): this is built from the raw `predictor.values`, not from
# `pred_rescaled` -- confirm whether the rescaled values were meant here.
tmp_df = pd.DataFrame(data=predictor.values, columns=["File Ext"], index=predictor.index)
tmp_df.plot.pie(y='File Ext', figsize=(5, 5))

## References:

## Scikit-Learn:
### Objects:
    - (StandardScaler) https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html