In [1]:
# IMPORTS
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from enron.src.common.const import *
from enron.src.classification_algorithms.assign_management_levels import assign_management_levels_cc
from enron.src.visualization.random_baseline_enron import *

from manufacturing_company.src.common.const import *
from manufacturing_company.src.classification_algorithms.standard_classification import classification
from manufacturing_company.src.logs.standard_classification_logger import StandardClassificationLogger
from manufacturing_company.src.visualization.plot_standard_classification import PlotStandardClassification

import warnings
warnings.filterwarnings(action='once')




In [4]:
# CONST
# Number of management levels; passed to the loggers, plots and level assignment below.
levels = 2

# Read the flattened hierarchy file, keep only the flattened-position column
# (indexed by ID) and expose it under the generic POSITION column name.
df_positions = (
    pd.read_csv(ENRON_FILE_FLATTEN_HIERARCHY, sep=';', index_col=ID)
    .loc[:, [FLATTEN_POSITION]]
    .rename(columns={FLATTEN_POSITION: POSITION})
)

In [None]:
def decision_tree_params(n_features):
    """Build the hyper-parameter search grid for a decision tree.

    The keys carry a ``model__`` prefix (presumably addressing a pipeline step
    named ``model`` inside ``classification`` -- TODO confirm against that helper).

    Parameters
    ----------
    n_features : int
        Exclusive upper bound for the ``max_features`` candidates.

    Returns
    -------
    dict
        ``'model__max_depth'``    -> the integers 1..20
        ``'model__max_features'`` -> the integers 1..n_features - 1
    """
    # Plain Python ints: scikit-learn documents max_depth as "int or None", and
    # the float values np.linspace produces (1.0, 2.0, ...) are rejected by the
    # parameter validation of recent scikit-learn releases.
    max_depth = list(range(1, 21))
    # NOTE(review): the upper bound is exclusive, so "use all n_features" is never
    # tried and n_features <= 1 yields an empty candidate list -- confirm that
    # this matches what `classification` passes in.
    max_features = list(range(1, n_features))
    return {'model__max_depth': max_depth, 'model__max_features': max_features}


In [1]:
# DECISION TREE
# For every month: load that month's feature file, attach management levels,
# and run the classification helper with the decision-tree parameter grid,
# optimising macro-averaged F1. Results are recorded through `logger`.
logger = StandardClassificationLogger('enron', levels, DecisionTreeClassifier)

for month in range(MONTHS):
    features = pd.read_csv(ENRON_FILE_FEATURES.format(month), sep=';', index_col=ID)
    # Pass the positions frame built in the CONST cell (`df_positions`); the bare
    # name `positions` used previously is not assigned in any visible cell.
    # NOTE(review): only `assign_management_levels_cc` is imported explicitly;
    # `assign_management_levels` must be provided by one of the star imports --
    # confirm, or import it explicitly.
    features = assign_management_levels(levels, features, df_positions)

    models = classification(features, DecisionTreeClassifier, decision_tree_params, 'f1_macro', logger, month)


# Plot the logged results, passing `random_baseline_enron` for comparison.
plot = PlotStandardClassification(logger.directory_path, DecisionTreeClassifier, levels, random_baseline_enron)
plot.plot()


In [3]:
def random_forest_params(n_features):
    """Build the hyper-parameter search grid for a random forest.

    The keys carry a ``model__`` prefix (presumably addressing a pipeline step
    named ``model`` inside ``classification`` -- TODO confirm against that helper).

    Parameters
    ----------
    n_features : int
        Exclusive upper bound for the ``max_features`` candidates.

    Returns
    -------
    dict
        ``'model__n_estimators'`` -> [1, 2, 4, 8, 16, 32, 64, 100, 200]
        ``'model__max_depth'``    -> the integers 1..20
        ``'model__max_features'`` -> the integers 1..n_features - 1
    """
    n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
    # Plain Python ints: scikit-learn documents max_depth as "int or None", and
    # the float values np.linspace produces (1.0, 2.0, ...) are rejected by the
    # parameter validation of recent scikit-learn releases.
    max_depth = list(range(1, 21))
    # NOTE(review): the upper bound is exclusive, so "use all n_features" is never
    # tried and n_features <= 1 yields an empty candidate list -- confirm that
    # this matches what `classification` passes in.
    max_features = list(range(1, n_features))
    return {'model__n_estimators': n_estimators,
            'model__max_depth': max_depth,
            'model__max_features': max_features}


In [None]:
# RANDOM FOREST
# For every month: load that month's feature file, attach management levels,
# and run the classification helper with the random-forest parameter grid,
# optimising macro-averaged F1. Results are recorded through `logger`.

logger = StandardClassificationLogger('enron', levels, RandomForestClassifier)

for month in range(MONTHS):
    features = pd.read_csv(ENRON_FILE_FEATURES.format(month), sep=';', index_col=ID)
    # Pass the positions frame built in the CONST cell (`df_positions`); the bare
    # name `positions` used previously is not assigned in any visible cell.
    # NOTE(review): only `assign_management_levels_cc` is imported explicitly;
    # `assign_management_levels` must be provided by one of the star imports --
    # confirm, or import it explicitly.
    features = assign_management_levels(levels, features, df_positions)

    # Use the random-forest grid: the decision-tree grid passed here before has
    # no `model__n_estimators` entry, so the forest size was never searched.
    models = classification(features, RandomForestClassifier, random_forest_params, 'f1_macro', logger, month)

# Plot the logged results, passing `random_baseline_enron` for comparison.
plot = PlotStandardClassification(logger.directory_path, RandomForestClassifier, levels, random_baseline_enron)
plot.plot()
