In [1]:
# IMPORTS
from manufacturing_company.src.common.const import *
from manufacturing_company.src.classification_algorithms.standard_classification import *
from manufacturing_company.src.classification_algorithms.ModelInfo import ModelInfo
from manufacturing_company.src.logs.file_logger import *
import numpy as np
import pandas as pd
import operator
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier





In [7]:
# DECISION TREE
df_positions = pd.read_csv('manufacturing_company/data/raw/positions.csv', 
                           sep=';', comment='#')
df_positions = df_positions.set_index(ID)

all_months_score = dict()

for i in range(1, SIZE + 1):
    df_features = pd.read_csv('manufacturing_company/data/intermediate/05_features/' 
                              + str(i) + '_months_features.csv', sep=';')
    df_features = assign_management_levels(2, df_features, df_positions)
    
    models = classification(df_features, DecisionTreeClassifier, 
                   decision_tree_params, 'f1_macro')
    
    all_months_score[i] = models
    
save(all_months_score, DecisionTreeClassifier, 2)


Pct:  1.0
Used features:  ['in_degree', 'out_degree', 'betweenness', 'closeness', 'eigenvector', 'clustering_coeff', 'pagerank', 'hubs', 'authorities', 'max_clique', 'cliques_count', 'overtime', 'work_at_weekend', 'neighborhood_variability_sender', 'neighborhood_variability_recipient', 'neighborhood_variability_all']


Use os.path.join(memory.location, 'joblib') attribute instead.
  if memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if memory.cachedir is None:


Best score: 0.4683192103926381
Sorted:  [('eigenvector', 0.3766399137478159), ('authorities', 0.27875591818042417), ('clustering_coeff', 0.26575619731284394), ('neighborhood_variability_all', 0.07284428668004955), ('neighborhood_variability_sender', 0.00600368407886657), ('in_degree', 0.0), ('out_degree', 0.0), ('betweenness', 0.0), ('closeness', 0.0), ('pagerank', 0.0), ('hubs', 0.0), ('max_clique', 0.0), ('cliques_count', 0.0), ('overtime', 0.0), ('work_at_weekend', 0.0), ('neighborhood_variability_recipient', 0.0)]
Diff:  2
Reduced features:  ['in_degree', 'out_degree', 'betweenness', 'closeness', 'clustering_coeff', 'pagerank', 'hubs', 'max_clique', 'cliques_count', 'overtime', 'work_at_weekend', 'neighborhood_variability_sender', 'neighborhood_variability_recipient', 'neighborhood_variability_all']
Pct:  1.0
Used features:  ['in_degree', 'out_degree', 'betweenness', 'closeness', 'eigenvector', 'clustering_coeff', 'pagerank', 'hubs', 'authorities', 'max_clique', 'cliques_count', 'o

Use os.path.join(memory.location, 'joblib') attribute instead.
  if memory.cachedir is None:
Use os.path.join(memory.location, 'joblib') attribute instead.
  if memory.cachedir is None:


Best score: 0.4315224317441613
Sorted:  [('clustering_coeff', 0.42595898353248457), ('authorities', 0.30827186494883746), ('eigenvector', 0.14944560048048275), ('neighborhood_variability_all', 0.08379621794161653), ('neighborhood_variability_sender', 0.03252733309657864), ('in_degree', 0.0), ('out_degree', 0.0), ('betweenness', 0.0), ('closeness', 0.0), ('pagerank', 0.0), ('hubs', 0.0), ('max_clique', 0.0), ('cliques_count', 0.0), ('overtime', 0.0), ('work_at_weekend', 0.0), ('neighborhood_variability_recipient', 0.0)]
Diff:  2
Reduced features:  ['in_degree', 'out_degree', 'betweenness', 'closeness', 'eigenvector', 'pagerank', 'hubs', 'max_clique', 'cliques_count', 'overtime', 'work_at_weekend', 'neighborhood_variability_sender', 'neighborhood_variability_recipient', 'neighborhood_variability_all']
Pct:  1.0
Used features:  ['in_degree', 'out_degree', 'betweenness', 'closeness', 'eigenvector', 'clustering_coeff', 'pagerank', 'hubs', 'authorities', 'max_clique', 'cliques_count', 'overt

In [6]:
def assign_management_levels(levels, df_employees, df_positions):
    if levels == 2:
        df_employees[POSITION] = 2
        management_level = df_positions.index
        df_employees.loc[df_features[ID].isin(management_level), POSITION] = 1
    elif levels == 3:
        df_employees[POSITION] = 3
        
        first_management_level = df_positions[df_positions[POSITION] == 1].index
        second_management_level = df_positions[df_positions[POSITION] == 2].index
        
        df_employees.loc[df_features[ID].isin(first_management_level), POSITION] = 1
        df_employees.loc[df_features[ID].isin(second_management_level), POSITION] = 2
    else:
        raise Exception("Unsupported number of levels")
    
    return df_employees


In [5]:
def classification(df, algorithm, parameters, cv_scorer):
    models = []
    pcts = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
    
    df = df.set_index(ID)
    
    X = df.loc[:, df.columns != POSITION]
    y = df[POSITION]

    feature_names = X.columns.tolist()
    
    for pct in pcts:
        print('Pct: ', pct)
        
        print('Used features: ', feature_names)
        
        model = train(algorithm, X[feature_names], y, cv_scorer, 
                      parameters(len(feature_names)))
        
        print('Best score: ' + str(model.best_score_))
        
        modelInfo = ModelInfo(model, parameters, cv_scorer, feature_names, pct)
        models.append(modelInfo)
        
        sorted_features = sorted(
            dict(zip(feature_names, model.best_estimator_.steps[1][1].
            feature_importances_)).items(), key=operator.itemgetter(1), reverse=True)
        
        print('Sorted: ', sorted_features)
        
        diff = len(feature_names) - round(len(X.columns) * (pct - 0.1))
        
        print('Diff: ', diff)
        
        for i in range(0, diff):
            feature_to_delete = sorted_features[i][0]
            feature_names.remove(feature_to_delete)
            
        print('Reduced features: ', feature_names)
        
        return models


In [3]:
def decision_tree_params(feature_nr):
    # max_depth = np.arange(100) + 1
    # max_features = list(range(1, feature_nr))
    max_depth = [3]
    max_features = [3]
    return {'model__max_depth': max_depth, 'model__max_features': max_features}


In [7]:
def random_forest_params(feature_nr):
    n_estimators = [2, 3, 5, 10, 15, 20, 40, 60, 80, 100]
    max_depth = [1, 2, 3, 4, 5, 10, 15, 20, 25]
    max_features = list(range(1, feature_nr))

    parameters = {'model__n_estimators': n_estimators, 
                  'model__max_depth': max_depth, 
                  'model__max_features': max_features}
    return {'model__n_estimators': n_estimators, 
            'model__max_depth': max_depth, 
            'model__max_features': max_features}


In [40]:
import datetime
now = datetime.datetime.now()
now.strftime("%Y-%m-%d_%H:%M:%S" + '_' + DecisionTreeClassifier.__name__ + '_2_levels')

'2019-06-03_08:16:32_DecisionTreeClassifier_2_levels'

In [31]:
X = df_features.loc[:, df_features.columns != POSITION]
y = df_features.loc[:,POSITION]

In [7]:
all_months_score[2]

[<manufacturing_company.src.classification_algorithms.ModelInfo.ModelInfo at 0x1086247f0>]