In [None]:
%pip install mlxtend --upgrade

In [67]:
import pandas as pd
from datetime import datetime
import seaborn as sns

import re
import numpy as np
from collections import Counter

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score, precision_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, make_scorer
from sklearn.compose import TransformedTargetRegressor

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import matplotlib.pyplot as plt

# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# from keras import Input, Model, optimizers, callbacks
# from keras.layers import Bidirectional, LSTM, Dense, Concatenate
# from keras import backend as K

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [121]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/final_combind.csv")
# df = df.drop(['Unnamed: 0'], axis=1)
# df = df[df['Year']>2019]
df = df.dropna()
df = df[df['Number of Workers']>0]
# df[df['Number of Workers']>16000]
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Quarter,Number of Workers,Number of Layoffs,revenue,costOfRevenue,grossProfit,grossProfitRatio,ResearchAndDevelopmentExpenses,...,freeCashFlow,employee_count,percent_layoff,industry_labelled,new_cases,new_cases_smoothed,new_cases_per_million,new_deaths,new_deaths_smoothed,new_deaths_per_million
count,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,...,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0,1462.0
mean,843.616963,2015.253762,2.407661,268.23461,1.441176,7777767000.0,5191190000.0,2363358000.0,0.25975,212271000.0,...,606877600.0,76890.95,0.095515,284.163475,17419.430651,17404.778889,51.49262,263.895534,262.732175,0.780093
std,486.717864,5.277959,1.111521,819.088999,1.314491,16205920000.0,12270200000.0,4969106000.0,1.331939,567436900.0,...,1609967000.0,246761.9,1.024988,171.006502,44174.395601,44896.022382,130.581491,547.688827,548.367826,1.619003
min,0.0,2000.0,1.0,1.0,1.0,-1500000000.0,-288047000.0,-6337000000.0,-48.967742,0.0,...,-7297000000.0,4.0,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,421.25,2012.0,1.0,62.0,1.0,649925500.0,324155500.0,117200200.0,0.134756,0.0,...,-8755250.0,5728.5,0.00164,124.25,0.0,0.0,0.0,0.0,0.0,0.0
50%,857.0,2017.0,2.0,111.0,1.0,2716686000.0,1475294000.0,597497000.0,0.275973,0.0,...,95706500.0,21566.0,0.005939,307.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1263.75,2020.0,3.0,231.0,1.0,8617500000.0,4663750000.0,2229000000.0,0.445867,21664500.0,...,671000000.0,70447.0,0.021481,419.75,2404.763889,1538.313444,7.108639,46.208333,25.271847,0.136639
max,1686.0,2022.0,4.0,16337.0,16.0,152859000000.0,115838000000.0,37021000000.0,2.229273,3948000000.0,...,11221000000.0,2300000.0,32.5,578.0,287196.233333,298754.488933,848.964922,2236.066667,2291.728556,6.609944


In [104]:
def forward_elimination_feature_selection(inp_df, num_features):
    input_df = inp_df.drop(columns={'Number of Workers','percent_layoff','Unnamed: 0','Number of Layoffs'})
    X = input_df
    y = inp_df['Number of Workers']

    #Define Sequential Forward Selection (sfs)
    sfs = SFS(GradientBoostingRegressor(),
            k_features=num_features,
            forward=True,
            floating=False,
            scoring = 'r2',
            cv = 0)
    #Use SFS to select the top 5 features 
    sfs.fit(X, y)

    #Create a dataframe for the SFS results 
    df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
    
    #plot
    fig = plot_sfs(sfs.get_metric_dict(), kind='std_err', figsize=(25,10))
    plt.title('Sequential Forward Selection (w. StdErr)')
    plt.grid()
    plt.show()
    df_SFS_results = df_SFS_results.get(["avg_score","feature_names"]).reset_index().rename(columns={'index': 'label'})
    df_SFS_results['label'] = 'Label_' + df_SFS_results['label'].astype(str)
    return df_SFS_results


In [130]:
y = df['Number of Workers']
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
sfs_results = forward_elimination_feature_selection(X_train, 15)

In [94]:
sfs_results['feature_names'].values

array([('goodwillAndIntangibleAssets',),
       ('operatingIncomeRatio', 'goodwillAndIntangibleAssets'),
       ('operatingIncomeRatio', 'goodwillAndIntangibleAssets', 'otherWorkingCapital'),
       ('operatingIncomeRatio', 'goodwillAndIntangibleAssets', 'otherWorkingCapital', 'effectOfForexChangesOnCash'),
       ('operatingIncomeRatio', 'goodwillAndIntangibleAssets', 'otherWorkingCapital', 'effectOfForexChangesOnCash', 'capitalExpenditure'),
       ('operatingIncomeRatio', 'goodwill', 'goodwillAndIntangibleAssets', 'otherWorkingCapital', 'effectOfForexChangesOnCash', 'capitalExpenditure'),
       ('operatingIncomeRatio', 'goodwill', 'goodwillAndIntangibleAssets', 'otherWorkingCapital', 'debtRepayment', 'effectOfForexChangesOnCash', 'capitalExpenditure'),
       ('depreciationAndAmortization', 'operatingIncomeRatio', 'goodwill', 'goodwillAndIntangibleAssets', 'otherWorkingCapital', 'debtRepayment', 'effectOfForexChangesOnCash', 'capitalExpenditure'),
       ('depreciationAndAmortizati

In [123]:
# features = ['depreciationAndAmortization', 'operatingIncomeRatio', 'totalOtherIncomeExpensesNet', 'incomeBeforeTax', 'goodwill', 'goodwillAndIntangibleAssets', 'longTermDebt', 'accumulatedOtherComprehensiveIncomeLoss', 'otherWorkingCapital', 'acquisitionsNet', 'salesMaturitiesOfInvestments', 'debtRepayment', 'effectOfForexChangesOnCash', 'capitalExpenditure', 'employee_count']
features = ['grossProfit', 'weightedAverageShsOutDil', 'otherCurrentAssets', 'intangibleAssets', 'goodwillAndIntangibleAssets', 'longTermInvestments', 'totalNonCurrentLiabilities', 'netDebt', 'inventory_cash-flow-statement', 'employee_count']
# features = ['otherExpenses', 'EBITDA', 'inventory', 'goodwill', 'retainedEarnings']
X_train = X_train[features]
X_test = X_test[features]
# y = df['Number of Workers']

In [124]:
def do_regression_grid_search(X_train, X_test, y_train, y_test, model, param_grid, scoring, refit):
    # X_train, X_test, y_train, y_test = \
    #     train_test_split(X, y, test_size=0.2)
    wrapped_model = TransformedTargetRegressor(regressor=model, transformer=StandardScaler())

    gs = GridSearchCV(
        wrapped_model,
        param_grid=param_grid,
        scoring=scoring,
        refit=refit,
        n_jobs=-1,
        return_train_score=True,
        cv=5
    )
    gs.fit(X_train, y_train)

    y_test_hat = gs.predict(X_test)
    # The mean squared error
    print("MSE: %.2f" % mean_squared_error(y_test, y_test_hat))
    # The coefficient of determination: 1 is perfect prediction
    print("R2: %.2f" % r2_score(y_test, y_test_hat))
    print("MAPE: %.2f" % mean_absolute_percentage_error(y_test, y_test_hat))

In [125]:
scoring = {"mse": make_scorer(mean_squared_error, greater_is_better=False), "r2": make_scorer(r2_score)}
refit = "r2"

In [126]:
dtree_regr = DecisionTreeRegressor(random_state=42)

param_grid = {
    "regressor__regressor__min_samples_split": range(2, 101, 20),
    "regressor__regressor__max_depth": [5, 10, 15],
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', dtree_regr)
])

do_regression_grid_search(X_train, X_test, y_train, y_test, pipeline, param_grid=param_grid, scoring=scoring, refit=refit)

MSE: 1286924.00
R2: 0.19
MAPE: 5.22


In [127]:
rf_regr = RandomForestRegressor(random_state=42)

param_grid = {
    "regressor__regressor__n_estimators": range(2, 101, 20),
    "regressor__regressor__max_depth": [5, 10, 15],
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', rf_regr)
])

do_regression_grid_search(X_train, X_test, y_train, y_test, pipeline, param_grid=param_grid, scoring=scoring, refit=refit)

MSE: 876846.00
R2: 0.45
MAPE: 5.39


In [128]:
gbt_regr = GradientBoostingRegressor(random_state=42)

param_grid = {
    "regressor__regressor__n_estimators": range(2, 101, 20),
    "regressor__regressor__max_depth": [5, 10, 15],
    "regressor__regressor__learning_rate": [0.0001, 0.001, 0.01],
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', gbt_regr)
])

do_regression_grid_search(X_train, X_test, y_train, y_test, pipeline, param_grid=param_grid, scoring=scoring, refit=refit)

MSE: 1159028.95
R2: 0.27
MAPE: 6.15


In [129]:
lasso_regr = Lasso(random_state=42)

param_grid = {
    # "regressor__regressor__n_estimators": range(2, 101, 20),
    # "regressor__regressor__max_depth": [5, 10, 15],
    # "regressor__regressor__learning_rate": [0.0001, 0.001, 0.01],
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', lasso_regr)
])

do_regression_grid_search(X_train, X_test, y_train, y_test, pipeline, param_grid=param_grid, scoring=scoring, refit=refit)

MSE: 1587982.89
R2: -0.00
MAPE: 6.65
