# Attrition Prediction Using Node Embeddings
#### Instead of using one-hot, label, or ordinal encoders, we represent the data with node embeddings learned from a networkx graph.

In [None]:
!pip install node2vec

In [None]:
from pathlib import Path
from typing import List, Dict, Set, Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

from pandas_profiling import ProfileReport

# Relative path to the IBM HR attrition CSV that the rest of the notebook works on.
attrition_data = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df_attrition = pd.read_csv(attrition_data)
# Bare expression: lets the notebook render the freshly loaded frame.
df_attrition

In [None]:
%%time
# Build the pandas-profiling report object for the raw attrition frame.
profile = ProfileReport(df_attrition, title='Attirition profiling')

In [None]:
# %%time
# profile.to_widgets()

### Alternative way of displaying profiling report

In [None]:
# %%time
# profile.to_notebook_iframe()

#### The pandas-profiling report (Warnings tab) flags several highly correlated columns and several constant-valued columns. The dictionary below groups columns according to those warnings.

In [None]:
# Name of the label column; it is also the last entry of `selected_columns`.
target_column = 'Attrition'
selected_columns = [
    'EmployeeNumber',
    'BusinessTravel',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
    'Attrition',
]

# Columns grouped by the kind of profiling warning they triggered.
columns_grouped = dict(
    constant_columns=['EmployeeCount', 'Over18', 'StandardHours'],
    correlated_columns=[
        'JobLevel',
        'TotalWorkingYears',
        'PerformanceRating',
        'YearsWithCurrManager',
        'YearsInCurrentRole',
        'EducationField',
        'StockOptionLevel',
        'Department',
    ],
    null_columns=['YearsSinceLastPromotion'],
    other_columns=['MonthlyRate', 'DailyRate', 'HourlyRate'],
)

# Remap certain column values so that values from different columns cannot
# collide when they are later used as names.
new_column_values = {
    'OverTime': dict(Yes='yes_overtime', No='no_overtime'),
    'BusinessTravel': {
        'Travel_Rarely': 'Rarely',
        'Travel_Frequently': 'Frequently',
        'Non-Travel': 'NonTravel',
    },
}

# Apply the per-column value remapping from `new_column_values`.
df_attrition = df_attrition.replace(new_column_values)

In [None]:
# Restrict the frame to `selected_columns`; the bare expression renders it.
df_attrition = df_attrition.loc[:, selected_columns]
df_attrition

In [None]:
# Display the target column on its own.
df_attrition.loc[:, target_column]

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

# Classic tabular encoding, used later as the baseline feature set.
# The one-hot encoder is left at its default (sparse) output and densified
# explicitly below: the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so passing it breaks on current releases.
onehot_encoder = OneHotEncoder(drop='if_binary', dtype=np.int32)
ordinal_encoder = OrdinalEncoder()
label_encoder = LabelEncoder()  # for target encoding

target_column_encoded = label_encoder.fit_transform(df_attrition[target_column])

onehot_columns = ['Gender', 'JobRole', 'MaritalStatus', 'OverTime']
categorical_columns = ['Attrition', 'BusinessTravel']

onehot_array = onehot_encoder.fit_transform(df_attrition.loc[:, onehot_columns]).toarray()
# `get_feature_names` was replaced by `get_feature_names_out`; prefer the new
# name and fall back to the old one only when the new one is missing.
onehot_feature_names = (
    getattr(onehot_encoder, 'get_feature_names_out', None)
    or onehot_encoder.get_feature_names
)
onehot_encoded = pd.DataFrame(onehot_array, columns=onehot_feature_names(onehot_columns))

ordinal_array = ordinal_encoder.fit_transform(df_attrition.loc[:, categorical_columns])
ordinal_encoded = pd.DataFrame(ordinal_array, columns=categorical_columns)

# Both frames carry a fresh 0..n-1 index, so the column-wise concat is positional.
df_attrition_encoded = pd.concat([onehot_encoded, ordinal_encoded], axis=1)
df_attrition_encoded = df_attrition_encoded.astype('int64')
df_attrition_encoded


# GRAPH BUILDER

In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from typing import List
from time import time

class GraphLoader:
    """Build and draw an undirected networkx graph from DataFrame columns.

    Every distinct value of the selected columns becomes a node, labelled with
    the name of the column it came from; edges connect the values that appear
    together in a row for each requested column pair.
    """

    # Legend captions for the columns used in this notebook; any other column
    # falls back to its own name.
    _LEGEND_LABELS = {
        "EmployeeNumber": "employee no",
        "BusinessTravel": "travel",
        "Gender": "gender",
        "JobRole": "job role",
        "MaritalStatus": "marital status",
    }

    def __init__(self):
        self.graph = None
        self.title = None

    def build_graph(self,
                    dataframe: pd.DataFrame,
                    columns: List,
                    edge_list: List,
                    verbose: bool = True,
                    title: str = 'Unnamed',
                    id_column: str = 'EmployeeNumber'):
        """Create the graph, store it on ``self.graph`` and return it.

        Args:
            dataframe: Source rows.
            columns: Column names whose values become nodes.
            edge_list: Pairs ``(column_a, column_b)``; for every row an edge is
                added between the row's values in the two columns.
            verbose: When true, print the build time and a short summary.
            title: Graph name, also used as the default plot title.
            id_column: Column whose nodes receive the remaining (non-node)
                columns of their row as an ``"attributes"`` dict.

        Returns:
            The constructed ``nx.Graph``.
        """
        self.title = title
        t0 = time()
        self.graph = nx.Graph(name=self.title)

        # Add nodes to the graph. A value that occurs in several columns keeps
        # the label of the last column that contained it.
        for column in columns:
            self.graph.add_nodes_from(dataframe[column].values, label=column)

        # Add the remaining columns as node attributes of the id nodes. Optional.
        remaining = dataframe.columns.difference(columns)
        if id_column in columns:
            id_values = dataframe[id_column]
            for node, data in self.graph.nodes(data=True):
                if data["label"] == id_column:
                    row_attributes = dataframe.loc[id_values == node, remaining]
                    self.graph.nodes[node]["attributes"] = row_attributes.squeeze().to_dict()

        # Add edges.
        for _, row in dataframe.loc[:, columns].iterrows():
            for source_column, target_column in edge_list:
                self.graph.add_edge(row[source_column], row[target_column])

        if verbose:
            print(f"FINISHED in {np.round(time() - t0, 3)} seconds.")
            # nx.info() was removed in networkx 3.0; print the summary directly.
            print(
                f"Name: {self.graph.name}\n"
                f"Number of nodes: {self.graph.number_of_nodes()}\n"
                f"Number of edges: {self.graph.number_of_edges()}"
            )

        return self.graph

    def draw_graph(self, graph: nx.Graph, node_colors: dict, node: object = None, radius: int = 1) -> None:
        """Draw ``graph`` (or the ego graph around ``node``) with a colour legend.

        Args:
            graph: Graph to draw; nodes are expected to carry a ``"label"``.
            node_colors: Mapping from node label (column name) to colour.
                Labels missing from the mapping are drawn in black.
            node: When given, only the ego graph of this node is drawn.
            radius: Ego-graph radius, used only when ``node`` is given.
        """
        from matplotlib.lines import Line2D

        figure = plt.figure(figsize=(20, 12), facecolor="darkgray")
        ax = figure.add_subplot()

        # `is None` rather than truthiness, so that a node equal to 0 still
        # selects the ego-graph branch.
        if node is None:
            plt.title(self.title)
        else:
            plt.title(f"Ego Graph around the node {node}, (radius={radius})")
            graph = nx.ego_graph(graph, node, radius=radius)

        colors = [
            node_colors.get(data.get("label"), "black")
            for _, data in graph.nodes(data=True)
        ]
        nx.draw_networkx(graph, ax=ax, node_size=800, node_color=colors, with_labels=True)

        # Build the legend from whatever colours were supplied, so a partial
        # mapping does not raise and every coloured column gets an entry.
        legend_elements = [
            Line2D([0], [0], marker='o', color='w',
                   label=self._LEGEND_LABELS.get(column, column),
                   markerfacecolor=color, markersize=15)
            for column, color in node_colors.items()
        ]
        ax.legend(handles=legend_elements, loc='best')
        plt.show()

In [None]:
# Every selected column except the trailing target column becomes a node type.
nodes = selected_columns[:-1]

# Star layout: each employee is connected to each of its own attribute values.
edges = [
    ("EmployeeNumber", attribute_column)
    for attribute_column in (
        "BusinessTravel",
        "Gender",
        "JobRole",
        "MaritalStatus",
        "OverTime",
    )
]

# Plot colour per node type.
node_colors = dict(
    EmployeeNumber="dodgerblue",
    BusinessTravel="lightgreen",
    Gender="tan",
    JobRole="salmon",
    MaritalStatus="darkcyan",
    OverTime="peru",
)

In [None]:
graph_loader = GraphLoader()
# .head(5) instead of .loc[:5]: label slicing is end-inclusive and picked six
# rows, which contradicted the "5 samples" in the title.
demo_graph = graph_loader.build_graph(
            dataframe = df_attrition.loc[:, selected_columns[:-1]].head(5),
            columns = nodes,
            edge_list = edges,
            verbose = True,
            title = 'Employe Attrition graph with only 5 samples of data'
        )
# graph_loader.draw_graph(demo_graph, node_colors, node = 5, radius = 2) # show only the nodes that are at a distance of 2 edges from the employee 5.
graph_loader.draw_graph(demo_graph, node_colors) # Show the whole graph

In [None]:
# Build the graph over all rows; the target column (last entry of
# `selected_columns`) is excluded from the frame that is passed in.
graph = graph_loader.build_graph(
            dataframe = df_attrition.loc[:, selected_columns[:-1]],
            columns = nodes, 
            edge_list = edges,
            verbose = True,
            title = 'Employe Attrition Graph'
        )

In [None]:
import gensim
import networkx as nx
import pickle
from node2vec import Node2Vec
from pathlib import Path

# Embeddings and models are stored in ./embeddings under the current working
# directory; the directory is created here if it does not exist yet.
CWD = Path().cwd()
EMBEDDINGS_DIR = CWD / 'embeddings'
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): SEED is not referenced anywhere in the visible code — confirm
# whether it was meant to be passed to the random-walk / training steps.
SEED = 12

class VectorizerConfig:
    """Hyper-parameters shared by the random-walk and training cells."""

    # Passed to the random-walk generation step.
    dimensions: int = 64
    walk_length: int = 30
    num_walks: int = 50

    # Passed to the fit (training) step.
    window: int = 10
    min_count: int = 1
    batch_words: int = 4

class NodeEmbedding:
    """Small wrapper that generates walks, trains, saves and loads a node2vec model."""

    def __init__(self) -> None:
        self.vectorizer = None
        self.model = None

    def generate_random_walks(self, graph: nx.Graph, **params) -> None:
        """Create the ``Node2Vec`` object for ``graph``; ``params`` are forwarded to it."""
        self.vectorizer = Node2Vec(graph, **params)

    def fit(self, **params) -> gensim.models.Word2Vec:
        """Train the embedding model; ``params`` are forwarded to ``Node2Vec.fit``.

        Raises:
            Exception: If ``generate_random_walks()`` has not been called yet.
        """
        if self.vectorizer is None:
            raise Exception("No random walks. Generate Random walks by calling generate_random_walks() method first.")
        self.model = self.vectorizer.fit(**params)

        return self.model

    def save_model(self, model: gensim.models.Word2Vec, save_to: Path = EMBEDDINGS_DIR, prefix: str = None) -> None:
        """Write the embeddings (word2vec text format) and the full model to ``save_to``.

        Args:
            model: Model to save; when ``None`` the model stored by ``fit`` /
                ``load_model`` is used instead.
            save_to: Target directory (created if missing).
            prefix: Leading part of both file names.

        Raises:
            ValueError: If there is no model to save.
        """
        # Honour the arguments: previously `model` and `save_to` were ignored in
        # favour of `self.model` and the module-level directory.
        model = self.model if model is None else model
        if model is None:
            raise ValueError("No model to save. Call fit() or load_model() first, or pass a model.")

        target_dir = Path(save_to)
        target_dir.mkdir(parents=True, exist_ok=True)

        # NOTE: the file names are derived from VectorizerConfig, not from the
        # parameters the model was actually trained with.
        d = VectorizerConfig.dimensions
        w = VectorizerConfig.walk_length
        n = VectorizerConfig.num_walks

        embeddings_path = target_dir / f"{prefix}_embeddings_{d}_{w}_{n}.txt"
        model_path = target_dir / f"{prefix}_model_{d}_{w}_{n}.pkl"
        # Save only the embeddings in a txt file.
        model.wv.save_word2vec_format(str(embeddings_path))
        # Save the entire model.
        model.save(str(model_path))
        print(f"Model and embeddings saved to: {str(model_path)}")

    def load_model(self, model_filename: str = None, load_from: Path = EMBEDDINGS_DIR) -> gensim.models.Word2Vec:
        """Load a model written by ``save_model`` and store it on ``self.model``.

        Args:
            model_filename: File name of the saved model inside ``load_from``.
            load_from: Directory to read from.

        Raises:
            TypeError: If ``model_filename`` is not given.
            FileNotFoundError: If the file does not exist.
        """
        if model_filename is None:
            raise TypeError("model_filename is required.")

        model_path = Path(load_from) / model_filename
        if not model_path.exists():
            raise FileNotFoundError(f"NOT found: {model_path}")

        # The file was written with gensim's own `save`, so it is read back with
        # the matching `load` rather than a raw pickle.load (gensim may keep
        # large arrays in side files next to the main one).
        self.model = gensim.models.Word2Vec.load(str(model_path))
        print("Loaded Model: ", model_filename)

        return self.model


In [None]:
import multiprocessing

# Wrapper instance that will hold the Node2Vec object and the trained model.
embedder = NodeEmbedding()

In [None]:
# Set up Node2Vec on the full graph with the shared hyper-parameters,
# using one worker per CPU core.
embedder.generate_random_walks(
    graph,
    dimensions = VectorizerConfig.dimensions,
    walk_length = VectorizerConfig.walk_length,
    num_walks = VectorizerConfig.num_walks,
    workers = multiprocessing.cpu_count()
)

In [None]:
%%time
# Train the embedding model; the keyword arguments are forwarded to the fit step.
model = embedder.fit(
    window = VectorizerConfig.window,
    min_count = VectorizerConfig.min_count,
    batch_words = VectorizerConfig.batch_words
)

In [None]:
# Persist both the plain-text embeddings and the full model.
embedder.save_model(model, save_to = EMBEDDINGS_DIR, prefix = "attrition")

In [None]:
# The hard-coded file name must match prefix + VectorizerConfig values (64/30/50).
model2 = embedder.load_model("attrition_model_64_30_50.pkl", load_from = EMBEDDINGS_DIR)

In [None]:
# Read the saved embeddings file back as raw text lines.
with open(EMBEDDINGS_DIR / 'attrition_embeddings_64_30_50.txt', 'r') as embeddings_file:
    embeddings = embeddings_file.readlines()

In [None]:
# The first line of the embeddings file holds the node count and the embedding dimension, in that order.
embeddings[:5]

In [None]:
def align_features_and_target(df: pd.DataFrame, embeddings_file: str = None):
    """Return the embedding vectors of the employees in ``df``, in row order.

    Args:
        df: Frame with an ``"EmployeeNumber"`` column.
        embeddings_file: Name of a word2vec text-format file inside
            ``EMBEDDINGS_DIR`` (first line is a header, then one node per line).

    Returns:
        ``float64`` array with one row per row of ``df``.

    Raises:
        KeyError: If an employee has no vector in the file. Skipping such rows
            would silently shift the features against the target.
    """
    vector_by_node = {}
    with Path(EMBEDDINGS_DIR / embeddings_file).open(mode="r") as file:
        next(file, None)  # skip the header line
        for line in file:
            tokens = line.split()
            if not tokens:
                continue  # tolerate blank lines
            # Only the first whitespace-separated token is used as the key;
            # employee numbers contain no spaces, so this is exact for them.
            vector_by_node.setdefault(tokens[0], tokens[1:])

    # One dictionary lookup per employee instead of rescanning the whole file.
    employee_keys = [str(person) for person in df["EmployeeNumber"].values]
    missing = [key for key in employee_keys if key not in vector_by_node]
    if missing:
        raise KeyError(f"No embedding found for EmployeeNumber(s): {missing[:10]}")

    vectors = [vector_by_node[key] for key in employee_keys]
    return np.array(vectors).astype(np.float64)

In [None]:
%%time
# Collect the embedding vector of every employee from the saved text file.
embeddings_filename = "attrition_embeddings_64_30_50.txt"
vectors2 = align_features_and_target(df_attrition, embeddings_file = embeddings_filename)


In [None]:
vectors2.shape

In [None]:
# Targets come from the ordinal-encoded 'Attrition' column of the encoded frame.
# node_targets = np.array(list(map(lambda label: 1 if label == "Yes" else 0, df_attrition["Attrition"])))
node_targets = df_attrition_encoded['Attrition'].values
node_targets.shape


In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_curve, r2_score

print(f'Original target value counts: {Counter(node_targets)}')
print('Train vectors shape: ', vectors2.shape)

### OVER SAMPLING ###
print('\n<--------------OVERSAMPLING---------------->\n')
# SMOTE is currently disabled; the classifiers are trained on the plain
# stratified split of the embedding vectors.
# over_sampler = SMOTE(sampling_strategy = 1.0, k_neighbors = 5, random_state = 12)
# features, target = over_sampler.fit_resample(vectors2, node_targets)
x_train, x_test, y_train, y_test = train_test_split(
    vectors2,
    node_targets,
    test_size = 0.25,
    random_state = 12,
    stratify = node_targets
)

# These prints used `target` / `features`, which only exist when the SMOTE
# lines above are enabled, so a clean run failed with NameError. Report the
# training split that is actually used instead.
print(f'Train split target counts: {Counter(y_train)}')
print('Train split feature shape: ', x_train.shape)

classifiers = {
    'LogisticReg': LogisticRegression(),
    'SVC': SVC(),
    'SGD': SGDClassifier(),
    'GBC': GradientBoostingClassifier(),
    'kNN': KNeighborsClassifier()
}

scores = dict()
for name, classifier in classifiers.items():
    print(f'\n<------------- MODEL: {name} ----------->')
    # scores[name] = cross_validate(classifier, x_train, y_train, cv = 10, scoring = 'f1', return_train_score = False)
    classifier.fit(x_train, y_train)
    print(classification_report(y_test, classifier.predict(x_test), zero_division = 0))
    print('<------------- END ----------->\n')


In [None]:
# Average of 0.91 and 0.21 weighted by 309 and 59.
# NOTE(review): these look like per-class scores and supports copied from a
# classification report — confirm which report and metric they refer to.
(309*0.91 + 59*0.21) / (309+59)

In [None]:
%%time
# Baseline run: the same classifiers trained on the conventionally encoded frame.
target = df_attrition_encoded['Attrition'].values
features = df_attrition_encoded.drop(columns=['Attrition'])

X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=12,
    stratify=target,
)

classifiers = dict(
    LogisticReg=LogisticRegression(),
    SVC=SVC(),
    SGD=SGDClassifier(),
    GBC=GradientBoostingClassifier(),
    kNN=KNeighborsClassifier(),
)

# Fit each model on the training split and report its held-out metrics.
for name, classifier in classifiers.items():
    print(f'<------------- MODEL: {name} ----------->')
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)
    print(classification_report(Y_test, predictions, zero_division=0))
    print('<------------- END ----------->\n')

In [None]:
# Edge Embedding
from node2vec.edges import HadamardEmbedder
# Edge embeddings are derived on demand from the node vectors of the trained model.
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
# Look up the embedding of the edge between nodes '1' and '2' (string keys).
edges_embs[('1', '2')]



In [None]:
# Materialise all edge embeddings as keyed vectors and query the nearest edges.
edges_kv = edges_embs.as_keyed_vectors()
edges_kv.most_similar(str(('1', '2')))


In [None]:
#Dimensionality reduction
from sklearn.manifold import TSNE
from matplotlib import patches
import seaborn

seaborn.set_style('whitegrid')

# The model is keyed by the string form of each node (the other cells also look
# nodes up as strings), while the graph holds raw values such as integer
# employee numbers. Indexing with an int would be read as a position or a
# missing key, so convert explicitly.
embeddings = np.array([model.wv[str(node)] for node in graph.nodes])

# Project the embeddings to two dimensions for plotting.
tsne = TSNE(n_components=2, random_state=7, perplexity=15)
embeddings_2d = tsne.fit_transform(embeddings)

figure = plt.figure(figsize=(11, 9))
ax = figure.add_subplot(111)
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])

# Create team patches for legend
# team_patches = [mpatches.Patch(color=color, label=team) for team, color in team_colors.items()]
# ax.legend(handles=team_patches);


In [None]:
## Pathlib Tutorial
from pathlib import Path, PurePath, PosixPath, PurePosixPath, PureWindowsPath
# NOTE(review): iterdir() raises FileNotFoundError when '../data' does not
# exist; the existence checks only happen in the next cell.
p = Path('../data')

for file in p.iterdir():
    print(file)

In [None]:
# Basic path predicates.
print('Exists: ', p.exists())
print('is dir: ', p.is_dir())
print('is file: ', p.is_file())

In [None]:
# Recursive search for CSV files below `p`.
list(p.glob('**/*.csv'))

In [None]:
# Current working directory and the user's home directory.
print('CWD: ', Path.cwd())
print('Home: ', Path.home())

In [None]:
def test_something(**kwargs):
    """Print the type and contents of the collected keyword arguments."""
    print(type(kwargs))
    print(kwargs)


# NOTE(review): the original literal was garbled and did not parse; the keys and
# values below are a best-effort reconstruction — confirm against the intended data.
test = {
    "x": ["test", "kest", "mest"],
    "target": "haha",
    "x2": ["t", "h"],
}
test_something(a=15, b=21, c=100, d=15)