In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from numpy.linalg import svd
%matplotlib inline 

In [6]:
#load data
mapping=pd.read_csv('comments_employee_mapping.csv').dropna()
likes=pd.read_csv('comments_likeability.csv').dropna()
attr=pd.read_csv('employee_attrition.csv').dropna()
hap=pd.read_csv('happiness_level.csv').dropna()

#read dates as pandas date-type
attr['lastParticipationDate']=pd.to_datetime(attr['lastParticipationDate'],infer_datetime_format=True)
mapping['commentDate']=pd.to_datetime(mapping['commentDate'],infer_datetime_format=True)
hap['voteDate']=pd.to_datetime(hap['voteDate'],infer_datetime_format=True)

#merge employee number and company alias into id
for d in [mapping,likes,attr,hap]:
    d['id']=d['employee'].map(str)+d['companyAlias']

#linearize dates
attr['ordinal']=[x.toordinal() for x in attr['lastParticipationDate']]
mapping['ordinal']=[x.toordinal() for x in mapping['commentDate']]
hap['ordinal']=[x.toordinal() for x in hap['voteDate']]



In [22]:
l = np.array(np.load('likes_matrix_sparse_arranged.npy', encoding='latin1')[0].todense())
# l = np.load('likes_matrix_sparse_arranged.npy')[0]
c = np.array(np.load('comment_matrix_sparse_arranged.npy', encoding='latin1')[0].todense())
h=np.load('HappinessFeature.npy').tolist()
h=np.array([h[e] for e in attr.id])

g=np.load('comment_goodness_scores.npy', encoding='latin1').tolist()
l_w=np.array(np.load('likes_matrix_sparse_arranged.npy', encoding='latin1')[0].todense())
com_dict=np.load('likes_matrix_sparse_arranged.npy', encoding='latin1')[1]
for k in g:
    try:l_w[:,com_dict[g]]*=g[k]
    except:pass
l_w=l_w/((l!=0).sum(1) +1 )[:,np.newaxis]
l_w=l_w[:,(l!=0).sum(0)>5]


In [23]:
#data=np.hstack([l,c,pd.get_dummies(attr.companyAlias),(h)[:,np.newaxis],attr.numVotes[:,np.newaxis]])
data=np.hstack([l_w,pd.get_dummies(attr.companyAlias),((attr.ordinal-attr.ordinal.mean())/attr.ordinal.std())[:,np.newaxis],(h/(attr.numVotes+1))[:,np.newaxis],attr.numVotes[:,np.newaxis]])

X_train, X_test, y_train, y_test = train_test_split(
    data,attr.stillExists,stratify=attr.stillExists, test_size=0.33, random_state=42)


In [24]:
pca = PCA(n_components=1000)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [25]:
svd = TruncatedSVD(n_components=100)
svd.fit(X_train)
X_train_svd = svd.transform(X_train)
X_test_svd = svd.transform(X_test)


### Support Vector Machines

In [52]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [62]:
clf_svm = SVC(C=2500, gamma=0.0001).fit(X_train_pca, y_train)

In [63]:
print("Test score = {:.4f}".format(clf_svm.score(X_test_pca, y_test)))
print("Training score = {:.4}".format(clf_svm.score(X_train_pca, y_train)))

Test score = 0.8834
Training score = 0.8838


##### Grid search for best parameters

In [55]:
param_grid = {"C": [100, 1000, 2500, 5000, 10000, 20000], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]}
gs = GridSearchCV(SVC(), param_grid=param_grid, scoring='accuracy', cv=4)
gs = gs.fit(X_train_pca, y_train) 
print(gs.best_params_)

{'C': 20000, 'gamma': 0.0001}


In [56]:
clf_best_svm = gs.best_estimator_
clf_best_svm.fit(X_train_pca, y_train)
print(clf_best_svm.score(X_test_pca, y_test))

0.8868312757201646


In [59]:
clf_svm = SVC(C=20000, gamma=0.0001).fit(X_train_pca, y_train)

In [60]:
print("Test score = {:.4f}".format(clf_svm.score(X_test_pca, y_test)))
print("Training score = {:.4}".format(clf_svm.score(X_train_pca, y_train)))

Test score = 0.8868
Training score = 0.9027


#### SVM for data that are not reduced

In [32]:
clf_svm = SVC(C=1000).fit(X_train, y_train)

In [33]:
print("Test score = {:.4f}".format(clf_svm.score(X_test, y_test)))
print("Training score = {:.4}".format(clf_svm.score(X_train, y_train)))

Test score = 0.8793
Training score = 0.8828


#### SVM with SVD reduced data

In [64]:
clf_svm_svd = SVC(C=5000, gamma=0.0001).fit(X_train_svd, y_train)

In [65]:
print("Test score = {:.4f}".format(clf_svm_svd.score(X_test_svd, y_test)))
print("Training score = {:.4}".format(clf_svm_svd.score(X_train_svd, y_train)))

Test score = 0.8861
Training score = 0.8878
