In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict 
import time
%matplotlib inline 

#### Reading dataset

In [45]:
# Read the four CSV exports found under `loc`; rows containing any missing
# value are dropped right after loading.
loc = './'
mapping, likes, attr, hap = (
    pd.read_csv(loc + csv_name).dropna()
    for csv_name in ('comments_employee_mapping.csv',
                     'comments_likeability.csv',
                     'employee_attrition.csv',
                     'happiness_level.csv')
)

# Convert the date columns from text to pandas datetimes.
for frame, date_col in ((attr, 'lastParticipationDate'),
                        (mapping, 'commentDate'),
                        (hap, 'voteDate')):
    frame[date_col] = pd.to_datetime(frame[date_col], infer_datetime_format=True)

# Add an `id` key to every table: the `employee` value rendered as a string,
# directly followed by `companyAlias`.
for frame in (mapping, likes, attr, hap):
    frame['id'] = frame['employee'].map(str) + frame['companyAlias']



#### Mapping employee and comment ids to ranges of integers

In [46]:
# Sorted unique keys: employees come from the attrition table, comments from
# the comment/employee mapping table.
employee_ids = np.unique(attr['id'].values)
comment_ids = np.unique(mapping['commentId'].values)

# Give every key a dense integer code, starting at 1 (not 0).
map_employee = OrderedDict(zip(employee_ids, np.arange(1, len(employee_ids) + 1)))
map_comment = OrderedDict(zip(comment_ids, np.arange(1, len(comment_ids) + 1)))

# Sanity check: each map should hold exactly one entry per unique key.
for lookup, keys in ((map_employee, employee_ids), (map_comment, comment_ids)):
    print(len(lookup), len(np.unique(keys)))

4377 4377
38993 38993


#### Finding goodness scores of the comments

In [117]:
# Goodness score of a comment:
#   * the author counts as one implicit "like" of their own comment;
#   * a like by an employee who stayed, or a dislike by one who left, adds +1;
#   * a dislike by an employee who stayed, or a like by one who left, adds -1;
#   * the total is divided by (number of reaction rows + 1).
#
# The lookup tables below are built once, so the main loop no longer re-scans
# the `mapping`, `likes` and `attr` frames for every comment and every reaction.
start = time.time()
list_of_unique_comments = list(map_comment)

# Author of a comment = `id` on the FIRST mapping row carrying that commentId.
comment_author = {}
for cid, author in zip(mapping['commentId'], mapping['id']):
    comment_author.setdefault(cid, author)

# An employee "stayed" when `stillExists` is truthy on their LAST attr row
# (later rows overwrite earlier ones while the dict is filled).
employee_stayed = {
    emp: bool(flag) for emp, flag in zip(attr['id'], attr['stillExists'])
}

# All reaction rows grouped by comment, kept in their original row order.
reactions_by_comment = {}
for cid, voter, liked, disliked in zip(
        likes['commentId'], likes['id'], likes['liked'], likes['disliked']):
    reactions_by_comment.setdefault(cid, []).append((voter, liked, disliked))

goodness_score = {}
for i, comment_id in enumerate(list_of_unique_comments):
    employee_who_made = comment_author[comment_id]
    reactions = reactions_by_comment.get(comment_id, [])

    # assuming employee who made the comment likes it
    current_goodness_score = 1 if employee_stayed[employee_who_made] else -1

    # An employee that is missing from `attr` raises KeyError here; nothing
    # is skipped silently.
    for voter, liked, disliked in reactions:
        direction = 1 if employee_stayed[voter] else -1
        if liked:
            current_goodness_score += direction
        if disliked:
            current_goodness_score -= direction

    goodness_score[comment_id] = current_goodness_score / float(len(reactions) + 1)
    if i % 1000 == 0:
        print(i, employee_who_made)
print(time.time() - start)

0 307_56aec740f1ef260003e307d6
1000 91_56aec740f1ef260003e307d6
2000 432_56aec740f1ef260003e307d6
3000 271_56aec740f1ef260003e307d6
4000 16_56aec740f1ef260003e307d6
5000 90_56aec740f1ef260003e307d6
6000 39_5370af43e4b0cff95558c12a
7000 125_57bb2f0b3bae540003a8d453
8000 62_567011c035dce00003a07fa4
9000 31_5809cc9eff2ea40003fda44d
10000 129_581b08041a0ef8000308aef6
11000 15_574c423856b6300003009953
12000 38_57908a2622881200033b34d7
13000 219_54e52607e4b01191dc064966
14000 996_54e52607e4b01191dc064966
15000 709_54e52607e4b01191dc064966
16000 267_54e52607e4b01191dc064966
17000 80_54e52607e4b01191dc064966
18000 165_54e52607e4b01191dc064966
19000 139_57dd2d6a4018d9000339ca43
20000 99_57dd2d6a4018d9000339ca43
21000 162_5641f96713664c000332c8cd
22000 146_5641f96713664c000332c8cd
23000 425_5641f96713664c000332c8cd
24000 6_57c4aa7dbb8b5c000396fd3b
25000 4_56e2a905e3b6fe0003e32855
26000 6_56e2a905e3b6fe0003e32855
27000 380_574c5ade56b6300003009965
28000 341_574c5ade56b6300003009965
29000 99_574c5

In [125]:
# Persist the score dict to disk; it is read back from
# 'comment_goodness_scores.npy' in the following cells.
np.save(file='comment_goodness_scores', arr=goodness_score)

In [128]:
# The saved object is a plain dict, which NumPy stores as pickled data. Recent
# NumPy versions refuse to load pickled data unless allow_pickle=True is given;
# .item() then unwraps the dict.
# NOTE: unpickling can execute arbitrary code - only load files you created.
c=np.load('comment_goodness_scores.npy', allow_pickle=True).item()

In [130]:
# Round-trip check: the dict loaded into `c` should equal the in-memory
# `goodness_score`; the comparison result is displayed as the cell output.
c==goodness_score

True

#### Building employee - comment reaction matrix

In [13]:
# Employee x comment reaction matrix:
#    1 -> the employee liked the comment
#   -1 -> a reaction row exists but `liked` is not True
#    0 -> no reaction recorded
#
# map_employee / map_comment hand out 1-based codes, so the matrix gets one
# extra row and one extra column. Index 0 on either axis stays unused, and
# d[map_employee[e], map_comment[c]] is always in bounds.
d = np.zeros([len(employee_ids) + 1, len(comment_ids) + 1])

# Walk over EVERY reaction row. Iterating the columns directly (instead of
# likes[...][i]) avoids label lookups that break on the index gaps left by
# dropna().
skipped_reactions = 0
for emp_key, com_key, liked in zip(likes['id'], likes['commentId'], likes['liked']):
    emp_idx = map_employee.get(emp_key)
    com_idx = map_comment.get(com_key)
    if emp_idx is None or com_idx is None:
        # Reaction refers to an employee/comment that has no integer code.
        skipped_reactions += 1
        continue
    d[emp_idx, com_idx] = 1 if liked == True else -1

if skipped_reactions:
    print('reactions skipped (unknown employee or comment):', skipped_reactions)

In [14]:
# Spot checks: one employee's integer code, then a few matrix entries
# addressed by (employee key, comment key).
print(map_employee['51756aec740f1ef260003e307d6'])

for emp_key, com_key in (
        ('2456aec740f1ef260003e307d6', '58d0179ae010990004e3806d'),
        ('15256aec740f1ef260003e307d6', '58cfefeee010990004e37f60'),
        ('3456aec740f1ef260003e307d6', '58d018d7e010990004e38070')):
    print(d[map_employee[emp_key], map_comment[com_key]])

3226
-1.0
1.0
0.0


#### Creating Columns for Liking/Disliking Comments Based on Goodness Scores

In [None]:
# Reload the {commentId: score} dict and reshape it into a two-column frame.
# allow_pickle=True is required by recent NumPy versions to load pickled data.
# NOTE: unpickling can execute arbitrary code - only load files you created.
gscore = np.load('comment_goodness_scores.npy', allow_pickle=True).item()
# Materialise the dict views as lists so pandas receives plain sequences.
gscore = pd.DataFrame({'commentId': list(gscore.keys()), 'score': list(gscore.values())})
gscore.head()

In [None]:
# Per employee, accumulate goodness-weighted reaction totals in four buckets:
#   good_like    += score   liked a comment whose score is > 0
#   good_dislike -= score   did not like a comment whose score is > 0
#   bad_like     += score   liked a comment whose score is <= 0
#   bad_dislike  -= score   did not like a comment whose score is <= 0
# A comment with no entry in `gscore` counts as score 0.
reaction_columns = ['good_like', 'good_dislike', 'bad_like', 'bad_dislike']

# commentId -> score lookup, built once instead of filtering `gscore` for
# every single reaction.
score_by_comment = dict(zip(gscore['commentId'], gscore['score']))

# One row per unique employee in `attr`, in np.unique (sorted) order.
employee_order = np.unique(attr['id'])
totals = {emp: dict.fromkeys(reaction_columns, 0) for emp in employee_order}

# Single pass over all reactions, in their original row order.
for emp, comment_id, liked in zip(likes['id'], likes['commentId'], likes['liked']):
    emp_totals = totals.get(emp)
    if emp_totals is None:
        # Reaction by an employee that is not listed in `attr`: not reported.
        continue
    score = score_by_comment.get(comment_id, 0)
    quality = 'good' if score > 0 else 'bad'
    if liked == True:
        emp_totals[quality + '_like'] += score
    else:
        emp_totals[quality + '_dislike'] -= score

# Build the result frame in one go (no per-employee one-row frames).
comment_reac = pd.DataFrame(
    [dict(totals[emp], id=emp) for emp in employee_order],
    columns=['id'] + reaction_columns,
)

In [None]:
# Store the per-employee reaction table as a pickle next to the notebook.
comment_reac.to_pickle(path='./goodness_classified_matrix.pkl')

In [None]:
# Usage (reload the saved reaction table later):
# comment_reac = pd.read_pickle('./goodness_classified_matrix.pkl')
