In [7]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
import pickle as pkl
import os

## Concatenated.csv -- 20K unique rows

In [11]:
path = os.getcwd()
data_dir = path + '/data/'

In [12]:
reviews = pd.read_csv(data_dir + 'concatenated.csv',names=["gender","review"])

In [13]:
reviews = reviews.iloc[1:]
reviews = reviews.drop_duplicates(subset='review',keep='first')
reviews.shape

(20010, 2)

In [14]:
reviews.head()

Unnamed: 0,gender,review
0.0,M,Patient and wildly talented. He was meticulous and extremely professional. He did everything imaginable to make us happy. What a job well done. Thank you so very much.
1.0,M,A patient blessing of a saint. Thank you Piere!
2.0,M,"Honestly, only can HIGHLY recommend Piere. Very easy to communicate with, takes direction great and goes above and beyond to make sure his client is happy!"
3.0,M,Piere is very good with intricate designs. He completely read my mind. I am very happy with the work done.
4.0,M,"Piere was a delight to work with and he captured everything I envisioned with just his first draft. Highly talented, and I will use him again. Thanks again Piere! I love the cover for my new book!"


In [15]:
reviews.index = reviews.index.astype(int)

In [16]:
invalid_rows = []
for i in range(reviews.shape[0]):
    #print(reviews.iloc[i].review)
    if isinstance(reviews.iloc[i].review,float):
        invalid_rows.append(i)

In [17]:
invalid_rows

[12768]

In [18]:
reviews = reviews.drop(294689,axis=0)

In [19]:
for i in range(reviews.shape[0]):
    #print(reviews.iloc[i].review)
    if isinstance(reviews.iloc[i].review,float):
        print("oh no")

In [20]:
print(reviews.shape)

(20009, 2)


## Flagged word exploration

In [22]:
flagged_words = pd.read_excel(data_dir + 'flagged_words.xlsx',header=None,names=["phrase"])

In [23]:
flagged_words.head(10)

Unnamed: 0,phrase
0,ABLE
1,able to cope with stress
2,able to delegate
3,able to motivate their employees
4,able to work in teams
5,ABRASIVE
6,ABUSIVE
7,Academic
8,ACADEMICALLY STRONG
9,ACCELERATES OTHERS' CAREERS


In [24]:
for row in flagged_words.phrase:
    print(row)

ABLE
able to cope with stress
able to delegate
able to motivate their employees
able to work in teams
ABRASIVE
ABUSIVE
Academic
ACADEMICALLY STRONG
ACCELERATES OTHERS' CAREERS
ACCOMMODATING
ACHIEVEMENT
ACHIEVING GOALS AND TASKS
ACTING DECISIVELY
ACTING DECISIVELY TO REMOVE IMPEDIMENTS TO WORK PERFORMANCE
ACTIVE
ACTS AS A LEADER
ADAPTABLE
Adventurer
ADVENTUROUS
ADVOCATE
AFFECTED
AFFECTING OTHERS
AFFECTING OTHERS IN POSITIONS OF HIGHER RANK
AFFECTIONATE
Aggressive
AGREEABLE
Airhead
ALOOF
ALTRUISTIC
always WILLING TO HELP
Ambitious
ANALYTICAL
ANALYZING
ANGRY
ANXIOUS
APATHETIC
APPEAL TO EMOTION
APPRECIATIVE
APPROACHABLE
ARGUMENTATIVE
ARROGANT
ARTICULATE
ASK OTHERS BEFORE MAKING DECISIONS
ASK OTHERS TO FOLLLOW RULES
ASKS THE GROUP MEMBERS TO FOLLOW RULES
ASSERTIVE
ASSISTING
ATTRACTIVE
AUTHORIZING OTHERS
AUTOCRATIC
AUTONOMOUS
AUTONOMY
battle
BEAT AROUND THE BUSH
BENEVOLENT
BLUNT
boast
boast too much
BOASTFUL
Bolshy
BOSSY
BOY
BRAGGING
Brave
Breadwinner
Breathless
Bright
Brilliant
BRUSQUE
bubb

## Filter reviews that contain flagged words

In [25]:
count = 0
for review in reviews.review:
    #print(review)
    for phrase in flagged_words.phrase:
        #print(phrase)
        if phrase in review:
            count+=1
count

5040

## More Advanced Filtration

In [26]:
import nltk
import random
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [27]:
count = 0
words = set()
for phrase in flagged_words.phrase:
    if len(phrase.split())==1:
        single_word = phrase.lower()
        lemmatized_word = wordnet_lemmatizer.lemmatize(single_word)
        if lemmatized_word in words:
            continue
        words.add(lemmatized_word)
        count+=1
print(count)
for i, val in enumerate(random.sample(words, 10)):
    print(val)

570
pushy
comfortable
secretive
task
overcome
adaptable
orderly
distant
supporting
cheerful


In [31]:
words = list(words)

In [33]:
pkl.dump(words, open(data_dir + 'flagged_words_single.p','wb'))

In [54]:
count = 0
flagged_reviews = []
for review in reviews.review:
    #print(review)
    for word in words:
        #print(phrase)
        if word in review.lower():
            flagged_reviews.append(review)
            count+=1
            break
count

14100

In [56]:
flagged_reviews[:10]

['Patient and wildly talented. He was meticulous and extremely professional. He did everything imaginable to make us happy. What a job well done. Thank you so very much.',
 'Honestly, only can HIGHLY recommend Piere. Very easy to communicate with, takes direction great and goes above and beyond to make sure his client is happy!',
 'Piere was a delight to work with and he captured everything I envisioned with just his first draft. Highly talented, and I will use him again. Thanks again Piere! I love the cover for my new book!',
 "Piere is a fantastic artist and extremely pleasant to work with. He was very responsive and easily understood what I was looking for. I can't recommend him enough.",
 'Outstanding work!! Very easy to work with!',
 'This artist was great to work with. Im happy with his art and plan to work with him again hopefully soon.',
 'Incredible style, excellent communicator and fantastic design choices!',
 "Piere was wonderfully patient on this project. There were several

---
# Performance Analysis Review

### TODO: wrangle the reviews out of the corresponding columns

In [None]:
usecols = ['Person 1: please comment on job performance during the last year.',
       'Person 1: please comment on how this person has worked with others.',
       'Person 1: please suggest areas of improvement and targets for the next year.',
       'Person 2: please comment on job performance during the last year.',
       'Person 2: please comment on how this person has worked with others.',
       'Person 2: please suggest areas of improvement and targets for the next year.',
       'Person 3: please comment on job performance during the last year.',
       'Person 3: please comment on how this person has worked with others.',
       'Person 3: please suggest areas of improvement and targets for the next year.']

In [None]:
PRA = pd.read_csv('Performance Review Analysis.csv',header=0)

In [None]:
PRA.head()

In [None]:
PRA.columns

## NLTK -- not working yet

In [None]:
from nltk.parse.corenlp import CoreNLPDependencyParser

In [None]:
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [None]:
parse, = dep_parser.raw_parse("This is a sunny day")

## Stanford NLP

#### need to wrangle this more: use indices to identify the noun

#### uncomment below to install package, download English

In [None]:
# !pip install stanfordnlp
import stanfordnlp
# stanfordnlp.download('en')

In [None]:
nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos,lemma,depparse', lang='en')
doc = nlp("Dorarpol is an extremely talented designer. Not only does he come up with good ideas on his own, he takes his client's feedback into account. All the work he has done for us has been stellar. I can not speak highly enough of him.")

print(*[f"index: {word.index.rjust(2)}\tword: {word.text.ljust(11)}\tgovernor index: {word.governor}\tgovernor: {(doc.sentences[0].words[word.governor-1].text if word.governor > 0 else 'root').ljust(11)}\tdeprel: {word.dependency_relation}" for word in doc.sentences[0].words], sep='\n')

In [None]:
doc