In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS, CountVectorizer
import re
from random import seed
from sklearn.cluster import KMeans

In [3]:
def get_word_window(sentence, phrase, w_left=0, w_right=0):
    """Return the words around the first word of ``sentence`` matching ``phrase``.

    ``sentence`` is split on whitespace and every resulting word is tested
    with ``re.search(phrase, word)``. The window is built around the first
    word that matches.

    Parameters
    ----------
    sentence : str
        Text to search.
    phrase : str
        Regular expression applied to each whitespace-separated word.
    w_left : int
        Number of words to keep before the matched word.
    w_right : int
        Number of words to keep after the matched word.

    Returns
    -------
    list of str
        The matched word together with up to ``w_left`` words before and up
        to ``w_right`` words after it (clipped at the sentence boundaries),
        or ``[""]`` when no word matches.
    """
    w_lst = sentence.split()
    for i, token in enumerate(w_lst):
        if re.search(phrase, token):
            left = max(0, i - w_left)
            # The slice end is exclusive, so +1 is needed to keep the matched
            # word itself plus w_right words after it.
            right = min(len(w_lst), i + w_right + 1)
            return w_lst[left:right]
    # No word matched: keep the historical placeholder so that callers can
    # still ' '.join() the result.
    return [""]
def word(x):
    """Return the entry at position ``x`` of the module-level ``terms`` sequence."""
    # `terms` is resolved at call time, so it must be defined before this is called.
    vocabulary = terms
    return vocabulary[x]

In [4]:
# Stop words: the standard English words to ignore, plus manually selected
# words that appeared in too many reviews to be useful.
stop_words = ENGLISH_STOP_WORDS | {"good", "great", "place"}
data = pd.read_csv('PSC_Training_Dataset.csv')
# One entry per row of the "answer" column.
corpus = data["answer"].tolist()

In [5]:
clusters = 5
# Build a vocabulary of 2- to 4-word phrases (min_df=2, at most 2000 features)
# and store the TF-IDF weighted document-term matrix in X.
tfid = TfidfVectorizer(input='content', lowercase=True, stop_words=stop_words, min_df=2, ngram_range=(2, 4), max_features=2000)
X = tfid.fit_transform(corpus)
# Cluster the answers with k-means; random_state is fixed so reruns give the
# same clusters.
km = KMeans(n_clusters=clusters, init='k-means++', tol=1e-6, n_init=100, random_state=1)
km.fit(X)

# For every cluster: feature indices ordered from the largest to the smallest
# centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Call form with a single argument prints the same text on Python 2 and is
# valid syntax on Python 3 (the bare `print x` statement is not).
print(km.inertia_)
# `terms` must stay a module-level name: word() reads it.
terms = tfid.get_feature_names()
# One column per cluster, one row per rank: the 10 highest-weighted phrases,
# with feature indices translated to their text.
df = pd.DataFrame(np.transpose(order_centroids[:, :10])).applymap(word)
df

1669.6519778


Unnamed: 0,0,1,2,3,4
0,work life,people work,health insurance,work environment,upper management
1,life balance,job security,health insurance expensive,long hours,upper management does
2,work life balance,large company,3000 deductible,pressure work environment,upper management don
3,standard work,furloughs layoffs,insurance expensive,pressure work,management does
4,terrible work,big company,high deductible,years time,direct management
5,promotionshorrible insurance,short term,health insurance isn,work long,management don
6,flexible schedule,company work,insurance isn,work long hours,corporate management
7,life balance poor,care employees,insurance terrible,pay work,paid bills adversarial
8,work life balance poor,cost cutting,dental insurance,id management,paid bills adversarial environment
9,balance poor,layoffs furloughs,super expensive,stressful work,people upper


In [6]:
## manually selected themes based on the 10 closest phrases in each cluster
theme = ["Balance", "People", "Insurance", "Environment", "Management"]
## one regular expression per theme (same order as `theme`); raw strings so
## that the \w escapes reach the `re` module unchanged
groups = [r"balanc\w*",
          r"(people|employee\w*)",
          r"(insurance\w*|deductible\w*)",
          r"(environment\w*|hours)",
          r"manag\w*"]

## additional stop words for each theme (same order as `theme`); they are
## ignored, on top of the general stop words, when counting that theme's words
special_stop_words = [["work", "life", "balance"],
                      ["people", "employee", "employees"],
                      ["health", "dental", "insurance", "benefits", "deductible"],
                      ["work", "environment", "hours"],
                      ["management", "manager", "managing", "upper", "senior", "managers"]]

In [10]:
df.columns = theme
## export the top terms of each theme as CSV to be used in the PBI file
df.to_csv("top_terms.csv", index=False)

## get the top 10 words from the subset of reviews containing each theme
# The patterns in `groups` are lower-case and CountVectorizer lower-cases the
# excerpts anyway, so match against lower-cased text; a case-sensitive filter
# would silently drop reviews such as "Management is ...".
lowered_corpus = [x.lower() for x in corpus]
for i, g in enumerate(groups):
    comments = [x for x in lowered_corpus if re.search(g, x)]
    excerpt = [' '.join(get_word_window(x, g, 3, 3)) for x in comments]
    # Loop-local names are used below so that the notebook-level `stop_words`,
    # `X`, `terms` (read by word()) and `data` are not overwritten.
    theme_stop_words = ENGLISH_STOP_WORDS.union(set(["work", "place"]), set(special_stop_words[i]))
    cv = CountVectorizer(input='content', lowercase=True, stop_words=theme_stop_words, ngram_range=(1, 2), max_features=2000)
    counts = cv.fit_transform(excerpt)

    # total number of occurrences of every term over all excerpts
    sums = counts.sum(axis=0)
    theme_terms = cv.get_feature_names()
    # connect each term to its summed frequency
    rows = [(theme[i], term, sums[0, col]) for col, term in enumerate(theme_terms)]

    # 'rank' holds the summed frequency; most frequent terms come first
    ranking = pd.DataFrame(rows, columns=["theme", 'term', 'rank']).sort_values('rank', ascending=False)

    # export top words for each theme to be used in PBI
    ranking.head(10).to_csv("top_words_%s.csv" % theme[i], index=False)