### 1: Import the data

In [1]:
import numpy as np
import pandas as pd

import spacy
# Load the spaCy 'en_core_web_sm' pipeline once; `nlp` is called later by
# remove_punctuation() to tokenize each review.
nlp = spacy.load('en_core_web_sm')

In [2]:
# assume that this data is retrieved from the json post request
# (here it is read from a CSV file instead)
csv_path = "webScrapped.csv"
df = pd.read_csv(csv_path, skipinitialspace=True)

In [3]:
# Preview the first five rows to confirm which columns were loaded
df.head(n=5)

Unnamed: 0,label,review
0,neutral,I applied online. I interviewed at ST Engineer...
1,negative,The process took about 2 weeks. HR did not inf...
2,negative,I was given an interview date and time after s...
3,neutral,I applied online. I interviewed at ST Engineer...
4,negative,Overall HR did good job to arrange the intervi...


In [4]:
# Discard the scraped `label` column; only the review text is kept from here on.
# `columns=` is used instead of a positional axis (`df.drop("label", 1)`),
# because pandas 2.0 no longer accepts `axis` as a positional argument.
df.drop(columns="label", inplace=True)

In [5]:
# Row count first, then the frame itself
row_total = df.shape[0]
print(row_total)
print(df)

17
                                               review
0   I applied online. I interviewed at ST Engineer...
1   The process took about 2 weeks. HR did not inf...
2   I was given an interview date and time after s...
3   I applied online. I interviewed at ST Engineer...
4   Overall HR did good job to arrange the intervi...
5   I applied online. The process took 2+ months. ...
6   After an online application, they review my CV...
7   Casual chat in warm office, hiring manager hig...
8   I applied online. I interviewed at ST Engineer...
9   Applied through company's career website- HR p...
10  The process took 3+ months. I interviewed at S...
11  Was asked to attend first round and second rou...
12  I applied online. The process took 2 weeks. I ...
13  Hiring process is fast, gotten offer immediate...
14  I applied through a staffing agency. The proce...
15  Two interviews, first is panel while second is...
16  I applied online. I interviewed at ST Engineer...


### 2: Clean the data

In [6]:
# Report how many values are missing in each column, then discard incomplete rows
missing_per_column = df.isna().sum()
print(missing_per_column)
df.dropna(inplace=True)

review    0
dtype: int64


In [7]:
# Number of rows remaining after the missing-value check
print(df.shape[0])

17


In [8]:
#remove whitespace-only and empty strings
# `not review.strip()` is True for "" as well as for strings made up solely of
# whitespace. `str.isspace()` on its own returns False for "", so it would let
# empty reviews through.
blanks = [
    index
    for index, review in df['review'].items()
    if isinstance(review, str) and not review.strip()
]

# Show which index labels are about to be removed
print(blanks)

df.drop(index=blanks, inplace=True)
print(len(df))

[]
17


In [9]:
#trim the strings
# Strip leading/trailing whitespace from every string cell; non-string cells are
# passed through untouched. Each column is mapped with Series.map because
# DataFrame.applymap is deprecated in recent pandas releases, while this form
# behaves the same on old and new versions.
df = df.apply(
    lambda column: column.map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )
)

In [10]:
#remove punctuation (keeps only alphabetic and digit tokens)
def remove_punctuation(text):
    """Rebuild `text` from its purely alphabetic and purely digit tokens.

    The text is tokenized with the module-level spaCy pipeline `nlp`. A token
    is kept when its `is_alpha` or `is_digit` flag is set; everything else is
    discarded, and the kept tokens are joined with single spaces.
    """
    # NOTE(review): this filter drops more than punctuation. The recorded
    # outputs show "company's" reduced to "company" and "2+" reduced to "2".
    # Presumably contraction pieces such as "n't" are removed the same way,
    # which would hide negations from the sentiment scorer; confirm.
    kept = []
    for token in nlp(text):
        if token.is_alpha or token.is_digit:
            kept.append(token.text)
    return " ".join(kept)
    
# Clean every review in place of the original text
df['review'] = df['review'].map(remove_punctuation)

### 3: Vader Sentiment

In [11]:
import nltk
# Fetch the `vader_lexicon` NLTK package; per the recorded output, nothing is
# re-downloaded when the package is already up-to-date.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\eugen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One analyzer instance shared by the cells below, which call
# sid.polarity_scores(...) on the review text.
sid = SentimentIntensityAnalyzer()

In [13]:
# Spot-check the analyzer on the second review: show the text, the full score
# dictionary, and the compound value on its own.
sample_review = df.iloc[1, 0]
sample_scores = sid.polarity_scores(sample_review)
print(sample_review)
print(sample_scores)
print(sample_scores['compound'])

The process took about 2 weeks HR did not inform there will be a test The job scope was very generalized Interviewers were not interested in hiring and no friendly vibes at all Really untrained and gave an awkward silence
{'neg': 0.14, 'neu': 0.786, 'pos': 0.074, 'compound': -0.2163}
-0.2163


In [14]:
def _compound_of(text):
    """Return only the 'compound' entry of the analyzer's scores for `text`."""
    return sid.polarity_scores(text)['compound']

# Score every review and store the compound value next to its text
df['compound_score'] = df["review"].map(_compound_of)
df.head()

Unnamed: 0,review,compound_score
0,I applied online I interviewed at ST Engineering,0.0
1,The process took about 2 weeks HR did not info...,-0.2163
2,I was given an interview date and time after s...,-0.7778
3,I applied online I interviewed at ST Engineeri...,0.0
4,Overall HR did good job to arrange the intervi...,-0.7688


#### 3b: Convert score to sentiment label

In [15]:
def convert_score_to_sentiment(score, threshold=0.0):
    """Map a numeric sentiment score to a label.

    Parameters
    ----------
    score : float
        The score to classify (the notebook passes VADER's compound score).
    threshold : float, optional
        Half-width of the neutral band. Scores in ``[-threshold, threshold]``
        (both ends included) are "neutral". The default of 0.0 keeps the
        original behaviour, where only a score of exactly 0 is neutral.
        A small positive value such as 0.05 stops near-zero scores from
        being forced into "positive" or "negative".

    Returns
    -------
    str
        "positive", "neutral" or "negative".

    Raises
    ------
    ValueError
        If ``threshold`` is negative, since the neutral band would be empty
        and the positive/negative cut-offs would overlap.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    if score > threshold:
        return "positive"
    # With threshold == 0 this reduces to the original `score == 0` test.
    if -threshold <= score <= threshold:
        return "neutral"
    return "negative"

In [16]:
# Label each review from its compound score, then show the first ten rows
df['sentiment'] = df['compound_score'].map(convert_score_to_sentiment)
df.head(10)

Unnamed: 0,review,compound_score,sentiment
0,I applied online I interviewed at ST Engineering,0.0,neutral
1,The process took about 2 weeks HR did not info...,-0.2163,negative
2,I was given an interview date and time after s...,-0.7778,negative
3,I applied online I interviewed at ST Engineeri...,0.0,neutral
4,Overall HR did good job to arrange the intervi...,-0.7688,negative
5,I applied online The process took 2 months I i...,0.0,neutral
6,After an online application they review my CV ...,0.2732,positive
7,Casual chat in warm office hiring manager high...,0.8201,positive
8,I applied online I interviewed at ST Engineering,0.0,neutral
9,Applied through company career HR process appl...,0.4215,positive


In [17]:
# Remove the numeric score so only the text and its sentiment label remain.
# `columns=` is used instead of a positional axis (`df.drop(..., 1)`), because
# pandas 2.0 no longer accepts `axis` as a positional argument.
df.drop(columns="compound_score", inplace=True)
df.head()

Unnamed: 0,review,sentiment
0,I applied online I interviewed at ST Engineering,neutral
1,The process took about 2 weeks HR did not info...,negative
2,I was given an interview date and time after s...,negative
3,I applied online I interviewed at ST Engineeri...,neutral
4,Overall HR did good job to arrange the intervi...,negative


In [24]:
# One dict per review, pairing the cleaned text with its sentiment label
docList = [
    {"textContent": review_text, "sentiment": sentiment_label}
    for review_text, sentiment_label in zip(df["review"], df["sentiment"])
]

In [27]:
# Spot-check the first five entries
first_docs = docList[:5]
print(first_docs)

[{'textContent': 'I applied online I interviewed at ST Engineering', 'sentiment': 'neutral'}, {'textContent': 'The process took about 2 weeks HR did not inform there will be a test The job scope was very generalized Interviewers were not interested in hiring and no friendly vibes at all Really untrained and gave an awkward silence', 'sentiment': 'negative'}, {'textContent': 'I was given an interview date and time after submitting resume Interview got postponed and there was no follow up even after emailing HR about it A very bad experience', 'sentiment': 'negative'}, {'textContent': 'I applied online I interviewed at ST Engineering Singapore in March 2020', 'sentiment': 'neutral'}, {'textContent': 'Overall HR did good job to arrange the interview however the interviewer is the worst one I have met most questions asked were not relevant to job itself the interviewer just tried very hard to fail you Very disapotined that why ST has those kind of people in management role', 'sentiment': '