### 1: Import the data

In [1]:
import numpy as np
import pandas as pd

import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
#assume that this data is retrieved from the json post request
df = pd.read_csv("webScrapped.csv", skipinitialspace=True)

In [3]:
df.head()

Unnamed: 0,label,review
0,neutral,I applied online. I interviewed at ST Engineer...
1,negative,The process took about 2 weeks. HR did not inf...
2,negative,I was given an interview date and time after s...
3,neutral,I applied online. I interviewed at ST Engineer...
4,negative,Overall HR did good job to arrange the intervi...


In [4]:
print(len(df))
print(df)

17
        label                                             review
0     neutral  I applied online. I interviewed at ST Engineer...
1    negative  The process took about 2 weeks. HR did not inf...
2    negative  I was given an interview date and time after s...
3     neutral  I applied online. I interviewed at ST Engineer...
4    negative  Overall HR did good job to arrange the intervi...
5     neutral  I applied online. The process took 2+ months. ...
6     neutral  After an online application, they review my CV...
7   positive   Casual chat in warm office, hiring manager hig...
8     neutral  I applied online. I interviewed at ST Engineer...
9   positive   Applied through company's career website- HR p...
10    neutral  The process took 3+ months. I interviewed at S...
11   negative  Was asked to attend first round and second rou...
12    neutral  I applied online. The process took 2 weeks. I ...
13  positive   Hiring process is fast, gotten offer immediate...
14    neutral  I appli

In [5]:
word1 = df.loc[13, 'label']
word2 = df.loc[9, 'label']
print(len(word1))


9


### 2: Clean the data

In [6]:
#Check for NaN values
print(df.isnull().sum())
df.dropna(inplace=True)

label     0
review    0
dtype: int64


In [7]:
print(len(df))

17


In [8]:
#remove white spaces and empty strings
blanks = []

for index, row in df.iterrows():
    label = row['label']
    review = row['review']
    if label.isspace():
        blanks.append(index)
    if type(review) == str:
        if (review.isspace()):
            blanks.append(index)

print(blanks)

df.drop(index=blanks, inplace=True)
print(len(df))

[]
17


In [9]:
#trim the strings    
df = df.applymap(lambda cell: cell.strip() if (type(cell) == str) else cell)

In [10]:
#remove punctuation
def remove_punctuation(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if token.is_alpha or token.is_digit]
    return " ".join(tokens)
    
df['review'] = df['review'].apply(remove_punctuation)

### 3: Encode the data

In [11]:
sentiment_mapping = {
    'positive': 1,
    'neutral': 0,
    'negative': -1,
}

In [12]:
df['label'] = df['label'].map(sentiment_mapping)

In [13]:
df.head()

Unnamed: 0,label,review
0,0,I applied online I interviewed at ST Engineering
1,-1,The process took about 2 weeks HR did not info...
2,-1,I was given an interview date and time after s...
3,0,I applied online I interviewed at ST Engineeri...
4,-1,Overall HR did good job to arrange the intervi...


### 4: Split the data into training and testing sets

In [14]:
from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(11,)
(6,)
(11,)
(6,)


### 5: Create the lemmatizer

In [15]:
from spacy.lang.en import English
#print(type(lemmatizer), lemmatizer)
lemmatizer = English()

In [16]:
def lemmat(text):    
    tokens = lemmatizer(text)
    results = [token.lemma_ for token in tokens]
    return results

In [17]:
#test the lemmat()
print(df.loc[0, 'review'])
lematised_review = lemmat(df.loc[0, 'review'])
print(lematised_review)

I applied online I interviewed at ST Engineering
['I', 'applied', 'online', 'I', 'interviewed', 'at', 'ST', 'Engineering']


### 6: Create the stop words vocab and lemmatize it too

In [18]:
stop_words = nlp.Defaults.stop_words
print(len(stop_words), type(stop_words))

326 <class 'set'>


In [19]:
stop_words = lemmat(str(stop_words))

In [20]:
stop_words = set(stop_words)

In [21]:
print(len(stop_words), type(stop_words))

320 <class 'set'>


### 7: Instantiate the TfIdf vectorizer

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=lemmat, stop_words=stop_words)

### 7b: Test the TfIdf vectorizer

In [23]:
#test the Tfidf
import copy
tfidf_copy = copy.deepcopy(tfidf)
matrix = tfidf_copy.fit_transform(df['review'])
print(matrix.shape, type(matrix))
matrix_dense = matrix.todense()
print(matrix_dense.shape, type(matrix_dense))

(17, 158) <class 'scipy.sparse.csr.csr_matrix'>
(17, 158) <class 'numpy.matrix'>


In [24]:
arr = np.array(matrix_dense)
print(arr.shape, type(arr))

(17, 158) <class 'numpy.ndarray'>


In [25]:
column_names = tfidf_copy.get_feature_names()
print(len(column_names))
print(column_names[0:20])

158
['1', '2', '2019', '2020', '3', 'able', 'agency', 'answer', 'application', 'applied', 'appreciative', 'april', 'arrange', 'arranged', 'ask', 'asked', 'asking', 'attend', 'awkward', 'bad']


In [26]:
df_tfidf = pd.DataFrame(arr, columns=column_names)
print(df_tfidf.shape)
df_tfidf.head()

(17, 158)


Unnamed: 0,1,2,2019,2020,3,able,agency,answer,application,applied,...,unprofessional,untrained,updates,vibes,warm,weeks,willing,wo,work,worst
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.446171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.17245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.262727,0.0,0.262727,0.0,0.187432,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.460076,0.0,0.0,0.0,0.0,0.0,0.279027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209706


In [27]:
df_tfidf.iloc[11 : 12]

Unnamed: 0,1,2,2019,2020,3,able,agency,answer,application,applied,...,unprofessional,untrained,updates,vibes,warm,weeks,willing,wo,work,worst
11,0.0,0.119224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.181637,0.0,0.181637,0.0,0.0,0.129582,0.0,0.0,0.0,0.0


### 7c: Determine cosine similarity of query words against each document

In [28]:
import spacy
nlp = spacy.load('en_core_web_sm')

query_string = "untrained updates"
queries: list = [token.lemma_ for token in nlp(query_string)]

In [29]:
print(queries)
column_names = list(df_tfidf.columns)

['untrained', 'updates']


In [30]:
df.insert(2, "cosine_similarity", 0.0)
df.head()

Unnamed: 0,label,review,cosine_similarity
0,0,I applied online I interviewed at ST Engineering,0.0
1,-1,The process took about 2 weeks HR did not info...,0.0
2,-1,I was given an interview date and time after s...,0.0
3,0,I applied online I interviewed at ST Engineeri...,0.0
4,-1,Overall HR did good job to arrange the intervi...,0.0


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

#refer to https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/ 
#main concept is that to compare cosine similarity, the search string must be represented as a vector, 
#with the same number of columns as the tfidf array
#you are comparing the query vector against each document

query_indexes = [column_names.index(query) for query in queries if query in column_names]
print(query_indexes)
#e.g. [158, 159]

for index, document in df_tfidf.iterrows():    
    query_vector = np.zeros(len(df_tfidf.columns), )
    #print(document.shape, query_vector.shape)
    #e.g. (167,) (167,)  #must be same shape
            
    for query_index in query_indexes:           
                    
        query: str = column_names[query_index]
        doc = nlp(df.loc[index, "review"])      
        doc_texts = [token.text for token in doc]
    
        if (query not in doc_texts):
            query_vector[query_index] = 0
            continue
            
        old_tf: float = doc_texts.count(query) / len(doc_texts)  #e.g. 0.022222222222222223        
        old_tfidf: float = document[query_index] #e.g. 0.23875188851084744        
        idf: float = old_tfidf / old_tf  #e.g. 10.743834982988135
        new_tf: float = queries.count(query) / len(queries)   #e.g. 0.5
        new_tfidf: float = idf * new_tf   #e.g. 5.3719174914940675
        
        query_vector[query_index] = new_tfidf
        
    similarity = cosine_similarity([query_vector], [document.to_numpy()])   
    
    
    
    if (similarity > 0):
        print(similarity.shape)
        df.at[index, "cosine_similarity"] = similarity[0][0]
    
    
    

[149, 150]
(1, 1)
(1, 1)


In [32]:
df.sort_values('cosine_similarity', inplace=True, ascending=False)
df.head(40)

Unnamed: 0,label,review,cosine_similarity
1,-1,The process took about 2 weeks HR did not info...,0.262727
11,-1,Was asked to attend first round and second rou...,0.181637
0,0,I applied online I interviewed at ST Engineering,0.0
9,1,Applied through company career HR process appl...,0.0
15,0,Two interviews first is panel while second is ...,0.0
14,0,I applied through a staffing agency The proces...,0.0
13,1,Hiring process is fast gotten offer immediatel...,0.0
12,0,I applied online The process took 2 weeks I in...,0.0
10,0,The process took 3 months I interviewed at ST ...,0.0
8,0,I applied online I interviewed at ST Engineering,0.0


In [33]:
text_contents = [1, 1, 1, 1]
df2 = pd.DataFrame({'cosine_similarities': np.zeros(len(text_contents)), 'review': text_contents})
df2.head()

Unnamed: 0,cosine_similarities,review
0,0.0,1
1,0.0,1
2,0.0,1
3,0.0,1


### 8: Instantiate the Classifier

In [34]:
from sklearn.svm import LinearSVC
svc = LinearSVC()
print(type(svc))

<class 'sklearn.svm.classes.LinearSVC'>


### 9: Create the Pipeline

In [35]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('tfidf', TfidfVectorizer(tokenizer=lemmat)), ('clf', LinearSVC())])

In [36]:
print(X_train)

8      I applied online I interviewed at ST Engineering
13    Hiring process is fast gotten offer immediatel...
2     I was given an interview date and time after s...
9     Applied through company career HR process appl...
16    I applied online I interviewed at ST Engineeri...
4     Overall HR did good job to arrange the intervi...
7     Casual chat in warm office hiring manager high...
10    The process took 3 months I interviewed at ST ...
12    I applied online The process took 2 weeks I in...
3     I applied online I interviewed at ST Engineeri...
6     After an online application they review my CV ...
Name: review, dtype: object


In [37]:
print(y_train)

8     0
13    1
2    -1
9     1
16    0
4    -1
7     1
10    0
12    0
3     0
6     0
Name: label, dtype: int64


In [38]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function lemmat at 0x000001B4EFB9C4C8>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, i

### 10: Run predictions and evaluate accuracy

In [39]:
predictions = text_clf.predict(X_test)
print(predictions.shape)
print(type(predictions))
print(type(X_test), X_test[0])
print(X_test)
result = text_clf.predict(pd.Series("I applied online I interviewed at ST Engineering"))
print(result[0])
print(float(result), type(result))


(6,)
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'> I applied online I interviewed at ST Engineering
0      I applied online I interviewed at ST Engineering
1     The process took about 2 weeks HR did not info...
5     I applied online The process took 2 months I i...
15    Two interviews first is panel while second is ...
11    Was asked to attend first round and second rou...
14    I applied through a staffing agency The proces...
Name: review, dtype: object
0
0.0 <class 'numpy.ndarray'>


In [40]:
print(predictions, type(predictions))

df_X_test_predictions = pd.DataFrame({
    'predicted sentiment': list(predictions),
    'actual sentiment': list(y_test),
    'review': list(X_test)
})

print("X_test results")
print(type(df_X_test_predictions.iloc[1,0]))

df_X_test_predictions.head()

[ 0  1  0  1 -1  0] <class 'numpy.ndarray'>
X_test results
<class 'numpy.int64'>


Unnamed: 0,predicted sentiment,actual sentiment,review
0,0,0,I applied online I interviewed at ST Engineering
1,1,-1,The process took about 2 weeks HR did not info...
2,0,0,I applied online The process took 2 months I i...
3,1,0,Two interviews first is panel while second is ...
4,-1,-1,Was asked to attend first round and second rou...


In [41]:
# Report the confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)
print(conf_matrix.tolist())

[[1 0 1]
 [0 3 1]
 [0 0 0]]
[[1, 0, 1], [0, 3, 1], [0, 0, 0]]


In [42]:
dict_conf_matrix = dict(np.ndenumerate(conf_matrix))
print(dict_conf_matrix, type(dict_conf_matrix))
list_conf_matrix = list(dict_conf_matrix.values())
print(list_conf_matrix)

{(0, 0): 1, (0, 1): 0, (0, 2): 1, (1, 0): 0, (1, 1): 3, (1, 2): 1, (2, 0): 0, (2, 1): 0, (2, 2): 0} <class 'dict'>
[1, 0, 1, 0, 3, 1, 0, 0, 0]


In [43]:
#get overall accuracy
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

0.6666666666666666


In [44]:
#get classification report
classif_report = classification_report(y_test, predictions)
print(classif_report)

  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

          -1       1.00      0.50      0.67         2
           0       1.00      0.75      0.86         4
           1       0.00      0.00      0.00         0

    accuracy                           0.67         6
   macro avg       0.67      0.42      0.51         6
weighted avg       1.00      0.67      0.79         6

