# Final project code

In [10]:
import json
import numpy as np
import re
import copy
import gensim
import nltk
import string 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, accuracy_score
import gensim.downloader as api

In [11]:
with open('training_jsons/apple.json', 'r') as f:
    apple_data = json.load(f)
with open('training_jsons/google.json', 'r') as f:
    google_data = json.load(f)
with open('training_jsons/microsoft.json', 'r') as f:
    microsoft_data = json.load(f)

In [12]:
#Combine all of the json
data = apple_data + google_data + microsoft_data

In [13]:
print(data[232]) #Example data

{'offer_status': 'No Offer', 'experience': 'Negative Experience', 'difficulty': 'Easy Interview', 'review': '10 minute speed run, very abrupt with unengaged interviewers. Asked behavioral questions and about previous projects. Describe a previous coding project, interests, Why Apple? \n\nDid not find this format engaging, thought I do like the efficiency of the structure. Interviewers did not seem to care or have enough time to get to know candidates', 'page': 120}


In [14]:
#View the three sets
offer = set()
experience = set()
difficulty = set()
for i in data:
    offer.add(i["offer_status"])
    experience.add(i["experience"])
    difficulty.add(i["difficulty"])
print(offer)
print(experience)
print(difficulty)

{'No Offer', 'Declined Offer', 'Accepted Offer'}
{'Negative Experience', 'Positive Experience', 'Neutral Experience'}
{'Easy Interview', 'Average Interview', 'Difficult Interview'}


In [15]:
#Clean Data
y = []
X = []
#0 represents no offer
#1 represents declined offer or accepted offer


for i in range(len(data)):
    #Remove \w \s \n
    data[i]["review"] = data[i]["review"].strip().replace('\n', '').replace('\r', '').lower()
    data[i]["review"] = re.sub(r'[^\w\s\n\r]', '', data[i]["review"]).strip()
    #Remove links
    data[i]["review"] = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', data[i]["review"])
    if data[i]["offer_status"] == 'No Offer':
        data[i]["offer_status"] = 0
        y.append(0)
    else:
        data[i]["offer_status"] = 1
        y.append(1)
    del data[i]['offer_status']
print(y[1:10])
print(data[0])

[1, 1, 1, 1, 0, 1, 0, 1, 1]
{'experience': 'Positive Experience', 'difficulty': 'Average Interview', 'review': 'initial phone interview 15mins received an email 2 days after applying inviting me to schedule a time to call with an apple retail recruiter who was based in california applying for job in texas was told at the end of the phone call that i would be advancing to the next step invited to an optional get to know apple web event 30mins which went over some basics about working in apple retail and company culturegroup interview 45mins group interview with manager from the apple store i was applying to and 3 other interviewees 4 interviewees total took place online via their webex platform similar to zoom as of aug 2021 most interviewees were online 105 minutes before the interview was scheduled to begin and an icebreaker question was asked about a minute before the scheduled start time the invite email suggested we use a digital background for our own privacy most felt comfortable

In [18]:
count0 = 0
count1 = 0
for i in y:
    if i == 1:
        count1+=1
    else:
        count0+=1
print("Number of No Offers:", count0)
print("Percentage of No Offers:", (count0/(count1+count0))*100)

Number of No Offers: 19810
Percentage of No Offers: 64.29730606945797


In [19]:
#Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.33, random_state=42)

In [20]:
embed = api.load('word2vec-google-news-300')

In [21]:
#Define pretrained model
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dinakartalluri/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [22]:
#Create the lexicon feature

def lexicon_feature(embed, tokens, word1, word2, t_p, t_n, pos_words, neg_words):
    pos_pole = np.zeros((300,), dtype=float)
    p_count = 0
    neg_pole = np.zeros((300,), dtype=float)
    n_count = 0
    #Found synonyms on words on thesauras.com
    
    for word in pos_words:
        if word in embed:
            pos_pole+=embed[word]
            p_count+=1
    
    for word in neg_words:
        if word in embed:
            neg_pole+=embed[word]
            n_count+=1
    
    pos_pole/= p_count
    neg_pole/= n_count
            
    length = len(tokens)
    pos_words_count = 0
    neg_words_count = 0
    
    axis = pos_pole - neg_pole
    
    for token in tokens:
        if token in embed:
            embed_word = embed[token]
            cos_similarity = np.dot(embed_word, axis)/(np.linalg.norm(embed_word)*np.linalg.norm(axis))
            if cos_similarity > t_p:
                pos_words_count+=1
            elif cos_similarity < t_n:
                neg_words_count+=1
    pos_words_count/=length
    neg_words_count/=length
    
    return pos_words_count, neg_words_count

In [23]:
#Feature extraction

def extract_features(embed, data, include, sid):
    X_features = np.zeros((len(data), 6))
    if not include:
        X_features = np.zeros((len(data), 3))
    for i, d in enumerate(data):
        feature_1 = 0
        feature_2 = 0

        if d["experience"] == "Positive Experience":
            feature_1 = 1
        elif d["experience"] == "Negative Experience":
            feature_1 = -1

        if d["difficulty"] == "Easy Interview":
            feature_2 = 1
        elif d["difficulty"] == "Difficult Interview":
            feature_2 = -1
        
        tokenized = word_tokenize(d["review"])
        stopwords_english = stopwords.words('english')
        
        feature_3 = 0
        number_embed = 0
        tokens = []
        for token in tokenized:
            if token not in stopwords_english and token not in string.punctuation and token[0].isdigit() == False:
                if token in embed:
                    tokens.append(token)
                    feature_3+=embed[token]
                    number_embed+=1
        feature_3/=number_embed
        feature_3 = np.array(feature_3).mean()
        
        out = sid.polarity_scores(d["review"])
        feature_4 = out['compound']
        
        pos_words = ["accomplishment", "advance", "benefit", "gain", "happiness", "progress", "triumph", "victory", "win"]
        neg_words = ["breakdown", "collapse", "decline", "defeat", "loss", "misstep", "deterioration"]
        pos_count, neg_count = lexicon_feature(embed, tokens, "success", "failure", 0.2, -0.2, pos_words, neg_words)
        #Check the ranges of +-0.4, +-0.3, and +-0.2 to figure out which ones look better (+-0.2 much better)
        if include:
            X_features[i] = np.array([feature_1, feature_2, feature_3, feature_4, pos_count, neg_count])
        else:
            X_features[i] = np.array([feature_1, feature_2, feature_3])
    return X_features
            

In [24]:
#Get the features

X_train_features_with_i = extract_features(embed, X_train, True, sid)
X_test_features_with_i = extract_features(embed, X_test, True, sid)
X_train_features_with_i[100]


array([ 1.        , -1.        , -0.00293171,  0.9979    ,  0.00236407,
        0.00472813])

In [25]:
#fit LR, hyperparam tuning
params={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
lr=LogisticRegression(solver='liblinear')
lr_cv=GridSearchCV(lr,params,cv=10)
lr_cv.fit(X_train_features_with_i,y_train)
print("accuracy :",lr_cv.best_score_)
print("best parameters ",lr_cv.best_params_)

accuracy : 0.6703810274602549
best parameters  {'C': 0.01, 'penalty': 'l2'}


In [27]:
#Feature selection with Logistic Regression
sel_ = SelectFromModel(LogisticRegression(C=lr_cv.best_params_['C'], penalty=lr_cv.best_params_['penalty'], solver='liblinear'))
sel_.fit(X_train_features_with_i, y_train)
sel_.get_support()

array([ True,  True, False,  True, False, False])

#### Without feature selection

In [33]:
lr=LogisticRegression(C=0.01,penalty="l2")
lr.fit(X_train_features_with_i,y_train)
y_pred = lr.predict(X_test_features_with_i)
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='micro'))
print(accuracy_score(y_test, y_pred))

0.6222392423386456
0.6608969315499607
0.6608969315499607


#### With feature selection

In [33]:
lr.fit(X_train_features_with_i[:, [0,1,3]],y_train)
y_pred = lr.predict(X_test_features_with_i[:, [0,1,3]])
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='micro'))
print(accuracy_score(y_test, y_pred))

0.6315096657659055
0.6606018882769473
0.6606018882769473


In [47]:
#FFNN hyperparam tuning
def nn_hyperparameter_tuning(X, y):
    
    hyperparameters = [[(16,8), 0.001, 0.001],[(16,8), 0.01, 0.01], [(32,8), 0.01, 0.001], [(32,8), 0.001, 0.01]]
    best_score = 0
    best_hyperparameter = {'hidden_layer': 0, 'initial_lr': 0, 'reg_strength': 0}
    
    for params in hyperparameters:
        clf = MLPClassifier(hidden_layer_sizes=params[0], learning_rate_init=params[1], alpha=params[2],activation='relu', tol=1e-3, solver = 'adam')
        score = cross_val_score(clf, X, y, cv=5, scoring= 'f1_macro').mean()
        if score > best_score:
            best_score = score
            best_hyperparameter['hidden_layer'] = params[0]
            best_hyperparameter['initial_lr'] = params[1]
            best_hyperparameter['reg_strength'] = params[2]
        print("Score: ", score)
    
    return best_hyperparameter

In [48]:
#Best params
hyperparameter = nn_hyperparameter_tuning(X_train_features_with_i, y_train)
hidden_layers = hyperparameter['hidden_layer']
lr = hyperparameter['initial_lr']
reg = hyperparameter['reg_strength']

Score:  0.5842112481866875
Score:  0.5737561544006925
Score:  0.5716751360468451
Score:  0.5906239668109957


In [49]:
print(hidden_layers, lr, reg)

(32, 8) 0.001 0.01


#### With feature selection (random search)

In [58]:
lst = [[0,1,2,3,4], [0,1,3,4], [3,4,5], [0,1,4,5], [0,1,3], [1,3,5], [0,2,4]]
best_score = 0
best_i = 0
for i in lst:
    clf = MLPClassifier(hidden_layer_sizes=hidden_layers, learning_rate_init=lr, alpha=reg,activation='relu', tol=1e-3, solver = 'adam').fit(X_train_features_with_i[:,i],y_train)
    y_pred_nn = clf.predict(X_test_features_with_i[:,i])
    score = f1_score(y_test, y_pred_nn, average='weighted')
    if score > best_score:
        best_score = score
        best_i = i
print(best_score)
print(best_i)

0.6433281612562637
[0, 1, 2, 3, 4]


#### Without feature selection

In [57]:
clf = MLPClassifier(hidden_layer_sizes=hidden_layers, learning_rate_init=lr, alpha=reg,activation='relu', tol=1e-3, solver = 'adam').fit(X_train_features_with_i,y_train)
y_pred_nn = clf.predict(X_test_features_with_i)
print(f1_score(y_test, y_pred_nn, average='weighted'))
print(f1_score(y_test, y_pred_nn, average='micro'))
print(accuracy_score(y_test, y_pred_nn))

0.6311704310228133
0.6612903225806451
0.6612903225806451


#### Baseline model

In [36]:
X_train_features_baseline = extract_features(embed, X_train, False, sid)
X_test_features_baseline = extract_features(embed, X_test, False, sid)
X_train_features_baseline[100]

array([ 1.        , -1.        , -0.00293171])

In [37]:
lr=LogisticRegression(C=0.01,penalty="l2")
lr.fit(X_train_features_baseline,y_train)
y_pred = lr.predict(X_test_features_baseline)
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='micro'))
print(accuracy_score(y_test, y_pred))

0.5890877960465329
0.6569630212431157
0.6569630212431157
