# Final project code

In [117]:
import json
import numpy as np
import re
import copy
import gensim
import nltk
import string 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, accuracy_score
import gensim.downloader as api

In [45]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [92]:
with open('training_jsons/apple.json', 'r') as f:
    apple_data = json.load(f)
with open('training_jsons/google.json', 'r') as f:
    google_data = json.load(f)
with open('training_jsons/microsoft.json', 'r') as f:
    microsoft_data = json.load(f)

In [93]:
data = apple_data + google_data + microsoft_data

In [94]:
print(data[232]) #Remove \r\n(-: 

{'offer_status': 'No Offer', 'experience': 'Negative Experience', 'difficulty': 'Easy Interview', 'review': '10 minute speed run, very abrupt with unengaged interviewers. Asked behavioral questions and about previous projects. Describe a previous coding project, interests, Why Apple? \n\nDid not find this format engaging, thought I do like the efficiency of the structure. Interviewers did not seem to care or have enough time to get to know candidates', 'page': 120}


In [95]:
#Types of offers
offer = set()
experience = set()
difficulty = set()
for i in data:
    offer.add(i["offer_status"])
    experience.add(i["experience"])
    difficulty.add(i["difficulty"])
print(offer)
print(experience)
print(difficulty)

{'Declined Offer', 'No Offer', 'Accepted Offer'}
{'Neutral Experience', 'Positive Experience', 'Negative Experience'}
{'Average Interview', 'Easy Interview', 'Difficult Interview'}


In [28]:
y = []
X = []
#0 represents no offer
#1 represents declined offer or accepted offer
#Decide if we should predict offer or the experience?
#Preprocessing and creating y_matrix

#Future step: Tokenize to remove stop words to only include stems of words?



for i in range(len(data)):
    #Remove \w \s \n
    data[i]["review"] = data[i]["review"].strip().replace('\n', '').replace('\r', '').lower()
    data[i]["review"] = re.sub(r'[^\w\s\n\r]', '', data[i]["review"]).strip()
    #Remove links
    data[i]["review"] = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', data[i]["review"])
    if data[i]["offer_status"] == 'No Offer':
        data[i]["offer_status"] = 0
        y.append(0)
    else:
        data[i]["offer_status"] = 1
        y.append(1)
    del data[i]['offer_status']
print(y[1:10])
print(data[0])

[1, 1, 1, 1, 0, 1, 0, 1, 1]
{'experience': 'Positive Experience', 'difficulty': 'Average Interview', 'review': 'initial phone interview 15mins received an email 2 days after applying inviting me to schedule a time to call with an apple retail recruiter who was based in california applying for job in texas was told at the end of the phone call that i would be advancing to the next step invited to an optional get to know apple web event 30mins which went over some basics about working in apple retail and company culturegroup interview 45mins group interview with manager from the apple store i was applying to and 3 other interviewees 4 interviewees total took place online via their webex platform similar to zoom as of aug 2021 most interviewees were online 105 minutes before the interview was scheduled to begin and an icebreaker question was asked about a minute before the scheduled start time the invite email suggested we use a digital background for our own privacy most felt comfortable

In [96]:
#Split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.33, random_state=42)

In [97]:
embed = api.load('word2vec-google-news-300')

In [98]:
#Define pretrained model
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dinakartalluri/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [99]:
#feature extraction

#Create features for each review
#First interation
#Interview experience (numeric), difficulty (numeric), avg word embedding for review? (use glove embedding), # of words added to each axis(pick valid threshold)(divide by length of text) (future), use pretrained classifiers (next step)
def extract_features(embed, data, include, sid):
    X_features = np.zeros((len(data), 4))
    if not include:
        X_features = np.zeros((len(data), 3))
    for i, d in enumerate(data):
        feature_1 = 0
        feature_2 = 0

        if d["experience"] == "Positive Experience":
            feature_1 = 1
        elif d["experience"] == "Negative Experience":
            feature_1 = -1

        if d["difficulty"] == "Easy Interview":
            feature_2 = 1
        elif d["difficulty"] == "Difficult Interview":
            feature_2 = -1
        
        tokenized = word_tokenize(d["review"])
        stopwords_english = stopwords.words('english')
        
        feature_3 = 0
        number_embed = 0
        for token in tokenized:
            if token not in stopwords_english and token not in string.punctuation and token[0].isdigit() == False:
                if token in embed:
                    feature_3+=embed[token]
                    number_embed+=1
        feature_3/=number_embed
        feature_3 = np.array(feature_3).mean()
        
        out = sid.polarity_scores(d["review"])
        feature_4 = out['compound']
        
        X_features[i] = np.array([feature_1, feature_2, feature_3, feature_4])
    return X_features
            

In [114]:
#Get the features

X_train_features_with_i = extract_features(embed, X_train, True, sid)
X_test_features_with_i = extract_features(embed, X_test, True, sid)
X_train_features_with_i[0]


array([1.000000e+00, 0.000000e+00, 5.524726e-04, 2.732000e-01])

In [113]:
#fit LR, tuning
params={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
lr=LogisticRegression(solver='liblinear')
lr_cv=GridSearchCV(lr,params,cv=10)
lr_cv.fit(X_train_features_with_i,y_train)
print("accuracy :",lr_cv.best_score_)
print("best parameters ",lr_cv.best_params_)

accuracy : 0.6705748259098674
best parameters  {'C': 0.01, 'penalty': 'l2'}


In [119]:
lr=LogisticRegression(C=0.01,penalty="l2")
lr.fit(X_train_features_with_i,y_train)
y_pred = lr.predict(X_test_features_with_i)
print(f1_score(y_test, y_pred, average='weighted'))
print(f1_score(y_test, y_pred, average='micro'))
print(accuracy_score(y_test, y_pred))

0.6205408684421984
0.6598151062155783
0.6598151062155783
