# Count Vect Modeling for Pros and Cons BIGRAMS

In [1]:
import wrangle 
import nltk
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from scipy.stats import ttest_ind
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # You can choose a different model
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import xgboost

# Baseline

In [2]:
# Load everything from the project-local `wrangle` module. The call returns four
# objects; judging by the names they are the original data plus unigram, bigram
# and trigram count-vectorized variants -- TODO confirm against wrangle.py.
original, uni_count_vect, bi_count_vect, tri_count_vect = wrangle.wrangle_glassdoor()

In [3]:
# This notebook models the bigram variant: bi_count_vect unpacks into three
# objects used below as the train / validation / test splits.
train, val, test = bi_count_vect

In [4]:
# Baseline: always predict the most frequent rating class of the training split.
rating_counts = train["binned_rating_int"].value_counts()
most_common = rating_counts.idxmax()
most_common_frequency = rating_counts.max()

# Fraction of training rows that fall in that class.
baseline_accuracy = most_common_frequency / len(train)

print(f"Most Common Class: {most_common}")
print(f"Baseline Accuracy: {baseline_accuracy:.2f}")



Most Common Class: 3
Baseline Accuracy: 0.70


## Decision Tree

In [5]:
# Model inputs: the same five bigrams, once with a "pros_" prefix and once with
# a "cons_" prefix (pros columns first, matching the original column order).
bigram_phrases = ['work life', 'life balance', 'work environment', 'good benefit', 'worklife balance']
feature_cols = [f"{prefix}_{phrase}" for prefix in ("pros", "cons") for phrase in bigram_phrases]
target_col = "binned_rating_int"

# Same feature columns and target for each of the three splits.
X_train, y_train = train[feature_cols], train[target_col]
X_val, y_val = val[feature_cols], val[target_col]
X_test, y_test = test[feature_cols], test[target_col]


In [6]:
# Tune a decision tree with a 5-fold cross-validated grid search on the training split.
tree = DecisionTreeClassifier(random_state=42)
param_grid_tree = dict(
    max_depth=[None, 10, 20, 30],    # candidate depth limits (None = unlimited)
    min_samples_split=[2, 5, 10],    # minimum samples needed to split a node
)
grid_search_tree = GridSearchCV(
    estimator=tree,
    param_grid=param_grid_tree,
    cv=5,
    scoring='accuracy',
)
grid_search_tree.fit(X_train, y_train)

# Winning parameter combination and the estimator refit with it.
best_params_tree = grid_search_tree.best_params_
best_tree = grid_search_tree.best_estimator_

# Predict on both splits; the train/validation gap shows how much the tree overfits.
y_train_pred_tree = best_tree.predict(X_train)
y_val_pred_tree = best_tree.predict(X_val)
train_accuracy_tree = accuracy_score(y_train, y_train_pred_tree)
val_accuracy_tree = accuracy_score(y_val, y_val_pred_tree)

print("Decision Tree:")
print(f"Best Hyperparameters: {best_params_tree}")
print(f"Validation Accuracy: {val_accuracy_tree:.2f}")
print(f"Training Accuracy: {train_accuracy_tree:.2f}")


Decision Tree:
Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 5}
Validation Accuracy: 0.56
Training Accuracy: 0.89


In [7]:
# Pair each input column with the importance score the tuned tree assigned to it.
feature_names = X_train.columns
feature_importance = best_tree.feature_importances_
feature_importance_dict = {
    name: score for name, score in zip(feature_names, feature_importance)
}

# Report one line per feature, in column order.
for feature in feature_importance_dict:
    importance = feature_importance_dict[feature]
    print(f"Feature: {feature}, Importance: {importance:.4f}")

Feature: pros_work life, Importance: 0.1002
Feature: pros_life balance, Importance: 0.0742
Feature: pros_work environment, Importance: 0.1952
Feature: pros_good benefit, Importance: 0.1711
Feature: pros_worklife balance, Importance: 0.1870
Feature: cons_work life, Importance: 0.0360
Feature: cons_life balance, Importance: 0.0532
Feature: cons_work environment, Importance: 0.0733
Feature: cons_good benefit, Importance: 0.0379
Feature: cons_worklife balance, Importance: 0.0717


# Random Forest

In [8]:
# Tune a random forest with a 5-fold cross-validated grid search on the training split.
rf = RandomForestClassifier(random_state=42)
param_grid_rf = dict(
    n_estimators=[50, 100, 200],     # number of trees in the forest
    max_depth=[None, 10, 20, 30],    # depth limit per tree (None = unlimited)
)
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
)
grid_search_rf.fit(X_train, y_train)

# Winning parameter combination and the estimator refit with it.
best_params_rf = grid_search_rf.best_params_
best_rf = grid_search_rf.best_estimator_

# Accuracy on the training split and on the held-out validation split.
y_train_pred_rf = best_rf.predict(X_train)
y_val_pred_rf = best_rf.predict(X_val)
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)

print("Random Forest:")
print(f"Best Hyperparameters: {best_params_rf}")
print(f"Validation Accuracy: {val_accuracy_rf:.2f}")
print(f"Training Accuracy: {train_accuracy_rf:.2f}")



Random Forest:
Best Hyperparameters: {'max_depth': 10, 'n_estimators': 200}
Validation Accuracy: 0.64
Training Accuracy: 0.93


In [9]:
# Pair each input column with the importance score from the tuned random forest.
feature_names = X_train.columns
feature_importance = best_rf.feature_importances_
feature_importance_dict = {
    name: score for name, score in zip(feature_names, feature_importance)
}

# Report one line per feature, in column order.
for feature in feature_importance_dict:
    importance = feature_importance_dict[feature]
    print(f"Feature: {feature}, Importance: {importance:.4f}")

Feature: pros_work life, Importance: 0.1375
Feature: pros_life balance, Importance: 0.1302
Feature: pros_work environment, Importance: 0.1260
Feature: pros_good benefit, Importance: 0.1166
Feature: pros_worklife balance, Importance: 0.1652
Feature: cons_work life, Importance: 0.0690
Feature: cons_life balance, Importance: 0.0674
Feature: cons_work environment, Importance: 0.0845
Feature: cons_good benefit, Importance: 0.0224
Feature: cons_worklife balance, Importance: 0.0812


# KNN

In [10]:
# Tune k-nearest-neighbours with a 5-fold cross-validated grid search on the training split.
knn = KNeighborsClassifier()
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9],            # how many neighbours vote
    weights=['uniform', 'distance'],     # equal votes vs. votes weighted by distance
)
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    cv=5,
    scoring='accuracy',
)
grid_search_knn.fit(X_train, y_train)

# Winning parameter combination and the estimator refit with it.
best_params_knn = grid_search_knn.best_params_
best_knn = grid_search_knn.best_estimator_

# NOTE(review): when 'distance' weighting wins, each training row is predicted
# with itself among its neighbours, so the training accuracy is likely inflated;
# rely on the validation figure.
y_train_pred_knn = best_knn.predict(X_train)
y_val_pred_knn = best_knn.predict(X_val)
train_accuracy_knn = accuracy_score(y_train, y_train_pred_knn)
val_accuracy_knn = accuracy_score(y_val, y_val_pred_knn)

print("K-Nearest Neighbors:")
print(f"Best Hyperparameters: {best_params_knn}")
print(f"Validation Accuracy: {val_accuracy_knn:.2f}")
print(f"Training Accuracy: {train_accuracy_knn:.2f}")


K-Nearest Neighbors:
Best Hyperparameters: {'n_neighbors': 7, 'weights': 'distance'}
Validation Accuracy: 0.66
Training Accuracy: 1.00


# Logistic Regression

In [11]:
# Tune logistic regression with a 5-fold cross-validated grid search on the training split.
logreg = LogisticRegression(random_state=42)
param_grid_logreg = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],                            # regularization parameter
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],   # optimizers to compare
)
grid_search_logreg = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid_logreg,
    cv=5,
    scoring='accuracy',
)
grid_search_logreg.fit(X_train, y_train)

# Winning parameter combination and the estimator refit with it; best_logreg is
# reused later for the final test-set evaluation.
best_params_logreg = grid_search_logreg.best_params_
best_logreg = grid_search_logreg.best_estimator_

# Accuracy on the training split and on the held-out validation split.
y_train_pred_logreg = best_logreg.predict(X_train)
y_val_pred_logreg = best_logreg.predict(X_val)
train_accuracy_logreg = accuracy_score(y_train, y_train_pred_logreg)
val_accuracy_logreg = accuracy_score(y_val, y_val_pred_logreg)

print("Logistic Regression:")
print(f"Best Hyperparameters: {best_params_logreg}")
print(f"Validation Accuracy: {val_accuracy_logreg:.2f}")
print(f"Training Accuracy: {train_accuracy_logreg:.2f}")


Logistic Regression:
Best Hyperparameters: {'C': 0.01, 'solver': 'saga'}
Validation Accuracy: 0.69
Training Accuracy: 0.73


# Naive Bayes

In [12]:
# Gaussian Naive Bayes with default settings -- no hyperparameter search here.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict on both splits, then score each against its true labels.
y_train_pred_gnb = gnb.predict(X_train)
y_val_pred_gnb = gnb.predict(X_val)
train_accuracy_gnb = accuracy_score(y_train, y_train_pred_gnb)
val_accuracy_gnb = accuracy_score(y_val, y_val_pred_gnb)

print("Naive Bayes (Gaussian):")
print(f"Validation Accuracy: {val_accuracy_gnb:.2f}")
print(f"Training Accuracy: {train_accuracy_gnb:.2f}")


Naive Bayes (Gaussian):
Validation Accuracy: 0.68
Training Accuracy: 0.73


# XGBoost

In [13]:
# Recode the class labels 3/4 as 0/1 for XGBoost (recent XGBClassifier versions
# expect labels numbered from 0 -- confirm for the installed version).
# The recoded labels get their own names instead of overwriting y_train / y_val:
# overwriting made this cell non-idempotent (a second run maps the 0/1 values to
# NaN) and silently changed the labels seen by every later cell.
xgb_label_map = {3: 0, 4: 1}
y_train_xgb = y_train.map(xgb_label_map)
y_val_xgb = y_val.map(xgb_label_map)

# Define hyperparameter grid to search for XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'n_estimators': [100, 200, 300]
}

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(random_state=42)

# 5-fold cross-validated grid search over the grid above, scored by accuracy
grid_search_xgb = GridSearchCV(xgb_classifier, param_grid_xgb, cv=5, scoring='accuracy')

# Fit on the training data with the recoded labels
grid_search_xgb.fit(X_train, y_train_xgb)

# Winning parameter combination and the estimator refit with it
best_params_xgb = grid_search_xgb.best_params_
best_xgb = grid_search_xgb.best_estimator_

# Predictions are in the recoded 0/1 space, so compare against the recoded labels
y_val_pred_xgb = best_xgb.predict(X_val)
val_accuracy_xgb = accuracy_score(y_val_xgb, y_val_pred_xgb)

y_train_pred_xgb = best_xgb.predict(X_train)
train_accuracy_xgb = accuracy_score(y_train_xgb, y_train_pred_xgb)

print("XGBoost:")
print(f"Best Hyperparameters: {best_params_xgb}")
print(f"Validation Accuracy: {val_accuracy_xgb:.2f}")
print(f"Training Accuracy: {train_accuracy_xgb:.2f}")


XGBoost:
Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Validation Accuracy: 0.68
Training Accuracy: 0.76


In [14]:
# Pair each input column with the importance score reported by the tuned XGBoost model.
feature_names = X_train.columns
feature_importance = best_xgb.feature_importances_
feature_importance_dict = {
    name: score for name, score in zip(feature_names, feature_importance)
}

# Report one line per feature, in column order.
for feature in feature_importance_dict:
    importance = feature_importance_dict[feature]
    print(f"Feature: {feature}, Importance: {importance:.4f}")

Feature: pros_work life, Importance: 0.1459
Feature: pros_life balance, Importance: 0.0767
Feature: pros_work environment, Importance: 0.0670
Feature: pros_good benefit, Importance: 0.0531
Feature: pros_worklife balance, Importance: 0.2152
Feature: cons_work life, Importance: 0.1245
Feature: cons_life balance, Importance: 0.0970
Feature: cons_work environment, Importance: 0.0829
Feature: cons_good benefit, Importance: 0.0776
Feature: cons_worklife balance, Importance: 0.0600


## Test

In [15]:
# Rebuild the feature matrices and the original (un-recoded) targets from the
# splits so this cell does not depend on whatever earlier cells left in
# y_train / y_val.
feature_cols = ['pros_work life', 'pros_life balance', 'pros_work environment', 'pros_good benefit', 'pros_worklife balance', 'cons_work life', 'cons_life balance', 'cons_work environment', 'cons_good benefit', 'cons_worklife balance']

X_train = train[feature_cols]
y_train = train["binned_rating_int"]

X_val = val[feature_cols]
y_val = val["binned_rating_int"]

X_test = test[feature_cols]
y_test = test["binned_rating_int"]

# Final evaluation uses best_logreg, the estimator taken from the logistic
# regression grid search. The unfitted LogisticRegression(C=0.01, solver="saga")
# previously constructed here was never fit or used, so it has been removed to
# avoid implying that a separate model is being scored.
y_test_pred_logreg = best_logreg.predict(X_test)
test_accuracy_logreg = accuracy_score(y_test, y_test_pred_logreg)

print("Logistic Regression:")
print(f"Testing Accuracy: {test_accuracy_logreg:.2f}")

Logistic Regression:
Testing Accuracy: 0.70
