In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from scipy.stats import kruskal
import scikit_posthocs as sp
from scipy.stats import f_oneway
from scipy import stats
import random
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [3]:
# Read in the cleaned data from previous notebook
df = pd.read_csv('../Data/clean_linkedin_job_posting.csv')
df.head()

Unnamed: 0,job_id,company_id,title,description,med_salary,pay_period,formatted_work_type,location,applies,original_listed_time,remote_allowed,views,application_type,expiry,formatted_experience_level,listed_time,sponsored,reposted,skills_present,application_portal
0,3757940104,553718.0,Hearing Care Provider,Overview\n\nHearingLife is a national hearing ...,5250.0,MONTHLY,Full-time,"Little River, SC",5.0,2023-11-04 05:26:40,0.0,9.0,OffsiteApply,2023-12-04 03:53:20,Entry level,2023-11-04 05:26:40,0,0,0,1
1,3757940025,2192142.0,Shipping & Receiving Associate 2nd shift (Beav...,Metalcraft of Mayville\nMetalcraft of Mayville...,73028.0,Not Specified,Full-time,"Beaver Dam, WI",5.0,2023-11-04 02:40:00,0.0,16.0,OffsiteApply,2023-12-04 03:53:20,Not Specified,2023-11-04 02:40:00,0,0,0,1
2,3757938019,474443.0,"Manager, Engineering",\nThe TSUBAKI name is synonymous with excellen...,73028.0,Not Specified,Full-time,"Bessemer, AL",5.0,2023-11-04 02:40:00,0.0,16.0,OffsiteApply,2023-12-04 03:53:20,Not Specified,2023-11-04 02:40:00,0,0,1,1
3,3757938018,18213359.0,Cook,descriptionTitle\n\n Looking for a great oppor...,22.27,HOURLY,Full-time,"Aliso Viejo, CA",5.0,2023-11-04 02:40:00,0.0,1.0,OffsiteApply,2023-12-04 03:53:20,Entry level,2023-11-04 02:40:00,0,0,0,1
4,3757937095,437225.0,Principal Cloud Security Architect (Remote),"Job Summary\nAt iHerb, we are on a mission to ...",240895.0,YEARLY,Full-time,United States,5.0,2023-11-02 20:06:40,1.0,16.0,OffsiteApply,2023-12-04 03:53:20,Mid-Senior level,2023-11-04 05:26:40,0,1,0,1


In [None]:
df.tail()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.select_dtypes(["object"]).columns

In [None]:
df.select_dtypes(["float", "int"]).columns

In [None]:
df['application_type'].value_counts()

In [None]:
df['application_method'] = df['application_type'].map({'OffsiteApply': 0, 'ComplexOnsiteApply': 1, 'SimpleOnsiteApply': 1})

In [None]:
# Check
df['application_method'].value_counts()

In [None]:
df.drop(columns='application_type', inplace=True)

# Sanity Check
df.info()

In [None]:
df['location'].value_counts()

In [None]:
# Create a new column 'is_usa' with binary values
def check_usa(location):
    if 'United States' in str(location):
        return 1
    elif ',' in str(location):
        return 1  # Assume it's in the USA if there's a comma (likely a city-state pair)
    else:
        return 0

df['is_usa'] = df['location'].apply(check_usa)

# Display the DataFrame with the new column
df['is_usa'].value_counts()

In [None]:
plt.figure()
df['is_usa'].value_counts(normalize=True).plot(kind='bar')
plt.title('Distribution of Location'.title())
plt.xlabel("USA vs World")
plt.ylabel("Frequency")
plt.show()

In [None]:
df.drop(columns='location', inplace=True)

# Sanity Check
df.info()

In [None]:
df['original_listed_time'].value_counts()

In [None]:
df['expiry'].value_counts()

In [None]:
df['listed_time'].value_counts()

In [None]:
# Convert the timestamp columns to datetime format
df['original_listed_time'] = pd.to_datetime(df['original_listed_time'])
df['expiry'] = pd.to_datetime(df['expiry'])
df['listed_time'] = pd.to_datetime(df['listed_time'])

In [None]:
# Check
df.info()

In [None]:
# Extract year, month, day, hour, minute, and second into separate columns
df['original_listed_year'] = df['original_listed_time'].dt.year
df['original_listed_month'] = df['original_listed_time'].dt.month
df['original_listed_day'] = df['original_listed_time'].dt.day
df['original_listed_hour'] = df['original_listed_time'].dt.hour
df['original_listed_minute'] = df['original_listed_time'].dt.minute
df['original_listed_second'] = df['original_listed_time'].dt.second

df['expiry_year'] = df['expiry'].dt.year
df['expiry_month'] = df['expiry'].dt.month
df['expiry_day'] = df['expiry'].dt.day
df['expiry_hour'] = df['expiry'].dt.hour
df['expiry_minute'] = df['expiry'].dt.minute
df['expiry_second'] = df['expiry'].dt.second

df['listed_year'] = df['listed_time'].dt.year
df['listed_month'] = df['listed_time'].dt.month
df['listed_day'] = df['listed_time'].dt.day
df['listed_hour'] = df['listed_time'].dt.hour
df['listed_minute'] = df['listed_time'].dt.minute
df['listed_second'] = df['listed_time'].dt.second

In [None]:
df.sample(50)

In [None]:
# Sanity Check
df.info()

In [None]:
# Drop the original_listed_time, expiry, and listed_time columns
df.drop(['original_listed_time', 'expiry', 'listed_time'], axis=1, inplace=True)

# Sanity Check
df.info()

In [None]:
# Creating dummy variables
dummy_variables = pd.get_dummies(df[['pay_period', 'formatted_work_type', 'formatted_experience_level']])

# Concatenating the dummy variables with the original DataFrame
df_dummies = pd.concat([df, dummy_variables], axis=1)

# Optionally, you might want to drop the original columns to avoid redundancy
df_dummies = df_dummies.drop(['pay_period', 'formatted_work_type', 'formatted_experience_level'], axis=1)

In [None]:
df_dummies.info()

## Text Analysis - NLP

In [None]:
# slice out target variable and independant variables
y = df_dummies['views']
X = df_dummies.drop('views', axis = 1)

In [None]:
from sklearn.model_selection import train_test_split

# 30% test size and 70% train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [None]:
# Can use .shape to look at size of training data
X_train.shape

In [None]:
# Slice out title and description train and test reviews
title_X_train = X_train['title']
desc_X_train = X_train['description']

title_X_test = X_test['title']
desc_X_test = X_test['description']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate CountVectorizer
title_bow = CountVectorizer(max_features=500, min_df = 10)
desc_bow = CountVectorizer(max_features=500, min_df = 10)

# fit the model to training set
title_bow.fit(title_X_train)
desc_bow.fit(desc_X_train)

# transform the training set
title_train_transform = title_bow.transform(title_X_train)
desc_train_transform = desc_bow.transform(desc_X_train)

# transform the test set
title_test_transform = title_bow.transform(title_X_test)
desc_test_transform = desc_bow.transform(desc_X_test)

In [None]:
title_train_transform

In [None]:
desc_train_transform

In [None]:
# we can sum up the words in an array and store it in DataFrame()
word_counts = pd.DataFrame(
    {"counts": title_train_transform.toarray().sum(axis=0)},
    index=title_bow.get_feature_names_out()
).sort_values("counts", ascending=False)

#head(20) looks at the top 20 words when ascending=False
word_counts.head(20).plot(kind="bar", figsize=(15, 5), legend=False)

plt.title("Top 20 most frequently occurring words in Job Titles".title())
plt.ylabel("Count")
plt.xticks(rotation=45) # this rotates the xlabels to make them easier to read

sns.despine()
plt.show()

In [None]:
# we can sum up the words in an array and store it in DataFrame()
word_counts = pd.DataFrame(
    {"counts": desc_train_transform.toarray().sum(axis=0)},
    index=desc_bow.get_feature_names_out()
).sort_values("counts", ascending=False)

#head(20) looks at the top 20 words when ascending=False
word_counts.head(20).plot(kind="bar", figsize=(15, 5), legend=False)

plt.title("Top 20 most frequently occurring words in Job Descriptions".title())
plt.ylabel("Count")
plt.xticks(rotation=45) # this rotates the xlabels to make them easier to read

sns.despine()
plt.show()

In [None]:
# instantiate CountVectorizer and add stop words
title_bow = CountVectorizer(stop_words='english', max_features=500, min_df = 10)
desc_bow = CountVectorizer(stop_words='english', max_features=500, min_df = 10)

# fit the model to training set
title_bow.fit(title_X_train)
desc_bow.fit(desc_X_train)

# transform the training set
title_train_transform = title_bow.transform(title_X_train)
desc_train_transform = desc_bow.transform(desc_X_train)

# transform the test set
title_test_transform = title_bow.transform(title_X_test)
desc_test_transform = desc_bow.transform(desc_X_test)

In [None]:
# we can sum up the words in an array and store it in DataFrame()
word_counts = pd.DataFrame(
    {"counts": title_train_transform.toarray().sum(axis=0)},
    index=title_bow.get_feature_names_out()
).sort_values("counts", ascending=False)

#head(20) looks at the top 20 words when ascending=False
word_counts.head(20).plot(kind="bar", figsize=(15, 5), legend=False)

plt.title("Top 20 most frequently occurring words in Job Titles Without Stop-Words".title())
plt.ylabel("Count")
plt.xticks(rotation=45) # this rotates the xlabels to make them easier to read

sns.despine()
plt.show()

In [None]:
# we can sum up the words in an array and store it in DataFrame()
word_counts = pd.DataFrame(
    {"counts": desc_train_transform.toarray().sum(axis=0)},
    index=desc_bow.get_feature_names_out()
).sort_values("counts", ascending=False)

#head(20) looks at the top 20 words when ascending=False
word_counts.head(20).plot(kind="bar", figsize=(15, 5), legend=False)

plt.title("Top 20 most frequently occurring words in Job Descriptions without Stop-Words".title())
plt.ylabel("Count")
plt.xticks(rotation=45) # this rotates the xlabels to make them easier to read

sns.despine()
plt.show()

In [None]:
# Add prefix title_ to the title columns
title_col_name = ['title_' + word for word in title_bow.get_feature_names_out()]

In [None]:
# convert title training sparse matrix into dataframes
# Source(https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sparse.from_spmatrix.html)
title_df = pd.DataFrame.sparse.from_spmatrix(title_train_transform, columns = title_col_name).sparse.to_dense()

In [None]:
# sanity check
print(title_df.shape)

In [None]:
title_df.head()

In [None]:
title_df.head()

In [None]:
# Resetting indexes
#X_train.reset_index(drop=True, inplace=True)
#y_train.reset_index(drop=True, inplace=True)
#title_df.reset_index(drop=True, inplace=True)

# Concatenating DataFrames
#new_X_train = pd.concat([X_train, title_df], axis=1) #axis=1 makes sure it adds by column and not row
#new_X_train.drop(columns = ['title','description'], inplace=True)
#new_X_train.head()

In [None]:
#new_X_train.shape

The number of rows (23,272) and columns are correct (columns = 500 + 49).

In [None]:
# convert pos and neg testing sparse matrix into dataframes
# Source(https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sparse.from_spmatrix.html)
#title_test_df = pd.DataFrame.sparse.from_spmatrix(title_test_transform, columns = title_col_name).sparse.to_dense()

In [None]:
# Resetting indexes
#X_test.reset_index(drop=True, inplace=True)
#y_test.reset_index(drop=True, inplace=True)
#title_test_df.reset_index(drop=True, inplace=True)

# Concatenating DataFrames
#new_X_test = pd.concat([X_test, title_test_df], axis=1) #axis=1 makes sure it adds by column and not row
#new_X_test.drop(columns = ['title','description'], inplace=True)

In [None]:
#from sklearn.tree import DecisionTreeClassifier

In [None]:
#scores = pd.DataFrame()

#max_depth_values = [3, 6, 9, 12]

# loop through the max depth values
#for max_depth in max_depth_values:
    #dt_model = DecisionTreeClassifier(max_depth=max_depth)
    #dt_model.fit(new_X_train, y_train)

    # scoring
    #train_score = dt_model.score(new_X_train, y_train)
    #test_score = dt_model.score(new_X_test, y_test)

    # append results
    #new_row = {'Depth': max_depth, 'Train Score': train_score, 'Test Score': test_score}
    #scores = pd.concat([scores, pd.DataFrame([new_row])], ignore_index=True)

# best parameter
#best_score = scores['Test Score'].max()
#print("Best test scores given by:")
#print(scores[scores['Test Score'] == best_score], "\n\n")

In [None]:
# comparison plot
#plt.figure(figsize=(10,5))
#plt.plot('Depth', 'Train Score', data=scores)
#plt.plot('Depth', 'Test Score', data=scores)
#plt.title('Accuracies as Depth Changes')
#plt.xlabel('Depth')
#plt.xticks(max_depth_values)
#plt.ylabel('Accuracy Score')
#plt.legend()
#plt.show()

In [None]:
#new_X_train.shape

## Random Forest Regressor

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from scipy.stats import randint,uniform

In [None]:
df_dummies

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor())
])

# Defining the hyperparameter grid to search
param_grid = {
    'rf__n_estimators': [50, 100],
    'rf__max_depth': [10, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2]
}

# Defining the scoring metric (Root Mean Squared Error in this case)
scorer = make_scorer(mean_squared_error, squared=False)

# Creating the GridSearchCV object with verbose parameter
grid_search = GridSearchCV(pipeline, param_grid, scoring=scorer, cv=3, verbose=4)

# Fitting the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred_test = best_model.predict(X_test)

In [None]:
# Accessing the best hyperparameters
best_hyperparameters = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparameters)

In [None]:
# Creating the model
model = RandomForestRegressor(n_estimators=50,
                    max_depth=10,
                    min_samples_split=2,
                    min_samples_leaf=1)
# Training the model on the training data
model.fit(new_X_train, y_train)
# Making predictions on the testing set
y_pred_test = model.predict(new_X_test)
# Making predictions on the training set
y_pred_train = model.predict(new_X_train)
# Calculating R-squared
r2 = r2_score(y_train, y_pred_train)
print(f"R-squared (R^2) value: {r2}")
# Calculating R-squared
r2 = r2_score(y_test, y_pred_test)
print(f"R-squared (R^2) value: {r2}")
# Calculating MAE for training set
mae_train = mean_absolute_error(y_train, y_pred_train)
print(f"Mean Absolute Error (MAE) for TRAIN set: {mae_train}")
# Calculating MAE for test set
mae_test =mean_absolute_error(y_test, y_pred_test)
print(f"Mean Absolute Error (MAE) for TEST set: {mae_test}")
# Creating a DataFrame with actual and predicted values for the training set
train_results = pd.DataFrame({'Actual': y_train, 'Predicted': y_pred_train})
# Creating a DataFrame with actual and predicted values for the test set
test_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})