In [58]:
import time
import nltk
from nltk.corpus import stopwords

#NLP
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.collocations import *
import string, re
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

#Model Data Prep
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

#Machine Learning
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

#Deep learning
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.convolutional import MaxPooling1D, MaxPooling2D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.layers import Flatten
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier #to use Keras in sklearn

#Deep Learning - Mixed inputs
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras.layers import Flatten
from keras.layers import Input
from keras.models import Model
from keras.layers import concatenate
from keras.optimizers import Adam, RMSprop

#Save Model
import pickle

import matplotlib.pyplot as plt

from pprint import pprint
from time import time
import logging

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
# Load the job-postings CSV (one row per posting) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./data/final_only_salary_us_data_jobs.csv")

In [29]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      1520 non-null   int64  
 1   job_title                       1520 non-null   object 
 2   salary                          1520 non-null   object 
 3   company                         1520 non-null   object 
 4   location                        1520 non-null   object 
 5   is_remote                       530 non-null    object 
 6   job_rating                      857 non-null    float64
 7   job_summary                     1520 non-null   object 
 8   post_date                       1520 non-null   object 
 9   extract_date                    1520 non-null   object 
 10  job_url                         1520 non-null   object 
 11  rate_by                         1520 non-null   object 
 12  min                             15

In [37]:
# Bin edges applied to the yearly adjusted salary; pd.cut's default makes each
# interval right-inclusive, i.e. (left, right].
cut_bins = [0, 50000, 70000, 90000, 120000, 150000, 300000, 600000]

# labels=False stores the integer position of the bin rather than an Interval label.
# NOTE(review): a salary outside (0, 600000] would become NaN here; confirm none exist.
df['salary_bins'] = pd.cut(x=df['yearly_adjusted_salary'], bins=cut_bins, labels=False)

In [38]:
# Display the integer bin code assigned to each posting.
df['salary_bins']

0       0
1       5
2       3
3       4
4       2
       ..
1515    3
1516    0
1517    0
1518    1
1519    2
Name: salary_bins, Length: 1520, dtype: int64

In [39]:
def clean_word(job_descriptions):
    """Tokenize job descriptions into lowercase words with stopwords removed.

    Parameters
    ----------
    job_descriptions : iterable of str
        Raw job-description texts.

    Returns
    -------
    pandas.Series
        One list of tokens per input description, in input order.
    """
    # Runs of ASCII letters, optionally followed by a lowercase apostrophe
    # suffix (e.g. "don't"). Tokens are lowercased only after matching.
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"

    # A set gives O(1) membership checks; the filter below runs once per token,
    # so scanning a list for every token would be needlessly slow.
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.update(string.punctuation)
    stopwords_set.update('0123456789')

    jd_data = []
    for jd in job_descriptions:
        jd_tokens = [word.lower() for word in nltk.regexp_tokenize(jd, pattern)]
        jd_data.append([word for word in jd_tokens if word not in stopwords_set])
    return pd.Series(jd_data)

In [41]:
# Features are the combined posting text; the target is the salary bin as a categorical.
# NOTE(review): `text` reuses the name imported from keras.preprocessing in the import cell.
text = df['combined_text']
target_class = df['salary_bins'].astype('category')

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    target_class,
    test_size=0.1,
    random_state=4,
)

In [42]:
# Fit a TF-IDF vectorizer on the training text, then train a linear SGD classifier
# on the resulting sparse matrix. The vectorizer and classifier are separate objects.
tdidf = TfidfVectorizer(sublinear_tf=True, min_df=0.01, max_df=0.5, ngram_range=(1, 3), stop_words='english')

# fit_transform learns the vocabulary and vectorizes X_train in one pass. The
# vectorizer is fitted in place, so `fitted_vectorizer` is the same object as `tdidf`.
tfidf_vectorizer_vectors = tdidf.fit_transform(X_train)
fitted_vectorizer = tdidf

# random_state pins the shuffling SGD applies to the training data, so retraining
# on the same split reproduces the same model.
model = SGDClassifier(alpha=0.0001, max_iter=500, n_jobs=3, random_state=4).fit(tfidf_vectorizer_vectors, y_train)

In [43]:
# Save the fitted vectorizer to disk.
# The context manager closes the file handle even if pickling raises.
with open("./models/fitted_vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(fitted_vectorizer, vectorizer_file)

In [44]:
# Save the machine learning model to disk.
# NOTE(review): the file is a pickle despite its '.h5' extension. The name is kept
# unchanged so anything that already reads this path keeps working.
filename = 'finalized_model_2.h5'

# The context manager closes the file handle even if pickling raises.
with open('./models/' + filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [46]:
# Persist the classifier with joblib, then read it straight back as `clf`.
from joblib import dump, load

joblib_model_path = './models/finalized_model_2.joblib'
dump(model, joblib_model_path)
clf = load(joblib_model_path)

In [49]:
# Sample job posting (raw text) used as a single input document for prediction.
job = """The Simons Foundation is a private foundation established in 1994 in New York City by Jim and Marilyn Simons. With an annual grants and programs budget of $300 million, the foundation’s mission is to advance the frontiers of research in mathematics and the basic sciences.

The foundation pursues its mission through its grant-making division, comprising programs in Mathematics & Physical Sciences, Life Sciences, Education & Outreach and autism research, and through its internal research division, the Flatiron Institute.
POSITION SUMMARY

Spectrum ’s award-winning news team is looking for a data analysis intern for the summer.

Spectrum is a web-based autism news site intended for scientists, although we are also read by

many non-scientists.

This is an excellent opportunity for someone comfortable handling large amounts of data and interested in data analysis to gain experience in a fast-paced newsroom.

The internship is 18 hours per week, located at our office in Manhattan. This temporary internship is expected to last for 12 weeks. We offer hands-on mentoring in a supportive and fun work environment.

POTENTIAL FUNCTIONS/RESPONSIBILITIES

Sort, filter and analyze a diverse array of datasets
Clean and prepare datasets for analysis
Fact-check and document data analyses to ensure that they're accurate and reproducible
Create clear, engaging data visualizations using chart-making tools
Brainstorm data visualizations to complement news stories and features
Identify useful data for future stories
Generate story ideas from publicly available data
Research, report and write a data-driven story for your portfolio
Perform any other duties or tasks as assigned or required.
MINIMUM QUALIFICATIONS

Education

A degree or coursework in journalism or statistics is desired but not required
Experience and other requirements

Experience filtering, sorting, and aggregating data
Strong attention to detail
Comfortable with learning new technologies or software
Proficient in Microsoft Excel and/or Google Sheets
Knowledge of R, Python, HTML/CSS, JavaScript, and/or SQL are helpful
"""

In [50]:
# Load the fitted vectorizer from disk and classify the sample posting.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('./models/fitted_vectorizer.pickle', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# transform expects an iterable of documents, hence the single-element list.
result = clf.predict(tfidf_vectorizer.transform([job]))

In [53]:
# Human-readable salary range for each predicted bin index.
labels = [
    '0 - 50K',
    '50K - 70K',
    '70K - 90K',
    '90K - 120K',
    '120K - 150K',
    '150K - 300K',
    '300K - 600K',
]

In [54]:
# Map the predicted bin index for the single input posting to its salary-range label.
labels[result[0]]

'50K - 70K'

In [59]:

# Text-classification pipeline: raw text -> n-gram counts -> TF-IDF weights -> linear SGD classifier.
SGD = Pipeline([
    ('vect', CountVectorizer(max_df=0.5, min_df=0.01, ngram_range=(1, 3), stop_words='english')),
    ('tfidf', TfidfTransformer()),
    # random_state pins the shuffling SGD applies to the training data, so refitting
    # on the same split reproduces the same model.
    ('clf', SGDClassifier(alpha=0.0001, max_iter=500, n_jobs=3, random_state=4)),
])

In [60]:
# Fit every pipeline step on the training split.
SGD.fit(X_train, y_train)


Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.5, min_df=0.01, ngram_range=(1, 3),
                                 stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(max_iter=500, n_jobs=3))])