In [None]:
#importing
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gensim
import nltk
import pickle
import re
import heapq
import spacy

from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix,  classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  
from sklearn import preprocessing


In [None]:
from Portal import *
# pd.set_option('display.width', 400)
# pd.set_option('display.max_columns', 10)


class CareersFuture(Portal):
    """Portal subclass that loads and cleans the MyCareersFuture job-postings CSV."""

    def __init__(self, name):
        super().__init__(name)

    def read_data(self):
        """Read the raw job-postings CSV into a DataFrame, skipping malformed rows."""
        # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
        # pandas 2.0 (superseded by `on_bad_lines='skip'`). It is kept here so the
        # notebook still runs on the pandas version it was written for.
        return pd.read_csv('mycareerfuture2020-09-15.csv', error_bad_lines=False)

    def clean_data(self):
        """Return the job-postings frame with normalised column types.

        - 'Job Description' is cast to str and passed through ``strip_html_tags``
          (not defined in this class; presumably inherited from ``Portal``).
        - 'Industry' is cast to str and converted by ``str_to_literal`` into a list
          of category values (an empty list when the cell cannot be parsed).
        - Experience and salary columns are cast to int; ``astype(int)`` raises if
          any of those columns holds missing or non-numeric values.
        """
        df = self.read_data()

        df['Job Description'] = df['Job Description'].astype(str).apply(self.strip_html_tags)
        df['Industry'] = df['Industry'].astype(str).apply(self.str_to_literal)

        int_columns = [
            'Job Experience Required (years)',
            'Job Monthly Min Sal',
            'Job Monthly Max Sal',
        ]
        for column in int_columns:
            df[column] = df[column].astype(int)

        return df

    def str_to_literal(self, text):
        """Parse ``text`` with the parent's ``str_to_literal`` and return each entry's 'category'.

        Returns an empty list when ``text`` cannot be parsed or an entry has no
        'category' key, so callers can always iterate over or join the result.
        """
        try:
            # The parent implementation turns the string into a Python literal
            # (per the original comment, via ast.literal_eval).
            entries = super().str_to_literal(text)
            return [entry['category'] for entry in entries]
        except Exception:
            # Best-effort parsing: an unparsable cell (e.g. the string 'nan') means
            # "no industries". Returning [] rather than None keeps downstream
            # ','.join(...) calls from failing.
            return []

# Portal instance used by later cells to load and clean the CareersFuture data.
careersfuture = CareersFuture("careersfuture")


In [None]:
# Build the cleaned CareersFuture frame and flatten its list-like columns to plain text.
cf = pd.DataFrame(careersfuture.clean_data())

# 'Industry' holds a list of categories per row, but can be None when the raw cell
# could not be parsed; treat that as "no industries" instead of failing in join().
cf['Industry_unlisted'] = cf['Industry'].apply(lambda x: ','.join(map(str, x or [])))

# 'Skills' is the string form of a list, e.g. "['a', 'b']". Missing values become ''
# so the string operations below do not fail on NaN. regex=False is required because
# "[" and "]" are regex metacharacters.
cf['Skills'] = cf['Skills'].fillna('').astype(str).str.replace("'", "", regex=False)
cf['Skills_unlisted'] = (
    cf['Skills']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)

# Columns used for matching against the sector data in later cells.
cf_model = cf[[
    "Job Title",
    "Job Description",
    "Industry_unlisted",
    "Skills_unlisted",
    "Job Experience Required (years)",
    "Job Monthly Min Sal",
    "Job Monthly Max Sal",
]]


In [None]:
# Sector reference data; later cells read its 'Skill Title', 'Sector' and
# 'Job_Role_Replaced' columns.
sf_sectors = pd.read_csv("sfw_sector.csv")

In [None]:
# Remove every occurrence of the phrase 'head of ' from the job-role titles.
sf_sectors['Job_Role_Replaced'] = sf_sectors['Job_Role_Replaced'].str.replace('head of ', '')

def clear_hierarchy(title):
    """Drop a leading hierarchy word ('assist' or 'head') from a job title.

    Args:
        title: Job title string.

    Returns:
        The title without its first word when that word is exactly 'assist' or
        'head' (the remaining words are re-joined with single spaces); otherwise
        the title unchanged. Empty or whitespace-only titles are returned as-is.
    """
    words = title.split()
    # Nothing to inspect for empty/whitespace-only titles; indexing words[0]
    # would raise IndexError.
    if not words:
        return title
    if words[0] in ('assist', 'head'):
        return " ".join(words[1:])
    return title

# Strip leading hierarchy words ('assist' / 'head') from every job role.
sf_sectors['Job_Role_Replaced'] = sf_sectors['Job_Role_Replaced'].apply(clear_hierarchy)

# Remove the literal text "(specialist)" first, then any remaining "specialist".
# regex=False matters: as a regular expression, '(specialist)' is a capture group
# that matches only the word and would leave the empty parentheses "()" behind.
sf_sectors['Job_Role_Replaced'] = sf_sectors['Job_Role_Replaced'].str.replace('(specialist)', '', regex=False)
sf_sectors['Job_Role_Replaced'] = sf_sectors['Job_Role_Replaced'].str.replace('specialist', '', regex=False)

In [None]:
# Stratified, seeded hold-out split of skill titles (features) and sectors (labels);
# 3% of the rows go to the test side.
X_train_sf, X_test_sf, y_train_sf, y_test_sf = train_test_split(
    sf_sectors['Skill Title'],
    sf_sectors['Sector'],
    test_size=0.03,
    shuffle=True,
    stratify=sf_sectors['Sector'],
    random_state=3,
)

In [None]:
# Text-to-feature pipeline: capped bag-of-words counts followed by TF-IDF weighting.
transformer = Pipeline([
    ('vect', CountVectorizer(max_features=5000, stop_words='english', min_df=3, max_df=0.5)),
    ('tfidf', TfidfTransformer()),
])

# Fit on every skill title and keep the resulting document-term matrix.
X_train_dtm = transformer.fit_transform(sf_sectors['Skill Title'])

In [None]:
# 10-fold stratified splitter with a fixed seed, used by the cross-validation loop.
k_fold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 50)

In [None]:
# Hyper-parameter search space passed to GridSearchCV for the SVC.
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
}

In [None]:
# Grid search (3-fold, inner) wrapped in the 10-fold stratified split (outer):
# for each outer fold, tune on the training part and report on the held-out part.
# NOTE(review): X_train_dtm comes from a transformer fitted on all rows, so the
# held-out folds have already influenced the vocabulary/IDF weights.
model = GridSearchCV(
    SVC(class_weight='balanced', probability=True),
    param_grid,
    refit=True,
    verbose=2,
    n_jobs=-1,
    cv=3,
)

for train_indices, test_indices in k_fold.split(sf_sectors['Skill Title'], sf_sectors['Sector']):
    X_train, X_test = X_train_dtm[train_indices], X_train_dtm[test_indices]
    # split() yields positional indices, so select labels by position (.iloc)
    # rather than by index label.
    y_train = sf_sectors['Sector'].iloc[train_indices]
    y_test = sf_sectors['Sector'].iloc[test_indices]
    model.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(model.best_params_)
    print()

    y_true, y_pred = y_test, model.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()



# y_pred = logreg.predict(X_test_sf)

In [None]:
# Best hyper-parameters from the most recent fit of the grid search (the last fold).
model.best_params_

In [None]:
# Final classifier with hard-coded hyper-parameters; this rebinds `model`, replacing
# the GridSearchCV object from the earlier cell.
# NOTE(review): presumably these values were copied from the grid-search output — confirm.
model = SVC(C = 1, gamma = 1, kernel = 'linear', class_weight='balanced', probability=True)

In [None]:
# Train the final classifier on the full document-term matrix and all sector labels.
model.fit(X_train_dtm, sf_sectors['Sector'])

In [None]:
# Persist the fitted transformer and classifier. `with` guarantees the files are
# flushed and closed even if pickling fails.
with open("tfidf.pickle", "wb") as transformer_file:
    pickle.dump(transformer, transformer_file)
with open("model.sav", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
def get_labels(data, column):
    """Encode ``data[column]`` as integer labels and return the label lookup table.

    Side effect: adds (or overwrites) a 'Sector Label' column on ``data`` holding
    the LabelEncoder code of each row's ``column`` value.

    Args:
        data: DataFrame containing ``column``.
        column: Name of the categorical column to encode.

    Returns:
        DataFrame with one row per distinct ('Sector Label', column) pair, sorted
        by 'Sector Label' and re-indexed from 0, so row position i corresponds to
        encoded label i.
    """
    le = preprocessing.LabelEncoder()
    data['Sector Label'] = le.fit_transform(data[column])
    # Build the lookup from the frame that was passed in, not from a global.
    values = (
        data[['Sector Label', column]]
        .drop_duplicates()
        .sort_values('Sector Label')
        .reset_index(drop=True)
    )
    return values

In [None]:
# Load the persisted vectorizer and classifier, closing the files afterwards.
# Security: pickle.load can execute arbitrary code; only load files you created.
with open("tfidf.pickle", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
# NOTE(review): the classifier is saved as "model.sav" earlier in this notebook but
# loaded from 'industry.sav' here — confirm which file is intended.
with open('industry.sav', 'rb') as model_file:
    model = pickle.load(model_file)
nlp = spacy.load("en_core_web_lg")

In [None]:
def retrieve_job_title(jd, vectorizer, model, data, nlp):
    """Print the three most probable sectors for ``jd`` and the closest job roles.

    Args:
        jd: Input text (a string or a Series of strings), e.g. a posting's skills.
        vectorizer: Fitted transformer exposing ``transform``.
        model: Fitted classifier exposing ``predict_proba``.
        data: DataFrame with 'Sector', 'Skill Title' and 'Job_Role_Replaced'
            columns; ``get_labels`` adds a 'Sector Label' column to it.
        nlp: Callable that turns text into an object with a ``similarity`` method
            (the notebook passes a loaded spaCy pipeline).

    Returns:
        None. Prints the candidate sectors, the input text, and up to three
        (similarity, job role) pairs in descending order.
    """
    jd_series = pd.Series(jd)
    # Use the raw values as the text: str() of a Series would also contain the
    # index and dtype, and pandas truncates long strings in that representation.
    jd_text = " ".join(map(str, jd_series))

    transformed_jd = vectorizer.transform(jd_series)
    probs = model.predict_proba(transformed_jd)
    # Column positions of the three highest class probabilities (ascending order).
    best_n = np.argsort(probs, axis=1)[:, -3:]
    labels = get_labels(data, 'Sector')
    industries = labels.iloc[best_n[0]]['Sector'].to_list()
    print(industries)
    all_jobs = data[data['Sector'].isin(industries)]

    jd_doc = nlp(jd_text)
    print("User Input:")
    print(jd_doc)

    # Similarity between the input and each candidate row's skill title.
    scores = [nlp(str(skill)).similarity(jd_doc) for skill in all_jobs['Skill Title']]

    # nlargest copes with fewer than three candidates instead of failing to unpack.
    for match in heapq.nlargest(3, zip(scores, all_jobs['Job_Role_Replaced'])):
        print(match)

In [None]:
# Inspect one example posting (row position 2000) used in the matching demo.
cf_model.iloc[2000]

In [None]:
# Flattened skills text of the example posting.
cf_model.iloc[2000]['Skills_unlisted']

In [None]:
# Job description of the example posting, for comparison with the suggestions.
cf_model.iloc[2000]['Job Description']

In [None]:
# Match the skills text of the posting at row position 2000 against the sector data.
retrieve_job_title(pd.Series(cf_model.iloc[2000]['Skills_unlisted']), vectorizer, model, sf_sectors, nlp)
# Need to use the skills text as the input.
# The predicted sector is not the same, but the suggested job title appears close,
# judging by the job description.

In [None]:
# Same matching demo for the posting at row position 200.
retrieve_job_title(pd.Series(cf_model.iloc[200]['Skills_unlisted']), vectorizer, model, sf_sectors, nlp)

In [None]:
# Distinct sector names present in the reference data.
sf_sectors['Sector'].unique()