In [19]:
import pandas as pd

# Load the tab-separated training file and replace every missing cell with an
# empty string in one chained expression, so the later `.split(' ')` on the
# first column never encounters a NaN.
data = pd.read_table('data-input/indeed_ml_dataset/train.tsv').fillna('')

# Display the first five rows as the cell's output.
data.head(n=5)

Unnamed: 0,tags,description
0,licence-needed supervising-job 5-plus-years-ex...,THE COMPANY Employer is a midstream service...
1,2-4-years-experience-needed salary full-time-job,ICR Staffing is now accepting resumes for Indu...
2,part-time-job,This is a great position for the right person....
3,licence-needed,A large multi-specialty health center is expan...
4,5-plus-years-experience-needed full-time-job b...,JOB PURPOSE: The Account Director is respon...


In [20]:
import numpy as np

# Fixed label vocabulary. The position of a tag in this list is the index of
# its column in the label matrix Y built below.
possible_tags = [
    'part-time-job', 'full-time-job', 'hourly-wage', 'salary',
    'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed',
    'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed',
    '5-plus-years-experience-needed', 'supervising-job',
]

# Multi-hot encode each row: the first column of `data` is split on single
# spaces, and Y[i][j] is 1 exactly when possible_tags[j] is among the pieces
# of row i (0 otherwise).
Y = np.array([
    [int(candidate in row_tags) for candidate in possible_tags]
    for row_tags in (cell.split(' ') for cell in data.iloc[:, 0])
])

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import EnglishStemmer

# Snowball stemmer that `stemmed_words` applies to each item the analyzer yields.
stemmer = EnglishStemmer()

# Count vectorizer configured for English stop words and an n-gram range of
# one to two.
cv = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
)

# Capture the analyzer built from the settings above now: further down this
# cell, cv's own `analyzer` parameter is replaced with the stemming wrapper.
analyzer = cv.build_analyzer()

def stemmed_words(doc):
    """Return an iterator of stemmed analyzer outputs for one document.

    `doc` is handed unchanged to the module-level `analyzer`; every item that
    analyzer yields is passed through the module-level `stemmer.stem`.

    NOTE(review): the analyzer is built from a vectorizer configured with
    ngram_range=(1, 2), so the items stemmed here are presumably whole n-gram
    strings as well as single tokens -- confirm that is the intended behaviour.
    """
    return map(stemmer.stem, analyzer(doc))

# Route all analysis through the stemming wrapper from now on.
cv.set_params(analyzer=stemmed_words)

# The second column of the training frame is the text that gets vectorized;
# fitting learns the vocabulary and returns the training count matrix at once.
descriptions = data.iloc[:, 1]
X = cv.fit_transform(raw_documents=descriptions)

In [26]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state is pinned so the split,
# and therefore every metric computed from it, is the same on each
# Restart & Run All instead of changing with every execution.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(3500, 366931) (875, 366931) (3500, 12) (875, 12)


In [29]:
# fit model using training data
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Per-label pipeline: keep the 10,000 features ranked best by the chi2 score
# function, then fit a bagging ensemble of 10 estimators on them.
# random_state is pinned so the fitted ensemble is the same on every run.
model = Pipeline([
    ('chi2', SelectKBest(chi2, k=10000)),
    ('bc', BaggingClassifier(n_estimators=10, random_state=42)),
])

# Wrap the pipeline so it is applied one-vs-rest across the columns of Y.
ovr = OneVsRestClassifier(model)

# The grid is intentionally empty: nothing is tuned here, so best_params_
# prints as {}. The search object is still what later cells use to predict.
parameters = {
}
labeler = GridSearchCV(ovr, parameters, scoring='f1_micro')
labeler.fit(X_train, Y_train)
print(labeler.best_params_)

{}


In [28]:
from sklearn.metrics import precision_recall_fscore_support
# Threshold the held-out probabilities at 0.35 to get boolean label decisions.
# NOTE(review): the final predictions further down use a 0.375 cut-off, so
# these scores may not describe the submitted output -- confirm which value
# is intended.
test_probabilities = labeler.predict_proba(X_test)
Y_pred = test_probabilities >= 0.35

# Score twice: once with the default averaging, once micro-averaged.
per_label_scores = precision_recall_fscore_support(Y_test, Y_pred)
micro_scores = precision_recall_fscore_support(Y_test, Y_pred, average='micro')
print(per_label_scores)
print(micro_scores)

(array([ 0.35294118,  0.46892655,  0.57446809,  0.67114094,  0.61290323,
        0.77884615,  0.5       ,  0.38983051,  0.42307692,  0.49285714,
        0.4527027 ,  0.42346939]), array([ 0.21052632,  0.45355191,  0.66666667,  0.71942446,  0.46341463,
        0.92045455,  0.125     ,  0.45098039,  0.171875  ,  0.64485981,
        0.57758621,  0.58450704]), array([ 0.26373626,  0.46111111,  0.61714286,  0.69444444,  0.52777778,
        0.84375   ,  0.2       ,  0.41818182,  0.24444444,  0.55870445,
        0.50757576,  0.49112426]), array([ 57, 183,  81, 139,  41, 176,  24, 102,  64, 214, 116, 142], dtype=int64))
(0.53033401499659172, 0.58103061986557136, 0.55452601568068427, None)


In [34]:
# Load the unlabeled test file. Missing cells are replaced with empty strings,
# exactly as is done for the training frame, so a row without a description
# reaches the vectorizer as '' rather than NaN.
targets = pd.read_table('data-input/indeed_ml_dataset/test.tsv').fillna('')

# The first column of this file is the text to vectorize; transform (not fit)
# so the vocabulary learned from the training data is reused.
descriptions = targets.iloc[:, 0]
X_targets = cv.transform(descriptions)
print(X_targets.shape)

(2921, 10000)


In [35]:
# Boolean label decisions for the unlabeled rows, using a 0.375 cut-off.
# NOTE(review): the held-out evaluation cell thresholds at 0.35 instead --
# confirm which cut-off is intended so the reported scores match this output.
target_probabilities = labeler.predict_proba(X_targets)
Y_pred2 = target_probabilities >= 0.375
print(Y_pred2.shape)

(2921, 12)


In [36]:
# Build the tag-name array once; it is indexed for every prediction row below.
tag_names = np.array(possible_tags)

# Write one line per prediction row: the names of the predicted tags joined by
# single spaces, under a 'tags' header line. The context manager closes the
# file even if a write fails part-way through.
with open('data-output/tags.tsv', 'w') as output_file:
    output_file.write('tags\n')

    for tags_vector in Y_pred2:
        # nonzero() gives the positions of the True entries of this row.
        tags_list = tag_names[tags_vector.nonzero()].tolist()
        output_file.write(' '.join(tags_list) + '\n')