In [1]:
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from catboost import CatBoostClassifier, Pool
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
import re
import nltk
import spacy
nlp = spacy.load("en_trf_xlnetbasecased_lg")

In [24]:
df_indeed_ds = pd.read_csv("indeed_data_scientist.csv")
df_linkedin_ds = pd.read_csv("linkedin_data_scientist.csv")

In [25]:
df = pd.concat([df_indeed_ds, df_linkedin_ds], axis=0)
df.drop_duplicates(subset=["title", "company", "text"], keep="first", inplace=True)
df.dropna(subset=["text"], inplace=True)
df.reset_index(inplace=True, drop=True)

In [4]:
tv = TfidfVectorizer(stop_words='english', max_features=50, ngram_range=(1, 2), token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b', lowercase=False,)

trans = tv.fit_transform(df["text"])

In [5]:
tv = TfidfVectorizer(stop_words='english', max_features=300, ngram_range=(2, 3), lowercase=True)

trans = tv.fit_transform(df["text"])

In [6]:
df_idf = pd.DataFrame(tv.idf_, index=tv.get_feature_names(),columns=["idf_weights"])
 
# sort ascending
df_idf.sort_values(by=['idf_weights'], ascending=False)

Unnamed: 0,idf_weights
thomson reuters,5.354876
globe mail,5.354876
business group,4.815879
supply chain,4.528197
make decisions easier,4.410414
...,...
data scientists,2.164987
data science,2.164987
years experience,2.147494
computer science,1.743958


In [7]:
df_idf[df_idf.index == "python"]

Unnamed: 0,idf_weights


In [8]:
df_idf.sort_values(by=["idf_weights"]).head(35)

Unnamed: 0,idf_weights
machine learning,1.691314
computer science,1.743958
years experience,2.147494
data science,2.164987
data scientists,2.164987
communication skills,2.176822
big data,2.200919
experience working,2.213189
data scientist,2.25093
equal opportunity,2.410437


In [31]:
# punctuation removal
import string
string.punctuation
data = pd.DataFrame()

def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

data["body_text_clean"] = df["text"].apply(lambda x: remove_punct(x))

#  tokenization
def tokenize(text):
    tokens = re.split("\W+", text)
    return tokens

data["body_text_tokenized"] = data["body_text_clean"].apply(lambda x: tokenize(x.lower()))

# stopword removal
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

data["body_text_nostop"] = data["body_text_tokenized"].apply(lambda x: remove_stopwords(x))

# stemming
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

data["body_text_stemmed"] = data["body_text_nostop"].apply(lambda x: stemming(x))

#  lemmatize

wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

data["body_text_lemmatized"] = data["body_text_nostop"].apply(lambda x: lemmatizing(x))



In [32]:
data.head()

Unnamed: 0,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized
0,161 Bay Street 93021 Canada TorontoToronto Ont...,"[161, bay, street, 93021, canada, torontotoron...","[161, bay, street, 93021, canada, torontotoron...","[161, bay, street, 93021, canada, torontotoron...","[161, bay, street, 93021, canada, torontotoron..."
1,Employer Loyalty One Position Senior Data Scie...,"[employer, loyalty, one, position, senior, dat...","[employer, loyalty, one, position, senior, dat...","[employ, loyalti, one, posit, senior, data, sc...","[employer, loyalty, one, position, senior, dat..."
2,The Trust Science Solutions team engages in cl...,"[the, trust, science, solutions, team, engages...","[trust, science, solutions, team, engages, cli...","[trust, scienc, solut, team, engag, client, ac...","[trust, science, solution, team, engages, clie..."
3,Tell us your story Dont go unnoticed Explain w...,"[tell, us, your, story, dont, go, unnoticed, e...","[tell, us, story, dont, go, unnoticed, explain...","[tell, us, stori, dont, go, unnot, explain, yo...","[tell, u, story, dont, go, unnoticed, explain,..."
4,Kitchener 93018 Canada KitchenerKitchener ON ...,"[kitchener, 93018, canada, kitchenerkitchener,...","[kitchener, 93018, canada, kitchenerkitchener,...","[kitchen, 93018, canada, kitchenerkitchen, cap...","[kitchener, 93018, canada, kitchenerkitchener,..."


In [133]:
stopwords = nltk.corpus.stopwords.words('english')
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
#     text = [ps.stem(word) for word in tokens if word not in stopwords]
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

cvec = CountVectorizer(stop_words='english', ngram_range=(1,4), strip_accents="unicode", max_features=200)

In [134]:
cvec.fit(df["text"]) # count vectorizer learns the vocabulary of the corpus
matrix_corpus = cvec.fit_transform(df["text"])
matrix_corpus.todense()
df_cvec = pd.DataFrame(matrix_corpus.todense(),
                   columns=cvec.get_feature_names(),
                   index=df.index)

# df.T.sort_values(', ascending=False).head(10).T

In [135]:
df_cvec.columns

Index(['000', 'ability', 'accommodation', 'advanced', 'ai', 'algorithms',
       'analysis', 'analytical', 'analytics', 'applicants',
       ...
       'understand', 'understanding', 'use', 'using', 'value', 'work',
       'working', 'world', 'years', 'years experience'],
      dtype='object', length=200)

In [49]:
df_cvec.T.sort_values(df_cvec.columns.tolist(), axis=1, ascending=False).T

Unnamed: 0,Unnamed: 1,0,000,03062020,03092020,04,046184,09,09032019,09092019,...,étape,état,étiquet,évolut,évolué,événement,ête,êtesvou,être,œuvr
225,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
377,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
205,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
403,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
import numpy as np

In [136]:
rel = df[["text"]].loc[150].apply(lambda x: clean_text(x))
# rel = rel.apply(lambda x: clean_text(x))

In [137]:
rel_cvec = cvec.transform(rel[0])
rel_cvec.todense()
rel_cvec = pd.DataFrame(rel_cvec.todense(),
                   columns=cvec.get_feature_names())

In [141]:
# rel_cvec
rel_cvec[rel_cvec == 0]

Unnamed: 0,000,ability,accommodation,advanced,ai,algorithms,analysis,analytical,analytics,applicants,...,understand,understanding,use,using,value,work,working,world,years,years experience
0,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
1,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
2,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
3,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
4,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
383,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
384,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0
385,0,0,0,0,0.0,0,0,0,0,0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0


In [12]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", None, nlp("Barack Obama"))
doc = nlp("Barack Obama lifts America one last time in emotional farewell")
matches = matcher(doc)

In [13]:
matches

[(7732777389095836264, 0, 2)]

In [16]:
import spacy
nlp = spacy.load("en_trf_xlnetbasecased_lg")           # load model package "en_core_web_sm"
# nlp = spacy.load("en_core_web_sm") 
# nlp = spacy.load("/path/to/en_core_web_sm")  # load package from a directory
# nlp = spacy.load("en")                       # load model with shortcut link "en"

MemoryError: 

In [2]:
doc = nlp("Larry Page founded Google")
# Text and label of named entity span
[(ent.text, ent.label_) for ent in doc.ents]
# [('Larry Page', 'PERSON'), ('Google', 'ORG')]

[('Larry Page', 'PERSON'), ('Google', 'ORG')]

In [11]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

matcher.add("OBAMA", None, nlp("Obama lifts"))
doc = nlp("Barack Obama lifts America one last time in emotional farewell")
matches = matcher(doc)
matches

[(7732777389095836264, 1, 3)]

In [12]:
import spacy
import torch
import numpy
from numpy.testing import assert_almost_equal

is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp("Here is some text to encode.")
assert doc.tensor.shape == (7, 768)  # Always has one row per token
doc._.trf_word_pieces_  # String values of the wordpieces
doc._.trf_word_pieces  # Wordpiece IDs (note: *not* spaCy's hash values!)
doc._.trf_alignment  # Alignment between spaCy tokens and wordpieces
# The raw transformer output has one row per wordpiece.
assert len(doc._.trf_last_hidden_state) == len(doc._.trf_word_pieces)
# To avoid losing information, we calculate the doc.tensor attribute such that
# the sum-pooled vectors match (apart from numeric error)
assert_almost_equal(doc.tensor.sum(axis=0), doc._.trf_last_hidden_state.sum(axis=0), decimal=5)
span = doc[2:4]
# Access the tensor from Span elements (especially helpful for sentences)
assert numpy.array_equal(span.tensor, doc.tensor[2:4])
# .vector and .similarity use the transformer outputs
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")
print(apple1[0].similarity(apple2[0]))  # 0.73428553
print(apple1[0].similarity(apple3[0])) 

0.73428535
0.43365782


In [1]:
import spacy
nlp = spacy.load("en_trf_xlnetbasecased_lg") 

doc1 = nlp("How do I turn sound on/off?")
doc2 = nlp("How do I obtain a pet?")
doc1.similarity(doc2)

0.9746173329890798

In [14]:
TRAIN_DATA = [
    ("text1", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
]

import spacy
from spacy.util import minibatch
import random
import torch

is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_trf_bertbaseuncased_lg")
print(nlp.pipe_names) # ["sentencizer", "trf_wordpiecer", "trf_tok2vec"]
textcat = nlp.create_pipe("trf_textcat", config={"exclusive_classes": True})
for label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(label)
nlp.add_pipe(textcat)

optimizer = nlp.resume_training()
for i in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=8):
        texts, cats = zip(*batch)
        nlp.update(texts, cats, sgd=optimizer, losses=losses)
    print(i, losses)
nlp.to_disk("/bert-textcat")

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']
0 {'trf_textcat': 0.5}
1 {'trf_textcat': 0.36786872148513794}
2 {'trf_textcat': 0.4450460374355316}
3 {'trf_textcat': 0.09818868339061737}
4 {'trf_textcat': 0.019461100921034813}
5 {'trf_textcat': 0.004254724830389023}
6 {'trf_textcat': 0.0010585092240944505}
7 {'trf_textcat': 0.00028714665677398443}
8 {'trf_textcat': 7.335394184337929e-05}
9 {'trf_textcat': 2.364560350542888e-05}


PermissionError: [Errno 13] Permission denied: '/bert-textcat'