In [10]:
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from catboost import CatBoostClassifier, Pool
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
import re
import nltk
import spacy

In [11]:
# Read the two job-posting exports and stack them into one frame.
df_indeed = pd.read_csv("data/indeed_data_scientist.csv")
df_linkedin = pd.read_csv("data/linkedin_data_scientist.csv")
df = (
    pd.concat([df_indeed, df_linkedin], axis=0)
    # Rows sharing title, company and text count as one posting; keep the first.
    .drop_duplicates(subset=["title", "company", "text"], keep="first")
    # Rows without text are removed.
    .dropna(subset=["text"])
    # Renumber rows 0..n-1 after the removals.
    .reset_index(drop=True)
)

In [12]:
# Read the two "_list" exports and stack them into one frame.
df_indeed_list = pd.read_csv("data/indeed_data_scientist_list.csv")
df_linkedin_list = pd.read_csv("data/linkedin_data_scientist_list.csv")
df_list = (
    pd.concat([df_indeed_list, df_linkedin_list], axis=0)
    # Rows sharing title, company and text count as one entry; keep the first.
    .drop_duplicates(subset=["title", "company", "text"], keep="first")
    # Rows without text are removed.
    .dropna(subset=["text"])
    # Renumber rows 0..n-1 after the removals.
    .reset_index(drop=True)
)

In [31]:
# punctuation removal
import string

# Empty frame that receives one column per preprocessing stage in this cell.
data = pd.DataFrame()

# Translation table that deletes every character of string.punctuation.
# It is built once so each call is a single pass instead of a per-character
# Python loop with a linear scan of the punctuation string.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string in which all ASCII punctuation characters are deleted;
        every other character (whitespace, letters, digits, non-ASCII
        symbols) is kept in its original order.
    """
    return text.translate(_PUNCT_TABLE)

# Posting text with punctuation stripped; the function is passed directly
# instead of being wrapped in a lambda.
data["body_text_clean"] = df["text"].apply(remove_punct)

#  tokenization
def tokenize(text):
    """Split ``text`` into word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscore). Everything else acts as a separator.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance. Text that starts or ends
        with a separator does not produce empty-string tokens, and an empty
        string yields an empty list.
    """
    # Raw string: the backslash escape must reach the regex engine untouched.
    return re.findall(r"\w+", text)

# Lowercase each cleaned posting, then split it into tokens.
data["body_text_tokenized"] = data["body_text_clean"].apply(
    lambda cleaned: tokenize(cleaned.lower())
)

# stopword removal
# English stop-word list from NLTK's stopwords corpus; read by
# remove_stopwords() below as a module-level name.
stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(tokenized_list):
    """Drop stop words from a sequence of tokens.

    Args:
        tokenized_list: Iterable of token strings.

    Returns:
        A list of the tokens that are not in the module-level ``stopword``
        collection, in their original order (duplicates are kept).
    """
    # Hash lookups instead of scanning the whole stop-word list for each token.
    blocked = frozenset(stopword)
    return [word for word in tokenized_list if word not in blocked]

# Token lists with stop words filtered out.
data["body_text_nostop"] = data["body_text_tokenized"].apply(remove_stopwords)

# stemming
# Shared Porter stemmer instance; stemming() below calls ps.stem on each token.
ps = nltk.PorterStemmer()

def stemming(tokenized_text):
    """Stem every token with the module-level stemmer ``ps``.

    Args:
        tokenized_text: Iterable of token strings.

    Returns:
        A list with ``ps.stem(token)`` for each token, in input order.
    """
    stems = []
    for token in tokenized_text:
        stems.append(ps.stem(token))
    return stems

# Stemmed version of the stop-word-free token lists.
data["body_text_stemmed"] = data["body_text_nostop"].apply(stemming)

# lemmatize

# Shared WordNet lemmatizer instance; lemmatizing() below calls wn.lemmatize
# on each token.
wn = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    """Lemmatize every token with the module-level lemmatizer ``wn``.

    Args:
        tokenized_text: Iterable of token strings.

    Returns:
        A list with ``wn.lemmatize(token)`` for each token, in input order.
    """
    lemmas = []
    for token in tokenized_text:
        lemmas.append(wn.lemmatize(token))
    return lemmas

# Lemmatized version of the stop-word-free token lists (built from the same
# input column as the stemmed one, not from the stems).
data["body_text_lemmatized"] = data["body_text_nostop"].apply(lemmatizing)



In [133]:
# English stop-word list from NLTK's stopwords corpus; clean_text() reads it
# as a module-level name.
stopwords = nltk.corpus.stopwords.words('english')
def clean_text(text):
    """Normalize raw text into a list of lemmatized, stop-word-free tokens.

    The steps are: delete every character of ``string.punctuation``,
    lowercase the remaining characters, split into runs of regex word
    characters, drop tokens found in the module-level ``stopwords``
    collection, and lemmatize the rest with the module-level ``wn``.

    Args:
        text: The string to clean.

    Returns:
        A list of lemmatized tokens in order of appearance. No empty-string
        tokens are produced, and an empty input yields an empty list.
    """
    lowered = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw-string pattern. Matching word runs directly avoids the empty tokens
    # that splitting on non-word runs produces at the edges of the text.
    tokens = re.findall(r"\w+", lowered)
    # Hash lookups instead of scanning the stop-word list for each token.
    blocked = frozenset(stopwords)
    return [wn.lemmatize(token) for token in tokens if token not in blocked]

# Count vectorizer over 1- to 4-word n-grams, capped at 200 features, using
# the built-in English stop-word list and unicode accent stripping.
cvec = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 4),
    strip_accents="unicode",
    max_features=200,
)

In [12]:
from spacy.matcher import PhraseMatcher

# NOTE(review): `nlp` is not created in this cell; a pipeline must already be
# loaded in the kernel before this runs.
matcher = PhraseMatcher(nlp.vocab)
# Register the phrase "Barack Obama" under the key "OBAMA". The second
# positional argument is None here, presumably the optional match callback of
# the spaCy 2.x signature (TODO confirm against the installed version).
matcher.add("OBAMA", None, nlp("Barack Obama"))
doc = nlp("Barack Obama lifts America one last time in emotional farewell")
# Run the matcher over the document; the result is displayed by the next cell.
matches = matcher(doc)

In [13]:
# Display the matcher result; the recorded output is a list with one tuple of
# three integers (presumably match id, start token, end token - TODO confirm).
matches

[(7732777389095836264, 0, 2)]

In [16]:
import spacy
# Load the XLNet transformer model package. The recorded output of this cell
# is a MemoryError, so the load did not complete in that run.
nlp = spacy.load("en_trf_xlnetbasecased_lg")
# Alternatives left disabled:
# nlp = spacy.load("en_core_web_sm")           # load model package "en_core_web_sm"
# nlp = spacy.load("/path/to/en_core_web_sm")  # load package from a directory
# nlp = spacy.load("en")                       # load model with shortcut link "en"

MemoryError: 

In [2]:
# Run the already-loaded pipeline on a short sentence.
doc = nlp("Larry Page founded Google")
# Pair each named-entity span's text with its label.
[(entity.text, entity.label_) for entity in doc.ents]
# Expected: [('Larry Page', 'PERSON'), ('Google', 'ORG')]

[('Larry Page', 'PERSON'), ('Google', 'ORG')]

In [11]:
from spacy.matcher import PhraseMatcher

# NOTE(review): `nlp` is not created in this cell; a pipeline must already be
# loaded in the kernel before this runs.
matcher = PhraseMatcher(nlp.vocab)

# Register the two-token phrase "Obama lifts" under the key "OBAMA". The second
# positional argument is None here, presumably the optional match callback of
# the spaCy 2.x signature (TODO confirm against the installed version).
matcher.add("OBAMA", None, nlp("Obama lifts"))
doc = nlp("Barack Obama lifts America one last time in emotional farewell")
matches = matcher(doc)
# Recorded output: one tuple ending in (1, 3), i.e. the phrase was found one
# token later than "Barack Obama" was in the earlier demo.
matches

[(7732777389095836264, 1, 3)]

In [4]:
import spacy
import torch
import numpy
from numpy.testing import assert_almost_equal

# Walk-through of the transformer extension attributes on a BERT pipeline.
# Switch torch's default tensor type to CUDA floats only when spaCy reports
# that a GPU is in use.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_trf_bertbaseuncased_lg")
doc = nlp("Here is some text to encode.")
assert doc.tensor.shape == (7, 768)  # Always has one row per token
# The three bare attribute accesses below are not the last expression of the
# cell, so they display nothing; they only show which attributes exist.
doc._.trf_word_pieces_  # String values of the wordpieces
doc._.trf_word_pieces  # Wordpiece IDs (note: *not* spaCy's hash values!)
doc._.trf_alignment  # Alignment between spaCy tokens and wordpieces
# The raw transformer output has one row per wordpiece.
assert len(doc._.trf_last_hidden_state) == len(doc._.trf_word_pieces)
# To avoid losing information, we calculate the doc.tensor attribute such that
# the sum-pooled vectors match (apart from numeric error)
assert_almost_equal(doc.tensor.sum(axis=0), doc._.trf_last_hidden_state.sum(axis=0), decimal=5)
span = doc[2:4]
# Access the tensor from Span elements (especially helpful for sentences)
assert numpy.array_equal(span.tensor, doc.tensor[2:4])
# .vector and .similarity use the transformer outputs
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")
# Compare the first token ("Apple") across sentences.
print(apple1[0].similarity(apple2[0]))  # recorded run printed 0.7342852
print(apple1[0].similarity(apple3[0]))  # recorded run printed 0.4336572

0.7342852
0.4336572


In [1]:
import spacy
# Load the XLNet transformer pipeline and compare two short questions.
nlp = spacy.load("en_trf_xlnetbasecased_lg") 

doc1 = nlp("How do I turn sound on/off?")
doc2 = nlp("How do I obtain a pet?")
# Document-level similarity. The recorded result is about 0.975 even though
# the two questions are about different topics.
doc1.similarity(doc2)

0.9746173329890798

In [1]:
import spacy
# Load the XLNet transformer pipeline into the notebook-wide name `nlp`.
nlp = spacy.load("en_trf_xlnetbasecased_lg")

In [1]:
# Training examples for the text classifier. Each item is a
# (text, annotations) pair whose annotations hold a "cats" dict mapping each
# label to a score. Only one placeholder example ("text1") is defined here.
TRAIN_DATA = [
    ("text1", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
]

import spacy
from spacy.util import minibatch
import random
import torch

# Fine-tune a text-classification head on top of the BERT pipeline.
# Switch torch's default tensor type to CUDA floats only when spaCy reports
# that a GPU is in use.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_trf_bertbaseuncased_lg")
print(nlp.pipe_names) # ["sentencizer", "trf_wordpiecer", "trf_tok2vec"]
# Create the classifier component, give it both labels, and append it to the
# pipeline.
textcat = nlp.create_pipe("trf_textcat", config={"exclusive_classes": True})
for label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(label)
nlp.add_pipe(textcat)

optimizer = nlp.resume_training()
# Ten passes over TRAIN_DATA in batches of up to 8 examples.
for i in range(10):
    # NOTE(review): shuffles the module-level list in place and no random seed
    # is set in this cell, so the order is not reproducible.
    random.shuffle(TRAIN_DATA)
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=8):
        # Split the (text, annotations) pairs into two parallel tuples.
        texts, cats = zip(*batch)
        nlp.update(texts, cats, sgd=optimizer, losses=losses)
    print(i, losses)
# NOTE(review): absolute path at the filesystem root; writing there needs the
# matching permissions - consider a project-relative directory.
nlp.to_disk("/bert-textcat")

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']
0 {'trf_textcat': 0.5}
1 {'trf_textcat': 0.3553078770637512}
2 {'trf_textcat': 0.2451152205467224}
3 {'trf_textcat': 0.05905628576874733}
4 {'trf_textcat': 0.014026578515768051}
5 {'trf_textcat': 0.002765323268249631}
6 {'trf_textcat': 0.0006972913397476077}
7 {'trf_textcat': 0.0001908408012241125}
8 {'trf_textcat': 3.530673711793497e-05}
9 {'trf_textcat': 1.5150550098042004e-05}


In [19]:

# Load the XLNet transformer pipeline.
nlp = spacy.load("en_trf_xlnetbasecased_lg")

# Encode one entry from each frame, selected by row label.
doc1 = nlp(df_list.loc[20, "text"])
doc2 = nlp(df.loc[69, "text"])
# Print both documents with the same blank lines in between as three
# separate print calls would produce.
print(doc1, "\n", doc2, sep="\n")
# Document-level similarity is the displayed cell result.
doc1.similarity(doc2)

Designs and executes innovative and robust data solutions in collaboration with internal stakeholders to support team objectives. Areas of responsibility include data architecture and infrastructure, scheduled data retrievals, data processing, scalability, dealing with various internal and external data under both big data and traditional data platforms.     Conducts preliminary analyses on data involving basic statistical approaches and provides solutions to treat various data deficiencies to support data integrity.     Leads and conducts industry, demographic, and other consumer research to understand new data sources and applications of analytics.     Focuses on innovation and process improvement to promote leading practice and efficiency across the organization.     Builds predictive models and end-to-end automated solutions for insurance cost, customer demand and other more complex problems.     Presents sophisticated statistical and machine learning models to various stakeholders

0.9944838442872223

In [21]:
# Print, for each token of doc1: its text, whether it has a vector, the vector
# norm, and the out-of-vocabulary flag. In the recorded output every token
# shows has_vector=True and is_oov=True.
for token in doc1:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

Designs True 154.30315 True
and True 67.354355 True
executes True 212.7289 True
innovative True 111.707405 True
and True 100.290184 True
robust True 98.7646 True
data True 122.82915 True
solutions True 104.98659 True
in True 141.44861 True
collaboration True 165.60202 True
with True 146.56181 True
internal True 120.14131 True
stakeholders True 123.93559 True
to True 96.71213 True
support True 113.800476 True
team True 113.8621 True
objectives True 100.63677 True
. True 81.189644 True
Areas True 178.37143 True
of True 106.5334 True
responsibility True 78.63831 True
include True 118.340576 True
data True 109.61223 True
architecture True 98.928085 True
and True 61.97479 True
infrastructure True 78.80913 True
, True 56.109295 True
scheduled True 77.667755 True
data True 104.41756 True
retrievals True 143.9374 True
, True 53.475502 True
data True 83.33658 True
processing True 55.359287 True
, True 145.27928 True
scalability True 334.43793 True
, True 124.145874 True
dealing True 104.29879 T

In [17]:
# Display the raw text stored at row label 20.
df_list.loc[20, "text"]

'Designs and executes innovative and robust data solutions in collaboration with internal stakeholders to support team objectives. Areas of responsibility include data architecture and infrastructure, scheduled data retrievals, data processing, scalability, dealing with various internal and external data under both big data and traditional data platforms.     Conducts preliminary analyses on data involving basic statistical approaches and provides solutions to treat various data deficiencies to support data integrity.     Leads and conducts industry, demographic, and other consumer research to understand new data sources and applications of analytics.     Focuses on innovation and process improvement to promote leading practice and efficiency across the organization.     Builds predictive models and end-to-end automated solutions for insurance cost, customer demand and other more complex problems.     Presents sophisticated statistical and machine learning models to various stakeholder