In [1]:
import psycopg2
import pandas as pd

import re

import numpy as np
import nltk
from nltk.corpus import stopwords
# import nltk.tokenize as nt
# from nltk.tokenize import RegexpTokenizer

from textblob import blob, Blobber, TextBlob, Sentence, Word, WordList, tokenizers, sentiments, taggers, parsers
#from textblob_aptagger import PerceptronTagger

In [2]:
con = psycopg2.connect(dbname="skillsdb",host=""
                ,port="",user="", password="")
curs = con.cursor()

In [3]:
# total number of CVs

pd.read_sql_query('''select count(distinct user_id) from cv''',con)

Unnamed: 0,count
0,5049


In [4]:
data = pd.read_sql_query('''select distinct cv.user_id, cv_section_attribute.name,
cv.value_char, cv.value_timestamp from cv_section_attribute 
left join cv on cv_section_attribute.id=cv.cv_section_attribute_id''',con)

In [5]:
# data['value_char'] = data['value_char'].map(lambda x: x.strip() if pd.notnull(x) else x)

In [6]:
data_valid = data[data['name'].isin(['locale','name','summary','headline',
                                     'degree','school','admit_year','grad_year',
                                     'company',  'title',  'work_location',  
                                     'start_date','end_date', 'description',
                                     'award',
                                     'publication', 
                                     'additional_info', 
                                     'skill'])]

In [9]:
## Tokenization parameters

stopWords = stopwords.words('english')
tokenizer_prefs = {
    'tokenizer' : nltk.tokenize.PunktSentenceTokenizer(),
#     'token_format' : 'stem',
    'spell_correct' : False,
    'np_extract': None,
    'pos_tagger': None,
    'analyzer': None,
    'classifier': None, 
    'clean_html': False
}

In [10]:
def clean_data(**kwargs):
    '''
    Cleans text data by:
    1.  force lowercase
    2.  _ non-ascii chars
    3.  standardize whitespace
    4.  remove digits
    5.  remove control characters
    6.  remove URL patterns
    '''
    df = pd.DataFrame(data_valid)
    
    try:
        df['value_char'] = data_valid['value_char'].dropna().map(lambda x: "".join(i for i in x.strip().lower() if ord(i)<128))
    except UnicodeDecodeError:
        print(UnicodeDecodeError)
        df['value_char'] = data_valid['value_char'].dropna().map(lambda x: x.strip().lower())


    url_pattern = "((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?"

    re_URL = re.compile(url_pattern)
#     re_TAG = re.compile("(<[phl]>)", re.IGNORECASE)
    re_WS = re.compile("/[^\S\n]/")
#     re_DIGIT = re.compile("\d")
    re_CTRL = re.compile("[\x00-\x11\x03-\x1F]+")
    re_HI = re.compile("[\x80-\xFF]+")
    re_NWC = re.compile("[!;<>?{}\/~`#=@#$%^&*()_+]")
    
    df['value_char'] = df['value_char'].map(lambda x: re_HI.sub(' ', x) if type(x) == str else None)
    df['value_char'] = df['value_char'].map(lambda x: re_CTRL.sub(' ', x) if type(x) == str else None)
    df['value_char'] = df['value_char'].map(lambda x: re_URL.sub(' ', x) if type(x) == str else None)
#     data[prefix] = data[prefix].map(lambda x: re_DIGIT.sub(' ', x))
    df['value_char'] = df['value_char'].map(lambda x: re_WS.sub(' ', x) if type(x) == str else None)        
    df['value_char'] = df['value_char'].map(lambda x: re_NWC.sub(' ', x) if type(x) == str else None)
    

    # create a blon using TextBlob
    tokenizer = kwargs['tokenizer']
    pos_tagger = kwargs['pos_tagger']
    analyzer = kwargs['analyzer']
    classifier = kwargs['classifier']
    np_extract = kwargs['np_extract']
    
    df['value_char'] = df['value_char'].map(lambda l: TextBlob(l,
                                          tokenizer=tokenizer,
                                           np_extractor=np_extract,
                                           pos_tagger=pos_tagger,
                                           analyzer=analyzer) if l is not None else None)

    df_sentences = pd.DataFrame(df)
    # tokenize the document into sentences from blob object
    df_sentences['value_char'] = df['value_char'].map(lambda s: s.sentences if s is not None else None)
    
    df_words = pd.DataFrame(df_sentences)
    # tokenize each sentence into words
    df_words['value_char'] = df_sentences['value_char'].dropna().map(lambda l: (w.strip().words for w in l if w is not None and len(w)>1))
    
    
    return df_words

In [11]:
df = clean_data(**tokenizer_prefs)

In [165]:
# data_normalized = normalize_data(data_word_tokenized)

In [665]:
def build_vocab(target):
    vocab = set()
#     if not isinstance(df, type(None)):
    for token in df.value_char:
        if not isinstance(token, type(None)) and type(token) !=float:
            for sentence in token:
                if not isinstance(sentence, type(None)) and type(sentence) !=float:
                    for word in sentence:
#                         if not isinstance(word, type(None)) and word not in stopWords and word != '\'s' and word != '\'d':
                         if not isinstance(word, type(None)) and word != '\'s' and word != '\'d':
                            vocab.add(word)
            
    if target:
        w2i = {w: np.int32(i+2) for i, w in enumerate(vocab)}
        w2i['<s>'], w2i['</s>'] = np.int32(0), np.int32(1)
    else:
        w2i = {w: np.int32(i) for i, w in enumerate(vocab)}

    return w2i

In [666]:
w2i = build_vocab(True)

In [667]:
w2i

{'omdurmanuiversity2011': 2,
 'interviewereffective': 1018,
 ...}

In [166]:
def encode(data_normalized, w2i):
    encoded_sentence = []
    if not isinstance(data_normalized, type(None)):
        for token in data_normalized:
            if not isinstance(token, type(None)):
                for sentence in token:
                    if not isinstance(sentence, type(None)):
                        for w in sentence:
                            try:
                                encoded_sentence.append(w2i[w])
                            except Exception:
                                pass
    return encoded_sentence

In [157]:
encode(data_normalized, w2i)

[34587,
 ...]

In [167]:
def load_data(data_normalized, vocab=None, w2i=None, target=True):
    if vocab is None and w2i is None:
        w2i = build_vocab(data_normalized, target)

    s = []
    data = []
    if not isinstance(data_normalized, type(None)):
        for token in data_normalized:
            if not isinstance(token, type(None)):
                for sentence in token:
                    if not isinstance(sentence, type(None)):
                        for w in sentence:
                            s.append(w)
        if target:
            s = ['<s>'] + s + ['</s>']
        enc = encode(s, w2i)
        data.append(enc)
    i2w = {i: w for w, i in w2i.items()}
    return data, w2i, i2w

In [1]:
load_data(data_normalized,w2i=w2i,target=False)

In [120]:
data_all = []
if not isinstance(data_normalized, type(None)):
    for token in data_normalized:
        if not isinstance(token, type(None)):
            for sentence in token:
                if not isinstance(sentence, type(None)):
                    for s in sentence:
                        data_all.append(s)

In [123]:
# data_normalized = data_word_tokenized.map(lambda l: map(lambda wl: map(lambda w: nltk.stem.PorterStemmer.NLTK_EXTENSIONS(w) if w in wl and not isinstance(w, type(None)) else wl.remove(w), wl), l))

In [2]:
ree = re.compile(r'(\'\w)')
wl = list()

for i in data_normalized:
    for j in i:
        if not isinstance(j, type(None)):
            for k in j:
                for l in k:

In [None]:
ree = re.compile(r'(\'\w)')
wl = list()

for i in normalize(data_word_tokenized,**tokenizer_prefs):
    for j in i:
        if not isinstance(j, type(None)):
            for k in j:
                if not isinstance(k, type(None)):
                    if re.match(ree, k):
                        ree.sub('', k)
                    if len(k.strip().strip('.').strip(',')) > 1:
                        wl.append((k))
            

In [101]:
def removeNoneTypes(lst):
    return [i for i in lst if type(i) is not type(None)]

In [105]:
def normalize(data_word_tokenized, **kwargs):
    tokenizer = kwargs['tokenizer']
    normalizer = kwargs['token_format']
    spelling = kwargs['spell_correct']
    
    data_normalized = data_word_tokenized.map(lambda l: map(lambda wl: removeNoneTypes(wl), l))
    data_normalized = data_normalized.map(lambda l: map(lambda w: w.singularize(), l))
    
    # filter out 'bad' words, normalize good ones
    # w if w not in self.stopWords else wl.remove(w)
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w in stopWords else w, wl), l))
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'s' else w, wl), l))
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'d' else w, wl), l))
 
     # remove tokens with length 1
         wl_coll = list()
         for i in normalize(data_word_tokenized,**tokenizer_prefs):
            for j in i:
                if not isinstance(j, type(None)):
                    for k in j:
                        if not isinstance(k, type(None)):
                            if re.match(ree, i):
                                ree.sub('', i)
                            if len(i.strip().strip('.').strip(',')) > 1:
                                wl.append((i))
                    wl_coll.append(WordList(wl))
            data_normalized[indx] = wl_col
            del tmp

    # stemming
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: nltk.stemmer.stem(w) if w in wl and not isinstance(w, types.NoneType) else wl.remove(w), wl), l))

    data_word_tokenized= tokenize_words('value_char', **tokenizer_prefs)
    
    return data_word_tokenized

In [108]:
stopWords = stopwords.words('english')
tokenizer  =   RegexpTokenizer(pattern=r'\w+')
stemmer    =   nltk.stem.PorterStemmer.NLTK_EXTENSIONS
lemmatize  =   nltk.WordNetLemmatizer()

In [3]:
normalize(data_word_tokenized,**tokenizer_prefs)

In [108]:
ree = re.compile(r'(\'\w)')
wl = list()

for i in normalize(data_word_tokenized,**tokenizer_prefs):
    for j in i:
        if not isinstance(j, type(None)):
            for k in j:
                if not isinstance(k, type(None)):
                    if re.match(ree, k):
                        ree.sub('', k)
                    if len(k.strip().strip('.').strip(',')) > 1:
                        wl.append((k))
            

In [None]:


def normalize_data(**kwargs):
    tokenizer = kwargs['tokenizer']
    normalizer = kwargs['token_format']
    spelling = kwargs['spell_correct']
    aa = cleanestes.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w in stopWords else w, wl), l))
    aa = aa.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'s' else w, wl), l))
    aa = aa.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'d' else w, wl), l))
    # Stemming or lemmatization of tokens    
    if normalizer == 'stem':
        aa = aa.map(lambda l: map(lambda wl: map(lambda w: stemmer.stem(w) if w in wl and not isinstance(w, types.NoneType) else wl.remove(w), wl), l))
    aa = aa.map(lambda l: map(lambda wl: map(Word, wl), l))
    return aa

In [None]:
normalize_data(**tokenizer_prefs)