In [1]:
import psycopg2
import pandas as pd

# import types

import re

import nltk
from nltk.corpus import stopwords
import nltk.tokenize as nt
from nltk.tokenize import RegexpTokenizer

from textblob import blob, Blobber, TextBlob, Sentence, Word, WordList, tokenizers, sentiments, taggers, parsers, classifiers
#from textblob_aptagger import PerceptronTagger

In [2]:
con = psycopg2.connect(dbname="skillsdb",host="dw-instance.cbrlhmbtfrqg.eu-west-2.redshift.amazonaws.com"
                ,port="5439",user="masteruser", password="Ehgh1363")
curs = con.cursor()

In [3]:
data = pd.read_sql_query('''select distinct cv.user_id, cv_section_attribute.name,
cv.value_char, cv.value_timestamp from cv_section_attribute 
left join cv on cv_section_attribute.id=cv.cv_section_attribute_id''',con)

In [4]:
# data['value_char'] = data['value_char'].map(lambda x: x.strip() if pd.notnull(x) else x)

In [5]:
data.head()

Unnamed: 0,user_id,name,value_char,value_timestamp
0,204057900000.0,locale,Mitcham,NaT
1,207998000000.0,name,Guest Lecturer,NaT
2,207998000000.0,locale,Crewe,NaT
3,292814700000.0,name,Pencari Kerja,NaT
4,272534600000.0,summary,An ambitious and hardworking individual who is...,NaT


In [6]:
data_valid = data[data['name'].isin(['locale','name','summary','headline',
                                     'degree','school','admit_year','grad_year',
                                     'company',  'title',  'work_location',  
                                     'start_date','end_date', 'description',
                                     'award',
                                     'publication', 
                                     'additional_info', 
                                     'skill'])]
data_valid.head()

Unnamed: 0,user_id,name,value_char,value_timestamp
0,204057900000.0,locale,Mitcham,NaT
1,207998000000.0,name,Guest Lecturer,NaT
2,207998000000.0,locale,Crewe,NaT
3,292814700000.0,name,Pencari Kerja,NaT
4,272534600000.0,summary,An ambitious and hardworking individual who is...,NaT


In [7]:
def clean_text(prefix=None):
    '''
    Cleans text data by:
    1.  force lowercase
    2.  remove non-ascii chars
    3.  standardize whitespace
    4.  remove digits
    5.  remove control characters
    6.  remove URL patterns
    '''
    df = pd.DataFrame()
    
    try:
        df = data_valid[prefix].dropna().map(lambda x: "".join(i for i in x.strip().lower() if ord(i)<128))
    except UnicodeDecodeError:
        print(UnicodeDecodeError)
        df = data_valid[prefix].dropna().map(lambda x: x.strip().lower())

        #     except Exception:
#         print(Exception)
#     finally:
#         data[prefix]= data[prefix].map(lambda x: x.lower())

    url_pattern = "((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?"

    re_URL = re.compile(url_pattern)
#     re_TAG = re.compile("(<[phl]>)", re.IGNORECASE)
    re_WS = re.compile("/[^\S\n]/")
#     re_DIGIT = re.compile("\d")
    re_CTRL = re.compile("[\x00-\x11\x03-\x1F]+")
    re_HI = re.compile("[\x80-\xFF]+")
    re_NWC = re.compile("[!;<>?{}\/~`#=@#$%^&*()_+]")
    
    df = df.map(lambda x: re_HI.sub(' ', x))
    df = df.map(lambda x: re_CTRL.sub(' ', x))
    df = df.map(lambda x: re_URL.sub(' ', x))
#     data[prefix] = data[prefix].map(lambda x: re_DIGIT.sub(' ', x))
    df = df.map(lambda x: re_WS.sub(' ', x))        
    df = df.map(lambda x: re_NWC.sub(' ', x))
    df = df.map(lambda x: x.strip())
    
    return df

In [8]:
clean_text('value_char').head()

0                                              mitcham
1                                       guest lecturer
2                                                crewe
3                                        pencari kerja
4    an ambitious and hardworking individual who is...
Name: value_char, dtype: object

In [9]:
data_clean = pd.DataFrame()
data_clean['value_char'] = clean_text('value_char')
data_clean.head()

Unnamed: 0,value_char
0,mitcham
1,guest lecturer
2,crewe
3,pencari kerja
4,an ambitious and hardworking individual who is...


In [10]:
# data_valid = data_valid.reset_index(drop=True)

In [12]:
## Tokenization

# initialize the tokenizer

tokenizer = nltk.tokenize.PunktSentenceTokenizer()

# tokenize data

data_clean['value_char'] = data_clean['value_char'].map(lambda x: tokenizer.tokenize(x.strip()))
data_clean.head()

Unnamed: 0,value_char
0,[lecturer in project management]
1,[a creative and well-rounded graduate with a f...
2,[part time lecturing and lab demonstrator]
3,"[leicestershire, uk]"
5,[natural products research assistant]


In [100]:
data_clean.value_char[101]

'opening and closing procedures'

In [13]:
# word_tokenizer = nltk.tokenize.word_tokenize()
# data_clean['value_char'] = data_clean['value_char'].map(lambda x: nltk.tokenize.word_tokenize(x))

In [14]:
## Stemming

# initialize the stemmer

stemmer = nltk.stem.PorterStemmer()
# data_clean['value_char'].map(lambda x: (i for i in x))

In [15]:
# lemmatize = nltk.WordNetLemmatizer

In [17]:
cv_headlines = data[data['name'] == 'headline'].reset_index()[['user_id','value_char']]
cv_degrees = data[data['name'] == 'degree'].reset_index()[['user_id','value_char']]
cv_schools = data[data['name'] == 'school'].reset_index()[['user_id','value_char']]
cv_locales = data[data['name'] == 'locale'].reset_index()[['user_id','value_char']]
cv_summaries = data[data['name'] == 'summary'].reset_index()[['user_id','value_char']]

In [11]:
stopWords = stopwords.words('english')
tokenizer_prefs = {
    'tokenizer' : nltk.tokenize.PunktSentenceTokenizer(),
    'token_format' : 'stem',
    'spell_correct' : False,
    'np_extract': None,
    'pos_tagger': None,
    'analyzer': None,
    'classifier': None, 
    'clean_html': False
}

In [12]:
def create_blob(prefix, **kwargs):
    
    tokenizer = kwargs['tokenizer']
    pos_tagger = kwargs['pos_tagger']
    analyzer = kwargs['analyzer']
    classifier = kwargs['classifier']
    np_extract = kwargs['np_extract']
    
    blob = data_clean[prefix].map(lambda l: TextBlob(l,
                                          tokenizer=tokenizer,
                                           np_extractor=np_extract,
                                           pos_tagger=pos_tagger,
                                           analyzer=analyzer,
                                           classifier=classifier))
    return blob

In [13]:
data_blobs = pd.DataFrame()
data_blobs['value_char'] = create_blob('value_char', **tokenizer_prefs)
data_blobs.value_char[0]

TextBlob("mitcham")

In [14]:
def tokenize_sentences(prefix, **kwargs):
    tokenizer = kwargs['tokenizer']
    normalizer = kwargs['token_format']

    # tokenize the document into sentences from blob object
    sentences = data_blobs[prefix].map(lambda s: s.sentences)

    return sentences

In [15]:
data_sentence_tokenized = pd.DataFrame()
data_sentence_tokenized['value_char'] = tokenize_sentences('value_char', **tokenizer_prefs)
data_sentence_tokenized.value_char[0]

[Sentence("mitcham")]

In [16]:
def tokenize_words(prefix, normalize = 'stem', **kwargs):
    tokenizer = kwargs['tokenizer']
    normalizer = kwargs['token_format']

    # tokenize each sentence into words
    # trim token whitespaces
    # eliminate tokens of character length 1
    #words = self.data[prefix].map(lambda w: w.strip().tokens if len(w)>1 else None)

    words = data_sentence_tokenized[prefix].map(lambda l: map(lambda w: w.strip().tokens if len(w)>1 else None, l))

    return words

In [54]:
# data_word_tokenized = pd.DataFrame()
data_word_tokenized = tokenize_words('value_char', **tokenizer_prefs)

In [55]:
def removeNoneTypes(lst):
    return [i for i in lst if type(i) is not type(None)]

In [95]:
def normalize(data_word_tokenized, **kwargs):
    tokenizer = kwargs['tokenizer']
    normalizer = kwargs['token_format']
    spelling = kwargs['spell_correct']
    
    data = self.data[prefix].map(lambda l: map(lambda wl: self.removeNoneTypes(wl), l))
    data_normalized = data_word_tokenized.map(lambda l: map(lambda w: w.singularize(), l))
    
    # filter out 'bad' words, normalize good ones
    # w if w not in self.stopWords else wl.remove(w)
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w in stopWords else w, wl), l))
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'s' else w, wl), l))
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'d' else w, wl), l))
 
     # remove tokens with length 1
#     ree = re.compile(r'(\'\w)')
#     rlen = len(data_normalized)
#     tmp = data_normalized.copy()
# #     tmp.reset_index(drop=True)
#     for indx in tmp.index:
#         wl_coll = list()
#         for lst in tmp[indx]:
#             wl = list()
#             for word in lst:
#                     for i in word:
#                             if re.match(ree, i):
#                                 ree.sub('', i)
#                             if len(i.strip().strip('.').strip(',')) > 1:
#                                 wl.append((i))
#             wl_coll.append(WordList(wl))
#         data_normalized[indx] = wl_coll
#     del tmp

    # stemming
    data_normalized = data_normalized.map(lambda l: map(lambda wl: map(lambda w: nltk.stemmer.stem(w) if w in wl and not isinstance(w, types.NoneType) else wl.remove(w), wl), l))

    data_word_tokenized= tokenize_words('value_char', **tokenizer_prefs)
    
    return data_word_tokenized

In [96]:
stopWords = stopwords.words('english')
tokenizer  =   RegexpTokenizer(pattern=r'\w+')
stemmer    =   nltk.stem.PorterStemmer.NLTK_EXTENSIONS
lemmatize  =   nltk.WordNetLemmatizer()

In [97]:
normalize(data_word_tokenized,**tokenizer_prefs)

0        <map object at 0x7fbaacbec198>
1        <map object at 0x7fbaacbec208>
2        <map object at 0x7fbaacbec278>
3        <map object at 0x7fbaacbec320>
4        <map object at 0x7fbaacbec3c8>
6        <map object at 0x7fbaacbec470>
7        <map object at 0x7fbaacbec518>
8        <map object at 0x7fbaacbec5c0>
9        <map object at 0x7fbaacbec668>
10       <map object at 0x7fbaacbec710>
11       <map object at 0x7fbaacbec7b8>
12       <map object at 0x7fbaacbec860>
13       <map object at 0x7fbaacbec908>
14       <map object at 0x7fbaacbec9b0>
18       <map object at 0x7fbaacbeca58>
19       <map object at 0x7fbaacbecb00>
21       <map object at 0x7fbaacbecba8>
22       <map object at 0x7fbaacbecc50>
23       <map object at 0x7fbaacbeccf8>
24       <map object at 0x7fbaacbecda0>
25       <map object at 0x7fbaacbece48>
29       <map object at 0x7fbaacbecef0>
30       <map object at 0x7fbaacbecf98>
31       <map object at 0x7fbaacbee080>
32       <map object at 0x7fbaacbee128>


In [93]:
ree = re.compile(r'(\'\w)')
wl = list()

for i in normalize(data_word_tokenized,**tokenizer_prefs):
    for j in i:
        if not isinstance(j, type(None)):
            for k in j:
                if not isinstance(k, type(None)):
                    if re.match(ree, k):
                        ree.sub('', k)
                    if len(k.strip().strip('.').strip(',')) > 1:
                        wl.append((k))
            

In [94]:
wl

['mitcham',
 'guest',
 'lecturer',
 'crewe',
 'pencari',
 'kerja',
 'an',
 'ambitious',
 'and',
 'hardworking',
 'individual',
 'who',
 'is',
 'motivated',
 'by',
 'challenge',
 'and',
 'is',
 'passionate',
 'to',
 'be',
 'successful',
 'inventor',
 'on',
 'four',
 'patents',
 'excellent',
 'communicator',
 'strong',
 'planning',
 'organisational',
 'problem',
 'solving',
 'skills',
 'with',
 'the',
 'ability',
 'to',
 'successfully',
 'analyse',
 'and',
 'assimilate',
 'large',
 'amounts',
 'of',
 'complex',
 'and',
 'disparate',
 'information',
 'good',
 'time',
 'management',
 'enjoys',
 'working',
 'under',
 'pressure',
 'and',
 'to',
 'deadlines',
 'either',
 'individually',
 'or',
 'as',
 'part',
 'of',
 'team',
 'conversational',
 'german',
 'part',
 'time',
 'lecturing',
 'and',
 'lab',
 'demonstrator',
 'project',
 'evaluation',
 'and',
 'responsible',
 'innovation',
 'intern',
 'computer',
 'skills-',
 'good',
 'knowledge',
 'of',
 'it',
 'and',
 'confident',
 'in',
 'using',

In [None]:


def normalize_data(**kwargs):
    tokenizer = kwargs['tokenizer']
    normalizer = kwargs['token_format']
    spelling = kwargs['spell_correct']
    
#     data = pd.DataFrame(['asc','asda','asdasdasd'], columns=['value_char'])
    
    # singularize tokens
#     data = data[prefix].map(lambda l: map(lambda w: w.singularize(), l))

    # Spell correct flag
    # REALLY SHOULD NEVER BE USED
#     if spelling:
#         print("Spell Correction Invoked.....")
#         data[prefix] = data[prefix].map(lambda l: map(lambda wl: map(lambda w: w.correct(), wl), l))
#         print(data[prefix].map(lambda l: map(lambda w: type(w), l)))

    # filter out 'bad' words, normalize good ones
    # w if w not in self.stopWords else wl.remove(w)
    aa = cleanestes.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w in stopWords else w, wl), l))
    aa = aa.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'s' else w, wl), l))
    aa = aa.map(lambda l: map(lambda wl: map(lambda w: wl.remove(w) if w == '\'d' else w, wl), l))

    # remove tokens with length 1
#     ree = re.compile(r'(\'\w)')
#     rlen = len(data)
#     tmp = data[prefix].copy()
#     for index in range(0,rlen):
#         wl_coll = list()
#         for lst in tmp[index]:
#             wl = list()
#             for word in lst:
#                 if not isinstance(word, types.NoneType):
#                     if re.match(ree, word):
#                         ree.sub('', word)
#                     if len(word.strip().strip('.').strip(',')) > 1:
#                         wl.append((word))
#             wl_coll.append(WordList(wl))
#         data[index] = wl_coll
#     del tmp

    # remove via regexp c'c pattern

    # Stemming or lemmatization of tokens    
    if normalizer == 'stem':
        aa = aa.map(lambda l: map(lambda wl: map(lambda w: stemmer.stem(w) if w in wl and not isinstance(w, types.NoneType) else wl.remove(w), wl), l))
#     elif normalizer == 'lemma':
#         data[prefix] = data[prefix].map(lambda l: map(lambda wl: map(lambda w: w.lemmatize(), wl), l))
#     elif normalizer == 'None':
#         pass

    aa = aa.map(lambda l: map(lambda wl: map(Word, wl), l))
#     data[prefix] = data[prefix].map(lambda l: map(WordList, l))
    
    return aa

In [None]:
normalize_data(**tokenizer_prefs)