In [25]:
# Import miscellaneous operating system interfaces
import os
# library for regular expression
import re
# Import RegexpTokenizer (比較聰明的抓字)
from nltk.tokenize import RegexpTokenizer
# Import PorterStemmer
from nltk.stem import PorterStemmer
# Extracting from a text a list of n-gram can be easily accomplished with function ngram()
from nltk.util import ngrams
# NLTK provides a built-in function FreqDist to compute this distribution directly from a set of word tokens.
from nltk.probability import *

import nltk
nltk.download('stopwords')
from nltk.util import bigrams
from nltk import BigramCollocationFinder

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eileen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Dataset for me(29286875) copied from moodle
myDataset = '262 76 778 565 6 623 200 742 645 418 520 814 467 680 216 12 762 63 \
276 66 236 722 82 779 626 206 276 649 750 20 167 738 602 6 251 808 \
729 115 80 71 539 38 426 44 556 779 251 760 61 728 251 268 421 750 \
832 807 55 452 683 303 560 555 547 265 649 441 633 108 517 744 137 \
446 182 397 442 8 266 630 857 17 498 32 293 711 413 445 392 606 322 \
422 54 582 657 239 309 382 286 105 336 786 788 712 738 516 29 523 \
51 428 43 47 819 649 598 297 615 208 865 720 122 789 372 860 813 \
219 700 275 4 466 455 277 427 728 71 812 284 646 327 397 262 22 \
450 80 377 26 332 45 325 24 274 153 847 311 168 99 361 362 139 602 \
551 517 111 812 1 364 467 323 398 207 837 716 733 685 498 817 336 \
395 154 265 492 117 389 833 705 703 270 218 435 475 338 513 621 \
208 839 560 370 81 445 89 12 645 336 157 47 91 32 797 837 803 195 \
861 237 31 810 634 211 246 278 788 556 726 684 815 460 596 237 640 \
764 251 304 474 154 384 743 749 593 327 749 515 543 310 833 464 724 \
66 331 649 397 426 383 492'

# Create an empty list
myDataset_ls = []

# Convert string to list
myDataset_ls = myDataset.split(' ')

# Format filenames assign to cv_name_ls
cv_name_ls = ['resume_(' + s + ').txt' for s in myDataset_ls]

In [67]:
# Extract bigrams
def generate_collocations(tokens):
    '''
    Given list of tokens, return collocations.
    '''

    ignored_words = nltk.corpus.stopwords.words('english')
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # Best results with window_size, freq_filter of: (2,1) (2,2) (5,1)
    finder = BigramCollocationFinder.from_words(tokens, window_size = 2)
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)

    colls = finder.nbest(bigram_measures.likelihood_ratio, 20)

    return colls 

In [96]:
def analyze(filename, each_resume, stopword):
    # -------------抓字
    # Remove whitespace and join into a string
    clean_resume = ' '.join(each_resume.split())
    
    # Set regex for each token
    token_regex = r"\w+(?:[-']\w+)?"

    # Find the matched token in clean_string
    match_token = re.findall(token_regex, clean_resume)
    
    # -------------轉成小寫，除了中間的capital word
    # Get capital tokens in the middle of the sentences by using regex
    capital_tokens_regex = r'(?<!\.\s)(\b[A-Z](?:[A-Z]*|[a-z]*)\b|\b[A-Z][a-z]+[A-Z][a-z]+\b)'
    
    # Find all matched tokens in clean_string
    match_capital = re.findall(capital_tokens_regex, clean_resume)
    
    # Filter two lists to get the same values
    vocab = list(set(match_token) & set(match_capital))

    # -------------移除stopword
    # Create an empty list to store stopwords
    stopwords_list = []

    # Open and read the stopword file, then add into stopwords_list
    with open(stopword) as f:
        stopwords_list = f.read().splitlines()
    
    # Create a tokens list without stopwords
    filtered_tokens = [token for token in vocab if token not in stopwords_list]

    # -------------stem 轉換回根源
    stemmer = PorterStemmer()
    # Stem tokens using the Porter stemmer
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    # -------------移除rare 2%
    # Create a function for getting the percentage of each token frequency
    def percentage(each, total):
        return 100 * (each / total)

    # Get distinct tokens remaining filtered from filtered_tokens
    lexical_diversity = set(stemmed_tokens)

    # Create an empty list to store tokens without rare (2%) ones
    often_tokens = []
    
    # Check each distinct token
    for each in lexical_diversity: 
        # Calculate the percentage of each token frequency
        token_ratio = percentage(stemmed_tokens.count(each), len(vocab))
        # if the token frequency(%) > 2%
        if token_ratio > 0.02:
            # Append into often_tokens
            often_tokens.append(each)
    # Sort often_tokens excluden
    sorted_often_tokens = sorted(often_tokens)
    
    # -------------移除三個字母以下的字
    # Create an empty list
    clean_uni_tokens = []

    # Check each token in sorted_often_tokens
    for each in sorted_often_tokens:
        # If the length of each token >= 3
        if len(each) >= 3:
            # Append that token into token_3more_len_list
            clean_uni_tokens.append(each)
            
    # -------------前200個有意義的bigrams
    # Get the bigrams list like [('I', 'am'), ('good', 'day'), ...]
    bigram_ls = generate_collocations(match_token)
    
    # Create an empty list
    split_ls = []
    # Get all values from bigram_ls and store into split_ls
    for bigram in range(len(bigram_ls)):
        for word in range(len(bigram_ls[bigram])):
            split_ls.append(bigram_ls[bigram][word])
    
    # Create an empty list
    best200 = []
    # Make it look like [('I am'), ('good day'), ...]
    for each in range(0, len(split_ls), 2):
        best200.append(split_ls[each]+" "+split_ls[each+1])
    
#     return print(best200)
    
    
    
    # -------------vocab (token_string:integer_index)
    final_tokens = clean_uni_tokens + best200
    
    vocab_str = ''
    for integer_index, token_string in enumerate(final_tokens, 1):
        vocab_str += token_string + ": " + str(integer_index) + "\n"
        
#     # Write into a file
#     f = open('29286875_vocab.txt','a+')
#     f.write("\n-----------\n" + vocab_str + "\n")
#     f.close()    
    
    # 剩這裡了
    # -------------countVec (file_name, token_index:count, token_index:count,...)
    uni_count = []
    bi_count = []
    
    unigram_diversity = set(match_token)
    for uni in unigram_diversity:
        uni_count.append(match_token.count(uni))
        
    
#     bigram_diversity = set(best200)
#     for bi in bigram_diversity:
#         bi_count.append(match_token.count(bi))
        
#     count_tokens = []
    
    return print(uni_count)
    
#     # Write into a file
#     f = open('29286875_countVec.txt','a+')
#     f.write(vocab_str + "\n")
#     f.close() 
    

In [97]:
# Import miscellaneous operating system interfaces
import os

# Create an empty string
per_CV = ''
my_CVs = ''

# Return a list containing the names of the entries in the directory given by path
# os.listdir(path of resumeTxt)
for file in os.listdir("/Users/eileen/Jupyter/5196 data wrangling/Assignments/A1/PDF"):
    # If the file is in cv_name_ls
    if file in cv_name_ls:
        # Open and read that file from the chosen path
        with open(file, 'r') as f:
            # Read line by line
            for line in f.readlines():
                # Add into my_CVs
                per_CV += line
            # Function for analyzing
            analyze(file, per_CV, 'stopwords_en.txt')

# Write into a file (test)
# f = open('29286875_CVs.txt','a+')
# f.write(my_CVs)
# f.close()

[1, 1, 1, 1, 1, 2, 1, 1, 9, 5, 2, 1, 1, 1, 3, 1, 1, 1, 34, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 6, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 5, 2, 2, 2, 2, 1, 3, 4, 2, 3, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 6, 1, 4, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 8, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 3, 1, 1, 5, 2, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 11, 2, 2, 2, 1, 1, 2, 1, 1, 2, 1, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 6, 1, 1, 1, 1, 1, 1, 2, 3, 1, 5]
[1, 1, 1, 5, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 4, 1, 1, 3, 5, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 