In [None]:
 %run Import_Library.ipynb

In [100]:
def avg_digits_text(data):
    """
    Purpose:
        - Count the number of digit characters (0-9) in each doc in data
        - Compute the average of those counts over the docs that contain
          at least one digit (docs without any digit are left out of the
          average, they do not count as zero)

    Arg:
        data: iterable of text strings

    Return:
        the average number of digits per digit-containing doc, or 0.0 when
        data is empty or no doc contains a digit
    """
    digit_re = re.compile('[0-9]')
    digit_counts = [len(digit_re.findall(text)) for text in data]
    # Keep only the docs that actually contain digits.
    digit_counts = [count for count in digit_counts if count > 0]

    # np.mean([]) would return nan and emit a RuntimeWarning, so the
    # "nothing to average" case is answered explicitly.
    if not digit_counts:
        return 0.0

    return np.mean(digit_counts)


def add_feature(X, feature_to_add):
    """
    Append extra feature values to a feature matrix as new column(s).

    Arg:
     - X: feature matrix with one row per sample (anything accepted by
       scipy.sparse.hstack).

     - feature_to_add: the values to append, one per row of X, for example:
         [31, 130, 66, ..., 147, 62, 82]
       The values are turned into a sparse row and transposed, so a flat
       list becomes a single new column on the right of X.

    Return:
        sparse feature matrix in CSR format with the added feature.
    """
    # csr_matrix() lays the values out as row(s); transposing turns them
    # into column(s) that line up with the rows of X.
    extra_columns = csr_matrix(feature_to_add).T
    stacked = hstack([X, extra_columns], 'csr')
    return stacked

In [101]:
# remove non-alphanumeric characters (spaces are kept) and make lowercase
def remove_non_alphameric_character(text):
    """
    Strip surrounding whitespace, lowercase the text, then delete every
    character that is not an ASCII letter, a digit or a space.
    """
    normalized = text.strip().lower()
    return re.sub('[^a-zA-Z0-9 ]', '', normalized)

'''
# make a test
text = 'remoVe $ £ # characters _'
remove_non_alphameric_character(text)
'''

'remove    characters '

In [102]:
# remove punctuation
def remove_punctuation(text):
    """
    Delete every character listed in string.punctuation from text and
    return the result; all other characters are left untouched.
    """
    # Character class matching any single punctuation character.
    pattern = '[%s]' % re.escape(string.punctuation)
    return re.sub(pattern, '', text)

'''
# make a test
my_string = "Hello!!!, This is ##STechies$$."
punc_re = re.compile('[%s]' % re.escape(string.punctuation))
print("Punctuation characters found:", punc_re.findall(my_string), "in the text:", my_string)
remove_punctuation(my_string)
'''

Punctuation characters found: ['!', '!', '!', ',', '#', '#', '$', '$', '.'] in the text: Hello!!!, This is ##STechies$$.


'Hello This is STechies'

In [103]:
# remove stop words
def remove_stopwords(text):
    """
    Tokenize text and drop every token found in NLTK's English stop-word
    list.

    The comparison is an exact string match, so a token is only removed
    when it is spelled exactly like an entry of the list (lowercase the
    text first if case-insensitive removal is wanted).

    Return:
        the remaining tokens joined with single spaces.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would be scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

'''
# make a test
text = "How many are there stop words in this sentence ? "
remove_stopwords(text)

'''

'How many stop words sentence ?'

In [104]:
def get_wordnet_pos(treebank_tag):
    """
    Translate a part-of-speech tag produced by nltk's tagger into the
    part-of-speech constant understood by the WordNet lemmatizer.

    Tags starting with 'J', 'V', 'N' and 'R' map to adjective, verb, noun
    and adverb respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [108]:
def lemmatizer(text):
    """
    Lemmatize every token of text according to its part of speech.

    The text is tokenized, each token is tagged with nltk.pos_tag, the tag
    is converted with get_wordnet_pos(), and the token is lemmatized with
    the WordNet lemmatizer using that part of speech.

    Return:
        the lemmas joined with single spaces.
    """
    # Named differently from the function so the local does not shadow it.
    wordnet_lemmatizer = WordNetLemmatizer()
    # Tokenize once and tag each token with its part of speech (pos);
    # pos_tag returns (token, tag) pairs, so the tokens need not be
    # recomputed for the lemmatization step.
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    # Get the lemma of each token according to its part-of-speech tag.
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )

'''
# make a test
text = "Where multiple candidates for the LCS exist, that whose shortest path to the root node is the longest will be selected "
print("Before the lemmatization:", text)
print("After the lemmatization:", lemmatizer(text))

'''


Before the lemmatization: Where multiple candidates for the LCS exist, that whose shortest path to the root node is the longest will be selected 
After the lemmatization: Where multiple candidate for the LCS exist , that whose short path to the root node be the long will be select


In [None]:
def preprocessing(text):
    """
    Run the full cleaning pipeline on text and return the result.

    Steps, applied in order: strip/lowercase and drop non-alphanumeric
    characters, remove punctuation, remove stop words, lemmatize.
    """
    pipeline = (
        remove_non_alphameric_character,
        remove_punctuation,
        remove_stopwords,
        lemmatizer,
    )
    for step in pipeline:
        text = step(text)
    return text

# make a test: print one sample sentence before and after preprocessing()
# NOTE(review): this runs (and rebinds the module-level name `text`) every
# time the cell is executed.
text = "Where multiple candidates for the LCS exist, that whose shortest path to the root node is the longest will be selected "
print("Before preprocessing():", text)
print("After preprocessing():", preprocessing(text))

In [112]:
def word_frequency(text):
    '''
    Count how often each token occurs in the given text (case is ignored
    because the text is lowercased before tokenizing).
    Return a dictionary mapping each token to its number of occurrences.
    '''
    counts = {}
    for token in word_tokenize(text.lower()):
        # Missing tokens start from 0, so the first sighting stores 1.
        counts[token] = counts.get(token, 0) + 1
    return counts

'''

# Make a test
text = "In this tutorial, you will learn about Nltk FreqDist function with example. This function is used to find the frequency of words within a text."
word_frequency(text.lower())
'''

'\n\n# Make a test\ntext = "In this tutorial, you will learn about Nltk FreqDist function with example. This function is used to find the frequency of words within a text."\nword_frequency(text.lower())\n'

In [122]:
def top_word_frequency(text, n=10):
    """
    Return the n most frequent tokens of text as a list of
    (token, frequency) pairs, most frequent first.
    """
    counts = word_frequency(text)
    # sorted() orders ascending by default; reverse=True puts the highest
    # frequencies first.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

'''
# Make a test
text = "In this tutorial, you will learn about Nltk FreqDist function with example. This function is used to find the frequency of words within a text."
top_word_frequency(text, 20)
'''

[('this', 2),
 ('function', 2),
 ('.', 2),
 ('in', 1),
 ('tutorial', 1),
 (',', 1),
 ('you', 1),
 ('will', 1),
 ('learn', 1),
 ('about', 1),
 ('nltk', 1),
 ('freqdist', 1),
 ('with', 1),
 ('example', 1),
 ('is', 1),
 ('used', 1),
 ('to', 1),
 ('find', 1),
 ('the', 1),
 ('frequency', 1)]

In [7]:
import random

def myshuffle(seed, data):
    """
    Shuffle data in place, reproducibly.

    Calling this with the same seed on sequences of equal length applies
    the same permutation to each of them, which keeps parallel lists
    (e.g. samples and labels) aligned.

    Arg:
        seed: value used to seed the random generator
        data: mutable sequence, reordered in place

    Return:
        None
    """
    # A private generator yields the same permutation as seeding the
    # module-level one, without resetting the global random state that
    # other code may be relying on.
    random.Random(seed).shuffle(data)

# Make a test: shuffling two lists of the same length with the same seed
# reorders both the same way, so X[i] and Y[i] stay paired afterwards.
# NOTE(review): X and Y are bound at module level whenever this cell runs.

X = [0.99103943, 0.98566308, 0.99641588, 0.99641577, 0.99103945, 0.98563734]

Y = [1, 2, 3, 4, 5, 6]

myshuffle(123, X)

myshuffle(123, Y)

print(X)

print(Y)

[0.99641577, 0.99103945, 0.98566308, 0.98563734, 0.99641588, 0.99103943]
[4, 5, 2, 6, 3, 1]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# Vectorization parameters (read by ngram_vectorize below)
# Range (inclusive) of n-gram sizes for tokenizing text:
# (1, 2) means unigrams and bigrams.
NGRAM_RANGE = (1, 2)
# Upper limit on the number of features kept after feature selection.
# We use the top 20K features (fewer if the vocabulary is smaller).
TOP_K = 20000
# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'
# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.
    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.
    The vocabulary and the feature selection are fitted on the training
    texts only and then applied unchanged to the validation texts.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts, reduced
        to at most TOP_K features and cast to float32.
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    # 'dtype' is deliberately left at its float64 default: tf-idf weights
    # are fractional, and an integer dtype such as 'int32' only makes
    # scikit-learn emit a warning before converting to float64 anyway.
    kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)
    # Vectorize validation texts with the vocabulary learned above.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features; k is capped at the number
    # of features actually produced so SelectKBest never asks for more.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)

    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')

    return x_train, x_val

In [None]:
# Progress marker: printed once every cell above has executed
# (presumably so a notebook loading this one via %run can see it finished).
print("Import Accessory_Functions.ipynb : DONE")