In [None]:
# Render the HTML file at ./Static/Helpfunction.html as rich output for this cell.
# NOTE(review): `display` is not imported anywhere in the visible code; it is
# presumably the builtin that IPython/Jupyter injects into the kernel namespace --
# confirm before running this outside a notebook.
from IPython.core.display import HTML
display(HTML(filename="./Static/Helpfunction.html"))

In [None]:
import re # import "re" function
import nltk # import nltk library

from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.probability import FreqDist
from nltk.util import ngrams
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import spacy                    #import spacy module

import en_core_web_sm
nlp = en_core_web_sm.load()

from contractions import CONTRACTION_MAP

In [None]:
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When True, digits are removed as well.

    Returns:
        The filtered string.
    """
    # The upper-case range must be spelled `A-Z`. The previous `A-z` also
    # covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those
    # special characters were kept whenever remove_digits was True.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def remove_stopwords(text, is_lower_case=False):
    """Drop stopwords from ``text`` and return the surviving tokens joined by spaces.

    Relies on two module-level names that are not defined in the visible part
    of this file: ``tokenizer`` (its ``tokenize`` method splits the text) and
    ``stopword_list`` (the collection of words to drop).

    Args:
        text: Input string.
        is_lower_case: When True the tokens are compared to the stopword list
            as-is; otherwise each token is lower-cased for the comparison only.

    Returns:
        The remaining tokens (stripped, original casing) joined with one space.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        word = raw_token.strip()
        # Only the lookup key is lower-cased; the emitted token keeps its case.
        lookup_key = word if is_lower_case else word.lower()
        if lookup_key not in stopword_list:
            kept.append(word)
    return ' '.join(kept)

def simple_stemmer(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    Args:
        text: Input string.

    Returns:
        The stemmed words joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)
   
def lemmatize_text(text):
    """Lemmatize every token of ``text`` as a verb and re-join the tokens.

    Tokens come from ``nltk.word_tokenize``; each is passed to the module-level
    ``wordnet_lemmatizer`` (not defined in the visible part of this file) with
    ``pos="v"``.

    Args:
        text: Input string.

    Returns:
        The lemmatized tokens joined with single spaces.
    """
    tokens = nltk.word_tokenize(text)
    return " ".join(wordnet_lemmatizer.lemmatize(token, pos="v") for token in tokens)

def expand_contractions(text):
    """Replace contractions in ``text`` with their expanded forms.

    Contractions and their expansions are read from the module-level
    ``CONTRACTION_MAP`` (contraction -> expansion). Matching is
    case-insensitive, and the first character of the matched text is kept so
    that the original capitalisation survives. Any apostrophe still present
    after the expansion is removed.

    Args:
        text: Input string.

    Returns:
        The text with known contractions expanded and apostrophes stripped.
    """
    # Case-insensitive fallback for matches whose exact casing is not a key.
    lowered_map = {key.lower(): value for key, value in CONTRACTION_MAP.items()}

    # Longest keys first: regex alternation picks the first alternative that
    # matches, so "can't" must not shadow "can't've". Empty keys are skipped
    # because they would match at every position.
    alternatives = sorted((key for key in CONTRACTION_MAP if key), key=len, reverse=True)

    expanded_text = text
    if alternatives:
        # Keys are escaped so they are always matched literally.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = CONTRACTION_MAP.get(match) or lowered_map.get(match.lower())
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            # Keep the first character as typed to preserve capitalisation.
            return match[0] + expanded_contraction[1:]

        expanded_text = contractions_pattern.sub(expand_match, text)

    return re.sub("'", "", expanded_text)

In [None]:
def freq_ngram(document, N = 1, allgram = False):
    """Count n-gram occurrences over a collection of texts.

    All texts are joined with spaces and then split on whitespace, so n-grams
    can span the boundary between two consecutive texts.

    Args:
        document: A single string or an iterable of strings.
        N: Size of the n-grams to count.
        allgram: When True and ``N > 1``, the n-grams of every size from 1 to
            ``N - 1`` are counted as well.

    Returns:
        A DataFrame with the columns ``word`` (the n-gram, tokens joined by a
        space), ``count`` and ``frequency`` (count divided by the total number
        of counted n-grams).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(document, str):
        document = [document]

    tok_list = ' '.join(document).split()
    ngram_tok_list = [' '.join(toks) for toks in ngrams(tok_list, N)]
    if allgram and N > 1:
        for size in range(1, N):
            ngram_tok_list.extend(' '.join(toks) for toks in ngrams(tok_list, size))

    fdist = nltk.FreqDist(ngram_tok_list)
    counts = np.array(list(fdist.values()))
    words_df = pd.DataFrame({'word': list(fdist.keys()),
                             'count': counts,
                             'frequency': counts / counts.sum()})
    return words_df

    
def viz_ngram_freq(df, figsize = (8,8)):
    """Draw a horizontal bar chart of n-gram counts.

    Args:
        df: DataFrame with a ``count`` column (bar length) and a ``word``
            column (one bar per row), e.g. the output of ``freq_ngram``.
        figsize: Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    axes = sns.barplot(x="count", y="word", data=df)
    axes.set_ylabel('Word')
    plt.show()

In [None]:
def create_bow_matrix(document, tfidf = True):
    """Build a document-term matrix for a collection of texts.

    Args:
        document: Iterable of strings, one per text.
        tfidf: When True the matrix is built with ``TfidfVectorizer``,
            otherwise with ``CountVectorizer``.

    Returns:
        A DataFrame with one row per text and one column per vocabulary term.
    """
    # The corpus belongs to fit_transform only. Passing it to the constructor
    # bound it to the vectorizer's first option (`input`), which current
    # scikit-learn releases reject.
    vectorizer = TfidfVectorizer() if tfidf else CountVectorizer()
    bow_matrix = vectorizer.fit_transform(document)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return pd.DataFrame(bow_matrix.toarray(), columns=feature_names)

def get_text_similarity(doc_a, doc_b, method = "cosine"):
    """Compute a pairwise similarity (or distance) matrix over two sets of texts.

    The texts of ``doc_a`` followed by those of ``doc_b`` are vectorised
    together with ``create_bow_matrix`` (default settings) and compared
    pairwise.

    Args:
        doc_a: A string or an iterable of strings.
        doc_b: A string or an iterable of strings.
        method: ``"cosine"`` selects cosine similarity; any other value
            selects Euclidean distance.

    Returns:
        A square DataFrame whose rows and columns follow the order of the
        texts in ``doc_a`` and then ``doc_b``.
    """
    # Normalise both inputs to plain lists before concatenating: `+` raises
    # for a tuple plus a list and combines pandas Series / NumPy arrays
    # element-wise instead of appending them.
    texts_a = [doc_a] if isinstance(doc_a, str) else list(doc_a)
    texts_b = [doc_b] if isinstance(doc_b, str) else list(doc_b)
    doc = texts_a + texts_b

    bow_matrix = create_bow_matrix(doc)
    if method == "cosine":
        sim_matrix = cosine_similarity(bow_matrix)
    else:
        sim_matrix = euclidean_distances(bow_matrix)

    return pd.DataFrame(sim_matrix)
    

In [None]:
class ExtactInfo:


    def __init__(self):
        self.nlpdoc = []

    SUBJECTS_DEP = ["nsubj",  "csubj", "expl"]
    PASSIVE_SUBJ_DEP = ["nsubjpass", "csubjpass"]
    OBJECTS_DEP = ["dobj", "dative", "pobj", "oprd", ]
    CONJ_DEP = ["cc", "conj"]
    
    def start_extract(self, document):
        
        if isinstance(document, str):
            document = [document]        
        
        self.nlpdoc = list(nlp.pipe(document))

    def get_postag(self, postagtype = "univ"):
        
        if len(self.nlpdoc) == 0:
            raise ValueError(("""
            The document is empty. 
            You should use start_extract() to initiate the extracter before you use this function
            """))

        pos_tag = []
        if postagtype == "univ":
            for doc in self.nlpdoc:
                pos_tag.append(dict(Counter([tok.pos_ for tok in doc])))
        else:
            for doc in self.nlpdoc:
                pos_tag.append(dict(Counter([tok.tag_ for tok in doc])))            
        
        return pd.DataFrame(pos_tag, dtype='Int64').fillna(0)
    
    def get_noun_phrase(self):
        
        if len(self.nlpdoc) == 0:
            raise ValueError(("""
            The document is empty. 
            You should use start_extract() to initiate the extracter before you use this function
            """))

        return [list(doc.noun_chunks) for doc in self.nlpdoc] 
    