In [1]:
import re
import numpy as np
from spacy.en import English as parser_spacy

In [2]:
class Info(object):
    """Subscript probe: ``info[key]`` echoes back the type of ``key`` together with ``key``."""

    def __getitem__(self, items):
        # Whatever is written between the brackets (index, slice, tuple, ...)
        # arrives here as a single object.
        item_type = type(items)
        return item_type, items


info = Info()

In [58]:
class CorpusPreprocess(object):

    """
    Description : Pre-processing of text with the spaCy parser.

    * compounding    : sticks 'amod' tokens to the token that follows them
                       with '|' and joins runs of consecutive nouns with '_'.
    * pos_extensions : (token, POS tag) pairs of the compounded text.
    """

    def __init__(self):
        """
        Builds the spaCy English parser and the list of lower-cased
        interrogative words that are never merged into a noun compound.
        """
        self.parser_spacy = parser_spacy()
        self.intero = "What,Whose,Where,Why,How,Which,When,Who".lower().split(',')

    @staticmethod
    def _to_text(value):
        """
        Returns `value` as text: byte strings are decoded as UTF-8, anything
        else is handed back untouched. Keeps the class usable whether
        `str(token)` yields bytes (Python 2) or text (Python 3).
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def compounding(self, text):

        """
        Desc : Rewrites `text` so that multi-word units become single tokens
        Input : text (byte string or unicode)
        Output : unicode string; tokens are separated by single spaces,
                 '-' is replaced by '_', an 'amod' token is glued to the next
                 token with '|' and consecutive NOUN/PROPN tokens are glued
                 with '_' unless one of the two is an interrogative word.
        """

        text = self._to_text(text)

        # Pieces of every sentence are collected here. The accumulator must
        # live outside the sentence loop, otherwise only the last sentence
        # of a multi-sentence text survives.
        pieces = []

        for sent in self.parser_spacy(text).sents:

            sentence = self._to_text(str(sent)).replace('-', '_')

            # Pass 1 : stick a modifier to the word that follows it
            glued = []
            for tok in self.parser_spacy(sentence):
                word = self._to_text(str(tok)).strip()
                glued.append(word + ('|' if tok.dep_ == 'amod' else ' '))

            # Pass 2 : combine consecutive nouns
            prev_word = None
            prev_is_noun = False
            for tok in self.parser_spacy(''.join(glued)):
                word = self._to_text(str(tok))
                is_noun = tok.pos_ in ('NOUN', 'PROPN')

                # The first token of a sentence has no predecessor
                # (prev_is_noun is False), so it can never be joined to the
                # sentence's last token by negative indexing.
                joinable = (
                    is_noun
                    and prev_is_noun
                    and prev_word.lower() not in self.intero
                    and word.lower() not in self.intero
                )
                pieces.append(('_' if joinable else ' ') + word)
                prev_word, prev_is_noun = word, is_noun

        # Returns text after collapsing consecutive whitespace
        return re.sub(r'\s+', ' ', ''.join(pieces).strip())

    def pos_extensions(self, text):
        """
        Desc : POS tags of the compounded form of `text`
        Input : text (byte string or unicode)
        Output : list of (token text, token.pos_) tuples obtained by parsing
                 the output of `compounding`
        """

        compounded = self.compounding(text)

        return [(self._to_text(str(tok)), tok.pos_) for tok in self.parser_spacy(compounded)]

In [59]:
# Build the preprocessor; its constructor instantiates the spaCy English parser.
cp = CorpusPreprocess()

In [60]:
# Demo: 'amod' tokens get a trailing '|' and adjacent nouns are joined with '_'.
cp.compounding('Bill Gates is the richest man on Wall Street, who also does most charity!')

u'Bill_Gates is the richest|man on Wall_Street , who also does most|charity !'

In [61]:
# Demo: (token, POS tag) pairs obtained by re-parsing the compounded sentence.
cp.pos_extensions('Bill Gates is the richest man on Wall Street, who also does most charity!')

[(u'Bill_Gates', u'NOUN'),
 (u'is', u'VERB'),
 (u'the', u'DET'),
 (u'richest|man', u'NOUN'),
 (u'on', u'ADP'),
 (u'Wall_Street', u'PROPN'),
 (u',', u'PUNCT'),
 (u'who', u'NOUN'),
 (u'also', u'ADV'),
 (u'does', u'VERB'),
 (u'most|charity', u'NOUN'),
 (u'!', u'PUNCT')]

# Regex cleaning (RegexStudio)

In [110]:
class RegexStudio(object):

    """
    Various regex options for cleaning a string.
    """

    def __init__(self):
        # Initialize regex(s). Raw strings keep the exact same pattern text
        # without relying on invalid escape sequences such as '\S'.
        # NOTE(review): the dots in the 'www' alternative are unescaped, so the
        # url pattern is looser than it looks; left untouched on purpose.
        self.regex_url = r'https*\S*|www.\S*.\S'
        self.regex_hashtag = r'#[^\s#]+'
        self.regex_username = r'@[^\s@]+'
        self.regex_alpha_only = r'[A-Za-z]+'
        # Needed by clean(alnum=False); it was missing, which raised
        # AttributeError.
        self.regex_alnum = r'[A-Za-z0-9]+'
        # Extraction pattern (captures the text inside the parentheses).
        self.regex_between_quotes = r'.*\((.*?)\)'
        # Removal pattern: the parenthetical only, without the greedy '.*'
        # prefix that would also swallow everything in front of it.
        self.regex_parenthetical = r'\(.*?\)'
        self.regex_manage_spaces = r'\s\s+'

    def between_substrings(self, text, s1='^', s2='$'):
        """
        Desc : Finds the text enclosed between two literal substrings
        Input : text - string to search
                s1, s2 - opening / closing delimiters, matched literally
                         (the defaults are the literal characters '^' and '$',
                         not regex anchors)
        Output : list of captured strings. Because of the greedy '.*' prefix
                 only the last s1...s2 pair of each line is captured.
        """

        # re.escape replaces the hand-rolled escaping, whose result depended
        # on set iteration order when a delimiter contained a backslash.
        regex_between_two_substrings = '.*' + re.escape(s1) + '(.*?)' + re.escape(s2)

        return re.findall(regex_between_two_substrings, text)

    def clean(self,
              text,
              url=False,
              hashtag=False,
              username=False,
              alpha_only=False,
              alnum=True,
              between_quotes=False,
              manage_spaces=True,
              lower=False):
        """
        Desc : Cleans `text` according to the (boolean) flags, applied in
               the order listed below
        Input : lower - True lower-cases the text first
                url / hashtag / username - False replaces every match of the
                    corresponding pattern with a space
                alpha_only - False (the default!) keeps only the '[A-Za-z]+'
                    runs, joined by single spaces
                alnum - False keeps only the '[A-Za-z0-9]+' runs, joined by
                    single spaces
                between_quotes - False replaces every '(...)' group with a space
                manage_spaces - True collapses runs of 2+ whitespace
                    characters into one space
        Output : cleaned string
        """

        if lower:
            text = text.lower()

        if not url:
            text = re.sub(self.regex_url, ' ', text)

        if not hashtag:
            text = re.sub(self.regex_hashtag, ' ', text)

        if not username:
            text = re.sub(self.regex_username, ' ', text)

        if not alpha_only:
            text = " ".join(re.findall(self.regex_alpha_only, text))

        if not alnum:
            text = " ".join(re.findall(self.regex_alnum, text))

        if not between_quotes:
            # Only the parenthetical itself is dropped; the text in front of
            # it is preserved.
            text = re.sub(self.regex_parenthetical, ' ', text)

        if manage_spaces:
            text = re.sub(self.regex_manage_spaces, ' ', text)

        return text

In [111]:
# Cleaner instance; its regex patterns are stored as attributes by the constructor.
rs = RegexStudio()

In [115]:
# Sample sentence reused by the RegexStudio and StringInfo demo cells.
text = "Bill Gates is the richest man on Wall Street 09, who also does most charity (or so he claims)!"

In [116]:
# Default flags: alpha_only=False keeps only the '[A-Za-z]+' runs, so digits and punctuation are dropped.
rs.clean(text=text)

'Bill Gates is the richest man on Wall Street who also does most charity or so he claims'

# String information (StringInfo)

In [120]:
class StringInfo(object):

    """
    Extracts various pieces of information out of a text.
    """

    def __init__(self):
        self.__regex_url__ = r'https*\S*|www.\S*.\S'
        self.__regex_hashtag__ = r'#[^\s#]+'
        self.__regex_username__ = r'@[^\s@]+'
        self.__regex_alpha_only__ = r'[A-Za-z]+'
        self.__regex_alnum__ = r'[A-Za-z0-9]+'
        self.__regex_between_quotes__ = r'.*\((.*?)\)'
        self.__regex_manage_spaces__ = r'\s\s+'
        # Same results as the individual attributes, keyed by name.
        self.full = {}
        # Defined up front so that str(instance) works before info() is called.
        self._text = ''

    def info(self,
             text,
             url=False,
             hashtag=False,
             username=False,
             alpha_only=False,
             alnum=True,
             between_quotes=False,
             manage_spaces=True,
             lower=False):
        """
        Desc : Analyses `text` and stores the findings on the instance
        Input : text - string to analyse
                lower - True lower-cases the text before extraction
                        (str(instance) still gives the text as passed in)
                url, hashtag, username, alpha_only, alnum, between_quotes,
                manage_spaces - accepted for signature compatibility only;
                        they are not used, every field is always extracted
        Output : the instance itself, with these attributes (also in `full`):
                 url, hashtag, username - lists of matches
                 alpha_only, alnum - matching runs joined by single spaces
                 inquotes - list of texts found inside '(...)'; because of
                            the greedy '.*' prefix of the pattern only the
                            last group of each line is reported
        """
        self._text = text
        if lower:
            text = text.lower()

        # Each pattern is evaluated once; `full` receives its own copy of
        # the list results so the two views stay independent.
        self.url = re.findall(self.__regex_url__, text)
        self.hashtag = re.findall(self.__regex_hashtag__, text)
        self.username = re.findall(self.__regex_username__, text)
        self.alpha_only = " ".join(re.findall(self.__regex_alpha_only__, text))
        self.alnum = " ".join(re.findall(self.__regex_alnum__, text))
        self.inquotes = re.findall(self.__regex_between_quotes__, text)

        self.full['url'] = list(self.url)
        self.full['hashtag'] = list(self.hashtag)
        self.full['username'] = list(self.username)
        self.full['alpha_only'] = self.alpha_only
        self.full['alnum'] = self.alnum
        self.full['inquotes'] = list(self.inquotes)

        return self

    def __str__(self):
        # Text handed to the most recent info() call ('' before the first one).
        return self._text

In [121]:
# Extractor instance; its result attributes are filled in by info().
si = StringInfo()

In [124]:
# NOTE: rebinds `info` (earlier bound to an Info() instance); si.info() returns the StringInfo instance itself.
info = si.info(text)

In [133]:
# Text captured inside the parentheses of the sample sentence.
info.inquotes

['or so he claims']

# Corpus formatting (CorpusFormat)

In [134]:
class CorpusFormat(object):

    """
    Helpers for shaping a text corpus: train/test splitting, start/end
    padding, word indexing and vocabulary / word-vector summaries.
    """

    def __init__(self):
        pass

    @staticmethod
    def _words(string, lower, regex, numbers):
        """
        Desc : Shared tokeniser of pad / index / corpus_info
        Input : string to split, lower-casing flag, word regex, numbers flag
        Output : list of regex matches; when `numbers` is False the regex is
                 replaced by a letters-only pattern
        """
        if lower:
            string = string.lower()
        if not numbers:
            # '+' matters: without it every single letter becomes a "word".
            regex = "[A-Za-z]+"
        return re.findall(regex, string)

    def _resolve_model(self, model):
        """
        Desc : Validates a word-vector model and normalises it to a lookup
        Input : one of
                * gensim Word2Vec model (detected through its type name;
                  assumes the old API exposing `.vocab` and `model[word]`
                  - TODO confirm against the installed gensim version)
                * dict {word: numpy array}
                * list of (word, numpy array) tuples
                * [list of words, list of numpy arrays]
        Output : (lookup, word_dim) where lookup[word] gives the vector
        Raises : TypeError for any other format
        """
        if str(type(model)) == "<class 'gensim.models.word2vec.Word2Vec'>":
            first_word = next(iter(model.vocab))
            return model, list(model[first_word].shape)[0]

        if isinstance(model, dict) and model:
            value = next(iter(model.values()))
            if isinstance(value, np.ndarray):
                return model, len(value)
            raise TypeError("Hint : values of a dict model must be numpy arrays")

        if isinstance(model, list) and model:
            first = model[0]

            if isinstance(first, tuple) and len(first) == 2:
                # List of (word, vector) tuples
                if isinstance(first[1], np.ndarray):
                    return dict(model), len(first[1])
                if isinstance(first[0], np.ndarray):
                    raise TypeError("Hint : reverse format of dict")

            elif isinstance(first, list) and len(model) == 2:
                # [words, vectors] : two parallel lists
                words, vectors = model
                if len(vectors) > 0 and isinstance(vectors[0], np.ndarray):
                    return dict(zip(words, vectors)), len(vectors[0])
                if len(words) > 0 and isinstance(words[0], np.ndarray):
                    raise TypeError("Hint : reverse format of dict")

        raise TypeError("Hint : format of model not supported")

    def train_test_set(self, corpus, train_ratio=0.8):

        """
        Desc : Divides the input list into training and test sets after shuffling
        Input : Corpus to be divided (list or numpy array); it is left
                untouched, a shuffled copy is split instead
        Output : Tuple of Training and Test set (In that order)
        """

        if isinstance(corpus, np.ndarray):
            shuffled = corpus.copy()
        else:
            shuffled = list(corpus)
        np.random.shuffle(shuffled)

        training_set_size = int(train_ratio * len(shuffled))

        return (shuffled[:training_set_size], shuffled[training_set_size:])

    def pad(self, string, pre_padding=1, post_padding=1, lower=True, regex="[A-Za-z0-9]+", numbers=True):

        """
        Desc : Splits a string into words and surrounds them with markers
        Input : string - text to split
                pre_padding / post_padding - number of '<start>' / '<end>'
                    markers to add in front of / after the words
                lower - lower-case the text first
                regex - pattern describing one word
                numbers - False switches to a letters-only word pattern
        Output : list : '<start>' markers + words + '<end>' markers
        """

        words = self._words(string.strip(), lower, regex, numbers)

        return ['<start>'] * pre_padding + words + ['<end>'] * post_padding

    def index(self, string, word_to_index=False, lower=True, pads=['<>'], regex="[A-Za-z0-9]+", numbers=True):

        """
        Desc : Creates a dictionary containing indexes of the words
        Input : string - corpus
                word_to_index - False to number the (sorted) unique words
                    from 0, or an existing dict whose indexes are reused.
                    The dict is extended IN PLACE with unseen words, each
                    receiving len(word_to_index) + 1
                    (NOTE(review): that leaves a gap for 0-based
                    vocabularies - confirm the intended base)
                lower - lower-case the text first
                pads - regex patterns removed from the text before splitting
                regex - pattern describing one word
                numbers - False switches to a letters-only word pattern
        Output : Dict[word] = respective_index
        Raises : TypeError when word_to_index is neither False nor a dict
        """

        if lower:
            string = string.lower()

        for pad_pattern in pads:
            string = re.sub(pad_pattern, '', string)

        # Sorted so that the numbering is reproducible between runs.
        unique_words = sorted(set(self._words(string.strip(), False, regex, numbers)))

        if word_to_index is False or word_to_index is None:
            return {word: i for i, word in enumerate(unique_words)}

        if not isinstance(word_to_index, dict):
            raise TypeError("word_to_index must be False or a dict")

        word_index = {}
        for word in unique_words:
            if word not in word_to_index:
                word_to_index[word] = len(word_to_index) + 1
            word_index[word] = word_to_index[word]

        return word_index

    def corpus_info(self, corpus, lower=True, regex="[A-Za-z0-9]+", numbers=True, model=False):

        """
        Desc : Vocabulary summary of a corpus, optionally with word vectors
        Input : corpus - string or list of strings
                lower - lower-case the text first
                regex - pattern describing one word
                numbers - False switches to a letters-only word pattern
                model - word-vector model in one of the supported formats:
                    gensim Word2Vec model, dict {word: numpy array}, list of
                    (word, numpy array) tuples, or [words, vectors] lists.
                    When omitted, the model stored by a previous call is
                    reused; without any model the vector related entries
                    are skipped.
        Output : dict with the keys
                 vocab_size     - number of unique words
                 word_dim       - vector length (None without a model)
                 index_to_word  - sorted list of unique words
                 word_to_index  - {word: position in index_to_word}
                 word_to_vector - {word: vector}, zeros for words unknown to
                                  the model (only with a model)
                 summary        - printable recap
        Raises : TypeError for an unsupported model or corpus type
        """

        # The most recently supplied model wins and is remembered on the
        # instance; otherwise fall back to the remembered one, if any.
        if model is None or model is False:
            model = self.__dict__.get('model')

        if model is None or model is False:
            lookup, word_dim = None, None
        else:
            lookup, word_dim = self._resolve_model(model)
            self.model = lookup

        if isinstance(corpus, (list, tuple)):
            corpus = ' '.join(corpus)
        elif not hasattr(corpus, 'lower'):
            raise TypeError("corpus must be a string or a list of strings")

        # Sorted so that the indexes are reproducible between runs.
        index_to_word = sorted(set(self._words(corpus, lower, regex, numbers)))
        vocab_size = len(index_to_word)

        full_corpus_info = {
            'word_dim': word_dim,
            'index_to_word': index_to_word,
            'vocab_size': vocab_size,
            'word_to_index': {word: i for i, word in enumerate(index_to_word)},
        }

        if lookup is not None:
            word_to_vector = {}
            for word in index_to_word:
                try:
                    word_to_vector[word] = lookup[word]
                except KeyError:
                    # Out-of-vocabulary words get an all-zero vector.
                    word_to_vector[word] = np.zeros(word_dim)
            full_corpus_info['word_to_vector'] = word_to_vector

        summary = 'vocab size : '+str(vocab_size)+"\n"+"dimensions of words : "+str(word_dim)
        full_corpus_info['summary'] = summary

        return full_corpus_info