In [1]:
from collections import Counter

import nltk
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams

In [2]:
class Text():
    """Relative-frequency n-gram features for a single piece of text.

    The text is lower-cased, tokenised with ``TweetTokenizer`` and tagged with
    ``nltk.pos_tag``. ``row`` then maps every space-joined word n-gram and
    POS-tag n-gram to its share among all n-grams of the same order.

    NOTE: word n-grams and POS-tag n-grams are written into the same dict.
    When a joined word n-gram and a joined tag n-gram are the same string
    (e.g. the token "." and the tag "."), the POS value overwrites the word
    value because POS features are added last.
    """

    def __init__(self, text):
        """Tokenise and tag ``text`` and immediately build ``self.row``.

        Args:
            text: The raw string to analyse; it is lower-cased before
                tokenisation, so tagging also runs on lower-cased tokens.
        """
        self.text = text.lower()
        self.tokens = TweetTokenizer().tokenize(self.text)
        self.pos_tags = nltk.pos_tag(self.tokens)
        self.total_number_of_tokens = len(self.tokens)
        self.row = {}
        self.build_row()

    def build_row(self):
        """Fill ``self.row`` with word n-gram, then POS n-gram frequencies."""
        self.retrieve_gram_data()
        self.retrieve_pos_data()
        # TODO: word-diversity, word-length and sentence-length features
        # (retrieve_word_diversity_data, retrieve_word_length_data,
        # retrieve_sentence_length_data) are not implemented yet.

    def retrieve_gram_data(self, max_n=2):
        """Add relative frequencies of word n-grams for n = 1..max_n.

        Args:
            max_n: Largest n-gram order to include (default 2: unigrams and
                bigrams).
        """
        for n in range(1, max_n + 1):
            # Materialise the generator: the helper needs both the items and
            # their total number.
            self._add_relative_frequencies(list(ngrams(self.tokens, n)))

    def get_count_map(self, items):
        """Return a dict mapping each distinct item to its occurrence count.

        Args:
            items: Any iterable of hashable items.

        Returns:
            A plain ``dict`` whose keys appear in first-seen order.
        """
        return dict(Counter(items))

    def retrieve_pos_data(self, max_n=4):
        """Add relative frequencies of POS-tag n-grams for n = 1..max_n.

        Args:
            max_n: Largest n-gram order to include (default 4).
        """
        tags = [tag for (_word, tag) in self.pos_tags]
        for n in range(1, max_n + 1):
            # An empty range (fewer than n tags) simply contributes nothing.
            windows = [tuple(tags[i:i + n]) for i in range(len(tags) - n + 1)]
            self._add_relative_frequencies(windows)

    def _add_relative_frequencies(self, n_grams):
        """Store count / total for each distinct n-gram under its joined key.

        Args:
            n_grams: A list of tuples of strings, all of the same order.
        """
        total = len(n_grams)
        # With no n-grams the count map is empty, so no division by zero occurs.
        for n_gram, count in self.get_count_map(n_grams).items():
            self.row[" ".join(n_gram)] = count / total


In [3]:
# Text.__init__ already calls build_row(), so the feature row is complete
# right after construction; calling build_row() again only repeats the work.
text = Text(" the beautiful cat sits. the beautiful cat sits ")
text.row

{'.': 0.1111111111111111,
 '. DT': 0.125,
 '. DT JJ': 0.14285714285714285,
 '. DT JJ NN': 0.16666666666666666,
 '. the': 0.125,
 'DT': 0.2222222222222222,
 'DT JJ': 0.25,
 'DT JJ NN': 0.2857142857142857,
 'DT JJ NN NNS': 0.3333333333333333,
 'JJ': 0.2222222222222222,
 'JJ NN': 0.25,
 'JJ NN NNS': 0.2857142857142857,
 'JJ NN NNS .': 0.16666666666666666,
 'NN': 0.2222222222222222,
 'NN NNS': 0.25,
 'NN NNS .': 0.14285714285714285,
 'NN NNS . DT': 0.16666666666666666,
 'NNS': 0.2222222222222222,
 'NNS .': 0.125,
 'NNS . DT': 0.14285714285714285,
 'NNS . DT JJ': 0.16666666666666666,
 'beautiful': 0.2222222222222222,
 'beautiful cat': 0.25,
 'cat': 0.2222222222222222,
 'cat sits': 0.25,
 'sits': 0.2222222222222222,
 'sits .': 0.125,
 'the': 0.2222222222222222,
 'the beautiful': 0.25}

In [4]:
# One-row feature table: columns are the n-gram keys of text.row, and the
# index label identifies the document.
df1 = pd.DataFrame(text.row, index=["text1"])
df1


Unnamed: 0,.,. DT,. DT JJ,. DT JJ NN,. the,DT,DT JJ,DT JJ NN,DT JJ NN NNS,JJ,...,NNS . DT,NNS . DT JJ,beautiful,beautiful cat,cat,cat sits,sits,sits .,the,the beautiful
text1,0.111111,0.125,0.142857,0.166667,0.125,0.222222,0.25,0.285714,0.333333,0.222222,...,0.142857,0.166667,0.222222,0.25,0.222222,0.25,0.222222,0.125,0.222222,0.25


In [5]:
# Rebinds `text` to a second document. The constructor has already built the
# feature row, so no explicit build_row() call is needed.
text = Text(" The sky is blue. the sky is blue ")
text.row

{'.': 0.1111111111111111,
 '. DT': 0.125,
 '. DT NN': 0.14285714285714285,
 '. DT NN VBZ': 0.16666666666666666,
 '. the': 0.125,
 'DT': 0.2222222222222222,
 'DT NN': 0.25,
 'DT NN VBZ': 0.2857142857142857,
 'DT NN VBZ JJ': 0.3333333333333333,
 'JJ': 0.2222222222222222,
 'JJ .': 0.125,
 'JJ . DT': 0.14285714285714285,
 'JJ . DT NN': 0.16666666666666666,
 'NN': 0.2222222222222222,
 'NN VBZ': 0.25,
 'NN VBZ JJ': 0.2857142857142857,
 'NN VBZ JJ .': 0.16666666666666666,
 'VBZ': 0.2222222222222222,
 'VBZ JJ': 0.25,
 'VBZ JJ .': 0.14285714285714285,
 'VBZ JJ . DT': 0.16666666666666666,
 'blue': 0.2222222222222222,
 'blue .': 0.125,
 'is': 0.2222222222222222,
 'is blue': 0.25,
 'sky': 0.2222222222222222,
 'sky is': 0.25,
 'the': 0.2222222222222222,
 'the sky': 0.25}

In [6]:
# Wrap the current text's feature row in a single-row frame labelled "text2".
df2 = pd.DataFrame(data=text.row, index=["text2"])
df2

Unnamed: 0,.,. DT,. DT NN,. DT NN VBZ,. the,DT,DT NN,DT NN VBZ,DT NN VBZ JJ,JJ,...,VBZ JJ .,VBZ JJ . DT,blue,blue .,is,is blue,sky,sky is,the,the sky
text2,0.111111,0.125,0.142857,0.166667,0.125,0.222222,0.25,0.285714,0.333333,0.222222,...,0.142857,0.166667,0.222222,0.125,0.222222,0.25,0.222222,0.25,0.222222,0.25


In [7]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# sort=True keeps the union of columns in alphabetical order; features missing
# from a text become NaN.
# NOTE: this rebinds df1, so re-running the cell adds the text2 row again.
df1 = pd.concat([df1, df2], sort=True)
df1

Unnamed: 0,.,. DT,. DT JJ,. DT JJ NN,. DT NN,. DT NN VBZ,. the,DT,DT JJ,DT JJ NN,...,cat sits,is,is blue,sits,sits .,sky,sky is,the,the beautiful,the sky
text1,0.111111,0.125,0.142857,0.166667,,,0.125,0.222222,0.25,0.285714,...,0.25,,,0.222222,0.125,,,0.222222,0.25,
text2,0.111111,0.125,,,0.142857,0.166667,0.125,0.222222,,,...,,0.222222,0.25,,,0.222222,0.25,0.222222,,0.25


In [8]:
# Concatenating onto an empty frame yields df1's rows and columns.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The result is only displayed: df3 itself stays empty.
df3 = pd.DataFrame()
pd.concat([df3, df1])

Unnamed: 0,.,. DT,. DT JJ,. DT JJ NN,. DT NN,. DT NN VBZ,. the,DT,DT JJ,DT JJ NN,...,cat sits,is,is blue,sits,sits .,sky,sky is,the,the beautiful,the sky
text1,0.111111,0.125,0.142857,0.166667,,,0.125,0.222222,0.25,0.285714,...,0.25,,,0.222222,0.125,,,0.222222,0.25,
text2,0.111111,0.125,,,0.142857,0.166667,0.125,0.222222,,,...,,0.222222,0.25,,,0.222222,0.25,0.222222,,0.25
