In [86]:
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer
import gensim
from collections import  Counter
import string
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize
import pyLDAvis
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob
from spacy import displacy
import nltk
from textblob import TextBlob
from textstat import flesch_reading_ease

# Notebook-wide plot defaults: large fonts and a large canvas.
plt.rcParams.update({'font.size': 18, 'figure.figsize': [16, 12]})

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names. Prefer the new name when this installation has
# it; otherwise fall back to the original name (which raises, as before, if
# that is unavailable too).
_WHITEGRID_STYLES = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next(
    (style for style in _WHITEGRID_STYLES if style in plt.style.available),
    'seaborn-whitegrid',
))


# In[ ]:


class vizzy_sentence:
    '''Exploratory plots for one free-text column of a DataFrame.

    Every ``show_*`` method reads ``self.data[self.column]``. Plotting methods
    return the object produced by the underlying plotting call;
    ``show_neg_sentiment`` and ``show_pos_sentiment`` return example texts.
    '''

    def __init__(self, data, column, label):
        '''Lower-case the text column and derive a stop-word-free copy of it.

        ``data`` is modified in place: ``data[column]`` is replaced by its
        lower-cased form and a ``'text_without_stopwords'`` column is added.

        Args:
            data: DataFrame holding the text; every value in ``column`` must
                be a string (``.lower()`` is called on each value).
            column: Name of the text column.
            label: Kept on ``self.label``; no method of this class reads it.
        '''
        # Imported under an alias so the word set below does not shadow the
        # module name.
        from nltk.corpus import stopwords as nltk_stopwords
        english_stopwords = set(nltk_stopwords.words('english'))
        data[column] = data[column].apply(lambda x: x.lower())
        data['text_without_stopwords'] = data[column].apply(
            lambda x: ' '.join(
                word for word in x.split()
                if word.lower() not in english_stopwords
            )
        )
        self.data = data
        self.column = column
        self.label = label

    # ------------------------------------------------------------------
    # Private helpers shared by several show_* methods
    # ------------------------------------------------------------------
    def _tokens(self):
        '''Return every whitespace-separated token of the column as one flat list.'''
        return [word for words in self.data[self.column].str.split() for word in words]

    def _score_polarity(self):
        '''Write the TextBlob polarity of each text to ``data['polarity_score']``.'''
        self.data['polarity_score'] = self.data[self.column].apply(
            lambda text: TextBlob(text).sentiment.polarity
        )

    @staticmethod
    def _sentiment_label(score):
        '''Map a polarity score to 'neg' (< 0), 'neu' (== 0) or 'pos' (otherwise).'''
        if score < 0:
            return 'neg'
        elif score == 0:
            return 'neu'
        else:
            return 'pos'

    def _label_polarity(self):
        '''Refresh ``data['polarity_score']`` and the derived ``data['polarity']``.'''
        self._score_polarity()
        self.data['polarity'] = self.data['polarity_score'].map(self._sentiment_label)

    def _plot_top_ngrams(self, n):
        '''Bar-plot the 10 most frequent n-grams found by a default CountVectorizer.'''
        texts = self.data[self.column]
        vec = CountVectorizer(ngram_range=(n, n)).fit(texts)
        sum_words = vec.transform(texts).sum(axis=0)
        words_freq = sorted(
            ((ngram, sum_words[0, idx]) for ngram, idx in vec.vocabulary_.items()),
            key=lambda x: x[1],
            reverse=True,
        )
        x, y = map(list, zip(*words_freq[:10]))
        return sns.barplot(x=y, y=x)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def show_char_count(self):
        '''Histogram of the length of your data column in characters'''
        return self.data[self.column].str.len().hist()

    def show_word_count(self):
        '''Histogram of the length of data column in words'''
        return self.data[self.column].str.split().map(len).hist()

    def show_word_length(self):
        '''Histogram of the mean word length (in characters) of each text'''
        def mean_word_length(words):
            lengths = [len(word) for word in words]
            # A text without any word has no mean; NaN is what np.mean([])
            # yields too, but this avoids its RuntimeWarning.
            return np.mean(lengths) if lengths else np.nan

        return self.data[self.column].str.split().map(mean_word_length).hist()

    def show_common_stopwords(self):
        '''Bar chart of the 10 most frequent stop words (wordcloud STOPWORDS) in the data'''
        counts = Counter(word for word in self._tokens() if word in STOPWORDS)
        top = sorted(counts.items(), key=lambda x: x[1], reverse=True)[:10]
        # zip(*[]) cannot be unpacked into two names, so data without any stop
        # word gets an empty chart instead of a ValueError.
        if top:
            x, y = zip(*top)
        else:
            x, y = (), ()
        return plt.bar(x, y)

    def show_common_words(self):
        '''Bar chart of the non-stop-words among the 40 most frequent tokens

        Stop words are dropped after the top 40 are selected, so fewer than 40
        bars may be shown.
        '''
        most = Counter(self._tokens()).most_common()
        x, y = [], []
        for word, count in most[:40]:
            if word not in STOPWORDS:
                x.append(word)
                y.append(count)
        # Returned like every other plotting method of this class.
        return sns.barplot(x=y, y=x)

    def show_sentiment(self):
        '''Histogram of TextBlob polarity; also stored in data['polarity_score']'''
        self._score_polarity()
        return self.data['polarity_score'].hist()

    def show_sentiment_cats(self):
        '''Plot data by sentiment (pos, neu, neg)'''
        self._label_polarity()
        category_counts = self.data.polarity.value_counts()
        return plt.bar(category_counts.index, category_counts)

    def show_neg_sentiment(self):
        '''Return up to 5 texts whose polarity is negative'''
        self._label_polarity()
        return self.data[self.data['polarity'] == 'neg'][self.column].head(5)

    def show_pos_sentiment(self):
        '''Return up to 5 texts whose polarity is positive'''
        self._label_polarity()
        return self.data[self.data['polarity'] == 'pos'][self.column].head(5)

    def show_flesch_kincaid(self):
        '''Histogram of textstat's flesch_reading_ease score of each text

        Despite the method name, the score plotted is the Flesch Reading Ease
        value returned by ``flesch_reading_ease``.
        '''
        return self.data[self.column].apply(lambda x: flesch_reading_ease(x)).hist()

    def show_bi_grams(self):
        '''show most common bi-grams'''
        return self._plot_top_ngrams(2)

    def show_tri_grams(self):
        '''show most common tri-grams'''
        return self._plot_top_ngrams(3)

In [87]:
class vizzy_token:
    '''Label-frequency summaries for one column of token-level annotations.'''

    def __init__(self, data, column):
        '''Remember the table and the name of the label column to summarise.'''
        self.column = column
        self.data = data

    def show_labels_count(self):
        '''Horizontal bar plot with one bar per distinct label, sized by its count.'''
        tally = Counter(self.data[self.column])
        names = [name for name in tally]
        totals = [tally[name] for name in names]
        return sns.barplot(x=totals, y=names)

    def print_labels_count(self):
        '''Print one "Total number of <label>: <count>" line per distinct label.'''
        tally = Counter(self.data[self.column])
        # Counter keeps first-seen order, so lines appear in order of first occurrence.
        for name, total in tally.items():
            print("Total number of {}: {}".format(name, total))

In [88]:
class vizzy_doc:
    '''Print document-level summary statistics for a DataFrame.'''

    def __init__(self, data, column1, column2=None, column3=None, column4=None, column5=None):
        '''Store the table and the columns whose distinct values are counted.

        Args:
            data: DataFrame with one row per document.
            column1: Name of a column to summarise (always reported).
            column2..column5: Optional further column names; ``None`` entries
                are skipped.
        '''
        self.data = data
        self.column1 = column1
        self.column2 = column2
        self.column3 = column3
        self.column4 = column4
        self.column5 = column5

    def print_doc_stats(self):
        '''Print the number of documents and the distinct-value count per column.'''
        print("Here is your data summary:")
        print("\n")
        # The document count is the number of rows. The largest index label is
        # not: it is off by one for a default index, unrelated for any other
        # index, and undefined for an empty table.
        print("Total number of documents: {}".format(len(self.data)))
        print("Total number of {}: {}".format(str(self.column1), self.data[self.column1].nunique()))
        optional_columns = (self.column2, self.column3, self.column4, self.column5)
        for column in optional_columns:
            if column is not None:
                print("Total number of {}: {}".format(str(column), self.data[column].nunique()))
    
    

In [89]:
import spacy
import pandas as pd

# Build a small token-level table to demonstrate vizzy_token on.
nlp = spacy.load("en_core_web_sm")

text = "SpaCy is a popular library for NLP in Python. Another popular library is NLTK"
doc = nlp(text)

# One record per token: surface form, lemma, part-of-speech tag and entity type.
data = [
    {"Token": token.text, "Lemma": token.lemma_, "POS": token.pos_, "ENT": token.ent_type_}
    for token in doc
]

df = pd.DataFrame(data)

In [90]:
# Summarise the part-of-speech tags with the vizzy_token class defined in this notebook.
viz = vizzy_token(df, 'POS')

In [91]:
# Prints one "Total number of <tag>: <count>" line per distinct POS tag.
viz.print_labels_count()

Total number of PROPN: 4
Total number of AUX: 2
Total number of DET: 2
Total number of ADJ: 2
Total number of NOUN: 2
Total number of ADP: 2
Total number of PUNCT: 1


In [113]:
!pip install vizzy==0.1.1

Collecting vizzy==0.1.1
  Downloading vizzy-0.1.1-py3-none-any.whl (3.8 kB)
Installing collected packages: vizzy
  Attempting uninstall: vizzy
    Found existing installation: vizzy 0.1.0
    Uninstalling vizzy-0.1.0:


      Successfully uninstalled vizzy-0.1.0
Successfully installed vizzy-0.1.1


In [116]:
from vizzy import vizzy

In [121]:
# Same demo, now with vizzy_token from the installed vizzy package; this rebinds
# `viz`, replacing the instance of the notebook-local class created earlier.
viz = vizzy.vizzy_token(df, 'POS')

In [122]:
# NOTE(review): the recorded output below shows lower-cased labels, unlike the
# output of the notebook-local class above, so the packaged 0.1.1 implementation
# evidently differs from the vizzy_token defined in this notebook.
viz.print_labels_count()

Total number of propn: 4
Total number of aux: 2
Total number of det: 2
Total number of adj: 2
Total number of noun: 2
Total number of adp: 2
Total number of punct: 1
