In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from wordcloud import WordCloud
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt


def get_freq_words(df, label, word_no=50, adj=False):
    '''
    Signature:  get_freq_words(df, label, word_no=50, adj=False)
    Docstring:  Return the most frequent words found in the reviews of one
                label as a list of (word, count) tuples, most common first.
    Parameter:  df: pandas dataframe with a 'label' and a 'review' column
                label: value matched against df['label'] to select rows
                word_no: int, total number of most used words to return
                adj: boolean, when True only tokens whose pos_ is 'ADJ'
                     are counted
    '''
    # concat the review texts of the requested label as one lower-cased string
    reviews = df.loc[df['label'] == label, 'review']
    txt = reviews.str.cat(sep=' ').lower()

    # tokenize with the `nlp` pipeline, which must be defined elsewhere
    # NOTE(review): `nlp` looks like a spaCy pipeline (is_stop / is_punct /
    # pos_ are read below) -- confirm; very long concatenated texts may hit
    # the pipeline's length limit.
    doc = nlp(txt)

    # keep real words only: drop stop-words, punctuation and whitespace-only
    # tokens (runs of spaces/newlines would otherwise be counted as words);
    # when `adj` is set, additionally keep adjectives only
    words = [
        token.text for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
        and (not adj or token.pos_ == 'ADJ')
    ]

    # calculate word frequency and keep the `word_no` most common entries
    return Counter(words).most_common(word_no)


def plot_wcloud(lst, spt1, spt2):
    '''
    Signature:  plot_wcloud(lst, spt1, spt2)
    Docstring:  Show a plotly figure with two stacked word cloud images,
                one per entry of lst. Nothing is returned.
    Parameter:  lst: list holding, for each subplot, a list of
                     (word, count) tuples.
                spt1: string, inserted into the first subplot title.
                spt2: string, inserted into the second subplot title.
    '''
    # two rows, one column; the titles always read "Top 50 ... Words"
    titles = (f'Top 50 {spt1} Words', f'Top 50 {spt2} Words')
    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.1,
                        subplot_titles=titles)

    # one word cloud per entry, placed on consecutive rows (rows are 1-based)
    for row, pairs in enumerate(lst, start=1):
        cloud = WordCloud(background_color="white")
        cloud.generate_from_frequencies(dict(pairs))
        # the WordCloud object itself is handed to plotly as the image data
        fig.add_trace(go.Image(z=cloud), row=row, col=1)

    # subplot titles are stored as layout annotations; enlarge their font
    for title in fig['layout']['annotations']:
        title['font'] = {'size': 24}

    # fixed canvas size, no hover, and no tick labels around the images
    fig.update_layout(width=550, height=700, hovermode=False, autosize=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    fig.show(scale=10)
    

def eval_model(label, pred, df=None):
    '''
    Signature:  eval_model(label, pred, df=None)
    Docstring:  Display the classification report as a dataframe and show a
                plotly heatmap of the confusion matrix normalized over the
                actual labels. Nothing is returned.
    Parameter:  label: list/array, the label column (actual values)
                pred: list/array, model prediction
                df: unused; kept so existing calls keep working
    '''
    # imported locally so the function does not depend on `ff` having been
    # imported somewhere else in the notebook
    import plotly.figure_factory as ff

    # show the classification report, one row per class/average
    # (relies on `display` being available, as it is in a Jupyter session)
    report = classification_report(label, pred, output_dict=True)
    display(pd.DataFrame.from_dict(report).T)

    # confusion matrix as row-wise fractions, rounded for the cell text
    z = np.round(confusion_matrix(label, pred, normalize='true'), 3)
    # axis labels are fixed for a two-class (neg/pos) problem
    x = ['neg pred', 'pos pred']
    y = ['neg actual', 'pos actual']

    # set up figure
    fig = ff.create_annotated_heatmap(z, x=x, y=y, colorscale='Blues')

    # set font size of the z values written in the cells
    for cell_text in fig.layout.annotations:
        cell_text.font.size = 16

    fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                      height=500, width=500)

    # move xaxis label to bottom
    fig.layout.xaxis.update(side='bottom')

    # add custom xaxis title, positioned in paper coordinates below the plot
    fig.add_annotation(dict(font=dict(color="black", size=16),
                            x=0.5,
                            y=-.15,
                            showarrow=False,
                            text="Predicted value",
                            xref="paper",
                            yref="paper"))

    # add custom yaxis title, rotated and placed left of the plot
    fig.add_annotation(dict(font=dict(color="black", size=16),
                            x=-0.15,
                            y=0.5,
                            showarrow=False,
                            text="Actual value",
                            textangle=-90,
                            xref="paper",
                            yref="paper"))

    fig.show()
    

def rem_stpwrd(txt):
    '''
    Signature:   rem_stpwrd(txt)
    Docstring:   Return txt lower-cased with stop words and punct removed.
                 Every kept token is followed by a single space, so a
                 non-empty result ends with a trailing space.
    Parameter:   txt: str
    '''
    # tokenize with the `nlp` pipeline, which must be defined elsewhere
    doc = nlp(txt.lower())

    # assemble the result with one join instead of repeated string `+=`
    return ''.join(
        token.text + ' ' for token in doc
        if not (token.is_stop or token.is_punct)
    )