In [1]:
import pandas as pd
import re

In [2]:
#pip install pycld2

In [3]:
# Define a function to clean the text
def clean(text):
    """Normalise a raw comment into lowercase, letters-only text.

    URLs, stand-alone ``RT``/``cc`` markers, hashtags and mentions are removed
    first, while the characters that identify them (``://``, ``#``, ``@`` and
    the original letter case) are still present. Afterwards every run of
    characters that is not an ASCII letter is replaced by a single space.

    Args:
        text: The raw comment. Any non-string value (for example the NaN that
            pandas uses for a missing cell) is treated as an empty comment.

    Returns:
        The cleaned string. It may begin or end with one space where a
        removed token or punctuation mark used to be.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+', ' ', text)  # remove URLs
    # Word boundaries keep ordinary words such as "account" intact.
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)  # remove RT and cc
    text = re.sub(r'#\S+', ' ', text)  # remove hashtags
    text = re.sub(r'@\S+', ' ', text)  # remove mentions
    text = text.lower()
    # Replacing each run of non-letters with one space also removes digits,
    # punctuation and non-ASCII characters, and collapses repeated whitespace.
    text = re.sub(r'[^a-z]+', ' ', text)
    return text

In [4]:
import pycld2 as cld2
def langdetect(text):
    """Return the pycld2 detection result for ``text`` as a string.

    The complete value returned by ``cld2.detect(text, returnVectors=True)``
    is converted with ``str`` so that callers can search it for a language
    name.
    """
    detection = cld2.detect(text, returnVectors=True)
    return str(detection)

In [5]:
import nltk
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
#nltk.download('stopwords')
from nltk.corpus import stopwords
#nltk.download('wordnet')
from nltk.corpus import wordnet
import nltk
#nltk.download('averaged_perceptron_tagger')

In [6]:
# POS tagger dictionary: first letter of the tag from pos_tag -> WordNet POS constant
pos_dict = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}


def token_stop_pos(text):
    """Tokenise ``text``, drop English stop words and attach a WordNet POS.

    Args:
        text: The string to tokenise and tag.

    Returns:
        A list of ``(word, pos)`` tuples for every token that is not an
        English stop word. ``pos`` is the ``pos_dict`` entry for the first
        letter of the token's tag, or ``None`` when that letter is not in
        ``pos_dict``.
    """
    tags = pos_tag(word_tokenize(text))
    # Build the stop-word set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in tags
        if word.lower() not in stop_words
    ]

In [7]:
#import Lemmatizer
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    """Join the lemmas of ``(word, pos)`` pairs into one string.

    Words whose ``pos`` is falsy are kept unchanged; the others are passed to
    ``wordnet_lemmatizer.lemmatize`` with that ``pos``. The result starts with
    one space and every lemma is preceded by a further space, so an empty
    input yields ``" "``.
    """
    pieces = [" "]
    for token, wn_pos in pos_data:
        if wn_pos:
            token = wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
        pieces.append(" " + token)
    return "".join(pieces)

In [8]:
#Sentiment analysis using TextBlob
from textblob import TextBlob
# function to calculate subjectivity
def getSubjectivity(review):
    """Return the TextBlob subjectivity score of ``review``."""
    sentiment = TextBlob(review).sentiment
    return sentiment.subjectivity
# function to calculate polarity
def getPolarity(review):
    """Return the TextBlob polarity score of ``review``."""
    sentiment = TextBlob(review).sentiment
    return sentiment.polarity

# function to analyze the reviews
def analysis(score):
    """Translate a polarity score into a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for a score equal to
    zero and 'Positive' for everything else.
    """
    # The comparison order is significant: any value that is neither below
    # nor equal to zero falls through to 'Positive'.
    return 'Negative' if score < 0 else ('Neutral' if score == 0 else 'Positive')

In [9]:
def bar_graph(df):
    """Plot how often each value of the 'Text_Blob' column occurs as a bar chart.

    Returns whatever ``plot.show()`` returns.
    """
    label_counts = df['Text_Blob'].value_counts()
    label_counts.plot.bar(colormap='Pastel1')
    return plot.show()

In [10]:
import matplotlib.pyplot as plot
from wordcloud import WordCloud

def cloud_per(df):
    """Draw a word cloud for each sentiment label present in ``df``.

    Args:
        df: DataFrame with a 'Text_Blob' column holding the labels
            'Positive', 'Neutral' or 'Negative' and a 'text' column holding
            the comment strings.

    Returns:
        Whatever ``plot.show()`` returns.

    A label without any text is skipped: ``WordCloud.generate`` raises
    ``ValueError`` when it is given no words, which would otherwise abort the
    whole figure as soon as one sentiment class is empty.
    """
    fig = plot.figure(figsize=(30, 25))
    for i, sentiment in enumerate(['Positive', 'Neutral', 'Negative']):
        compound_text_dif = " ".join(df.loc[df['Text_Blob'] == sentiment, 'text'])
        if not compound_text_dif.strip():
            # Nothing to draw for this label; its grid slot stays empty.
            continue
        compound_wc_dif = WordCloud(background_color="white", max_words=100, width=2000, height=600).generate(compound_text_dif)
        fig.add_subplot(3, 3, i+1)
        plot.imshow(compound_wc_dif, interpolation='bilinear')
        # Label the panel so the clouds can be told apart when some are skipped.
        plot.title(sentiment, fontsize=24)
        plot.axis("off")
    return plot.show()

In [11]:
import warnings
warnings.filterwarnings('ignore')
import ipywidgets as widgets
import io
from ipywidgets import Layout
from IPython.display import display, clear_output

In [12]:
#!pip install ipyupload
#!jupyter nbextension enable --py widgetsnbextension --sys-prefix
#!jupyter serverextension enable voila --sys-prefix

In [13]:
# Image Widget

# Read the logo inside a context manager so the file handle is closed as soon
# as its bytes are in memory.
with open("slu.png", "rb") as image_file:
    image = image_file.read()

image_headline = widgets.Image(
                    value=image,
                    format='png',
                    width='100'
                )


vbox_headline = widgets.VBox([image_headline])

In [14]:
# Upload control: accepts '.csv' files, one file at a time
file = widgets.FileUpload(accept='.csv', multiple=False)

In [15]:
# Language selector read by the chart callbacks through lang.value
lang = widgets.ToggleButtons(options=['English', 'Tagalog'])

In [16]:
# button send
from IPython.display import HTML

# Button that starts the classification
button_send = widgets.Button(description='Classify', tooltip='Classify', style={'description_width': 'initial'})

# Output areas: cleaned-comment table, bar chart and word clouds
output, bar, cloud = widgets.Output(), widgets.Output(), widgets.Output()

def _classify_language(df, language):
    """Run POS tagging, lemmatisation and TextBlob scoring for one language.

    Selects the rows of ``df`` whose 'Lang_Detect' string contains
    ``language`` (case-insensitive) and returns a new DataFrame with the
    columns 'text', 'POS tagged', 'Lemma', 'Text_Blob Polarity' and
    'Text_Blob'.
    """
    matches = df.loc[df['Lang_Detect'].str.contains(language, case=False)]
    classified = pd.DataFrame({'text': matches['Cleaned_Text']})
    classified['POS tagged'] = classified['text'].apply(token_stop_pos)
    classified['Lemma'] = classified['POS tagged'].apply(lemmatize)
    classified['Text_Blob Polarity'] = classified['Lemma'].apply(getPolarity)
    classified['Text_Blob'] = classified['Text_Blob Polarity'].apply(analysis)
    return classified


def on_button_clicked(event):
    """Classify the uploaded CSV when the 'Classify' button is pressed.

    Reads the first uploaded file as a single-column CSV, cleans every
    comment, shows the cleaned comments in ``output``, detects the language
    of each comment and stores the per-language results in the module-level
    names ``tgl_classified`` and ``eng_classified``.

    Args:
        event: The object passed by the button's click handler (unused).
    """
    global tgl_classified
    global eng_classified
    with output:
        output.clear_output()
        uploads = file.value
        # ipywidgets 7 exposes uploads as a {filename: info} dict, ipywidgets 8
        # as a tuple of info dicts; normalise both to a sequence.
        if isinstance(uploads, dict):
            uploads = list(uploads.values())
        if not uploads:
            # Nothing uploaded yet: tell the user instead of raising IndexError.
            print('Please upload a CSV file before classifying.')
            return
        # bytes() accepts both a bytes object and a memoryview.
        content = bytes(uploads[0]['content'])
        comments = io.StringIO(content.decode('utf-8'))
        df = pd.read_csv(comments)
        df.columns = ['Text']
        df['Cleaned_Text'] = df['Text'].apply(clean)
        df = df.drop(columns='Text')
        display(df.style.set_properties(**{'background-color': 'lightblue',
                                    'color': 'black',
                                    'border-color': 'black'}))
        df['Lang_Detect'] = df['Cleaned_Text'].apply(langdetect)
        tgl_classified = _classify_language(df, 'tagalog')
        eng_classified = _classify_language(df, 'english')
def changed(change):
    """Redraw the word clouds in ``cloud`` for the selected language.

    Args:
        change: The change notification passed by ``lang.observe`` (unused;
            the current selection is read from ``lang.value``).
    """
    with cloud:
        cloud.clear_output()
        name = 'eng_classified' if lang.value == 'English' else 'tgl_classified'
        # The result frames only exist after a classification has run, so look
        # them up defensively instead of raising NameError.
        classified = globals().get(name)
        if classified is None or classified.empty:
            print('No classified %s comments to show. Upload a file and press Classify first.' % lang.value)
            return
        cloud_per(classified)
            

def changes(change):
    """Redraw the sentiment bar chart in ``bar`` for the selected language.

    Args:
        change: The change notification passed by ``lang.observe`` (unused;
            the current selection is read from ``lang.value``).
    """
    with bar:
        bar.clear_output()
        name = 'eng_classified' if lang.value == 'English' else 'tgl_classified'
        # The result frames only exist after a classification has run, so look
        # them up defensively instead of raising NameError.
        classified = globals().get(name)
        if classified is None or classified.empty:
            print('No classified %s comments to show. Upload a file and press Classify first.' % lang.value)
            return
        bar_graph(classified)
            
# Connect the callbacks: the button runs the classification, a change of the
# language selection redraws the word clouds and then the bar chart.
button_send.on_click(on_button_clicked)
lang.observe(changed, 'value')
lang.observe(changes, 'value')

# Containers for the three output areas
vbox_result = widgets.VBox([button_send, output])
vbox_result2 = widgets.VBox([cloud])
vbox_result3 = widgets.VBox([bar])

In [17]:
%%html
<style>
/* Background of the inner page container (added with page.add_class("ins_bg")). */
.ins_bg{
    background-color:#9CC3D5FF;
}
/* Outer wrapper (added with hBox.add_class("box_style")). */
.box_style{
    background-color:#0063B2FF;
    margin: 0px;
    width: 100%;
    height: 100%;
}
</style>

In [18]:
# Headings shown between the controls of the main column
text_0 = widgets.HTML(value="<h1>Welcome to SAT comment classifier</h1>")
text_1 = widgets.HTML(value="<h2>Upload comments here </h2>")
text_2 = widgets.HTML(value="<h2>Press button to classify </h2>")
text_3 = widgets.HTML(value="<h2>Comment Languages </h2>")
text_4 = widgets.HTML(value="<h2>Analysis</h2>")

# Top-to-bottom order of the column: each heading followed by its control
vbox_text = widgets.VBox([
    text_0,
    text_1, file,
    text_2, vbox_result,
    text_3, lang, vbox_result2,
    text_4, vbox_result3,
])

In [19]:
# Centred column layout for the page content
box_layout = widgets.Layout(
    display='flex',
    flex_flow='column',
    align_items='center',
    justify_content='center',
    max_width="70%",
    max_height="80%",
)

# Logo on top, controls and results below
page = widgets.VBox([vbox_headline, vbox_text], layout=box_layout)
page.add_class("ins_bg")

# Outer box that centres the page horizontally
hBox = widgets.HBox(
    [page],
    layout=Layout(display='flex', justify_content='center', width='auto', height='auto'),
)
hBox.add_class("box_style")
hBox.layout.width = 'auto'
display(hBox)

HBox(children=(VBox(children=(VBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\xfa…