In [1]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Inject CSS so the notebook's `.container` element spans 98% of the page width
# (gives the wide Scattertext plot rendered at the end more room).
display(HTML("<style>.container { width:98% !important; }</style>"))

In [2]:
# Option A (disabled): a full spaCy pipeline. Enable it only if a spaCy model is installed.
# nlp = spacy.load('en')
# Option B (active): Scattertext's regex-based parser, used here so the notebook
# runs even when the spaCy model above cannot be loaded.
nlp = st.whitespace_nlp_with_sentences

In [3]:
# Location of the labelled corpus. The code below reads a `text` column from it;
# later cells also group by a `target` column.
# NOTE(review): this is an absolute local path; point it at your own copy of the CSV.
DATA_PATH = r"D:\UI\ANMEDSOS\Praktikum\KA\data\Visualization Positives-Negatives Duplicate.csv"
df = pd.read_csv(DATA_PATH, header=0, index_col=None, encoding='utf-8')

# Corpus-specific stop words (matching is case-sensitive).
stop_words = ["sekolah","tatap","muka","kuliah","offline","kelas","emng",
              "bangt","aku","kalo","mau","aja","udah","jadi","yanto",
              "pas","sama","ba","on","un","Me","at","to",
              "is","sia","kaya","I","s","sla","dun","po","b","pro"
             ]

# Drop rows without text BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which a notnull() filter can never remove anything.
# .copy() gives an independent frame so later column assignments do not raise
# SettingWithCopyWarning.
df = df[df['text'].notnull()].copy()
df['text'] = df['text'].astype(str)

# Pass 1: remove stop words at regex word boundaries (this also strips them when
# they are glued to punctuation). regex=True is explicit because pandas >= 2.0
# treats the pattern as a literal string by default, which would make this a no-op.
pat = '|'.join(r"\b{}\b".format(re.escape(x)) for x in stop_words)
df['text'] = df['text'].str.replace(pat, '', regex=True)

# Pass 2: drop any remaining whitespace-delimited stop-word tokens and collapse
# the extra spaces left behind by pass 1. A set gives O(1) membership tests.
stop_word_set = set(stop_words)


def f(x):
    """Return `x` with stop-word tokens removed and whitespace normalised."""
    return ' '.join(w for w in x.split() if w not in stop_word_set)


df['text'] = df['text'].apply(f)

# NOTE(review): the original cell also evaluated a removal of every word of 1-4
# characters (pattern r'\b(\w{1,4})\b') but discarded the result, so it never had
# any effect and the outputs recorded in this notebook were produced without it.
# That no-op is dropped here. If short words really should be removed, assign the
# result back to df['text'] (with regex=True) and re-run every cell below.

df_new = df
convention_df = df_new

In [4]:
# Display the first row of convention_df as a quick look at its columns and values.
convention_df.iloc[0]

Unnamed: 0                                                    0
text          sch kata nih tahun ajar baru kalian takut gak ...
target                                                Negatives
Name: 0, dtype: object

In [5]:
# Number of documents per category.
print("Document Count")
print(convention_df.groupby('target')['text'].count())

# Number of whitespace-delimited tokens per category. The original computed this
# value but never showed it, because it was not the cell's last expression.
print("Word Count")
word_counts = (
    convention_df['text'].str.split().str.len()
    .groupby(convention_df['target'])
    .sum()
)
print(word_counts)

# Parse every document with the configured `nlp` callable. assign() returns a new
# frame instead of writing a column onto a possible slice, so this cell raises no
# SettingWithCopyWarning and can be re-run safely.
convention_df = convention_df.assign(parsed=convention_df['text'].apply(nlp))

Document Count
target
Negatives    1565
Positives    2697
Name: text, dtype: int64
Word Count


In [6]:
# Build the Scattertext corpus from the parsed documents: categories come from
# the `target` column, parsed documents from the `parsed` column.
corpus_builder = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='target',
    parsed_col='parsed',
)
corpus = corpus_builder.build()

In [7]:
term_freq_df = corpus.get_term_freq_df()

# Per-term counts in each category (positives as float so the divisions below
# are floating-point divisions).
positives = term_freq_df['Positives freq'].astype(float)
negatives = term_freq_df['Negatives freq']

# Precision: share of a term's occurrences that fall in the Positives category.
term_freq_df['pos_precision'] = positives / (term_freq_df['Positives freq'] + negatives)
# Frequency: share of all Positives term occurrences accounted for by this term.
term_freq_df['pos_freq_pct'] = positives / term_freq_df['Positives freq'].sum()


def _precision_freq_hmean(row):
    """Harmonic mean of a row's precision and frequency share; 0 unless both are > 0."""
    precision = row['pos_precision']
    freq_pct = row['pos_freq_pct']
    if precision > 0 and freq_pct > 0:
        return hmean([precision, freq_pct])
    return 0


term_freq_df['pos_hmean'] = term_freq_df.apply(_precision_freq_hmean, axis=1)

# Ten terms with the highest harmonic mean.
term_freq_df.sort_values(by='pos_hmean', ascending=False).head(10)

Unnamed: 0_level_0,Negatives freq,Positives freq,pos_precision,pos_freq_pct,pos_hmean
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
online,222,377,0.629382,0.007263,0.01436
ken,71,367,0.8379,0.00707,0.014022
ajar,103,345,0.770089,0.006646,0.013179
kangen,26,329,0.926761,0.006338,0.01259
gue,238,269,0.530572,0.005182,0.010264
lebih,86,264,0.754286,0.005086,0.010104
bgt,124,244,0.663043,0.004701,0.009335
rumah,74,211,0.740351,0.004065,0.008085
banget,124,201,0.618462,0.003872,0.007696
anak,310,183,0.371197,0.003525,0.006985


In [8]:
def normcdf(x):
    """Evaluate, at each value of `x`, the CDF of a normal distribution
    parameterised by `x.mean()` and `x.std()`.

    `x` must provide `.mean()` and `.std()` (e.g. a pandas Series).
    """
    centre = x.mean()
    spread = x.std()
    return norm.cdf(x, loc=centre, scale=spread)
# Pass precision and frequency share through normcdf, each using its own
# column's mean and standard deviation.
precision_cdf = normcdf(term_freq_df['pos_precision'])
freq_pct_cdf = normcdf(term_freq_df['pos_freq_pct'])
term_freq_df['pos_precision_normcdf'] = precision_cdf
term_freq_df['pos_freq_pct_normcdf'] = freq_pct_cdf

# Scaled F-score: element-wise harmonic mean of the two CDF-transformed columns.
term_freq_df['pos_scaled_f_score'] = hmean([precision_cdf, freq_pct_cdf])

# Ten terms with the highest scaled F-score.
term_freq_df.sort_values(by='pos_scaled_f_score', ascending=False).head(10)

Unnamed: 0_level_0,Negatives freq,Positives freq,pos_precision,pos_freq_pct,pos_hmean,pos_precision_normcdf,pos_freq_pct_normcdf,pos_scaled_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bisa cepat,0,65,1.0,0.001252,0.002501,0.802887,1.0,0.890668
lebih efekti,0,82,1.0,0.00158,0.003154,0.802887,1.0,0.890668
cepat cabut,0,65,1.0,0.001252,0.002501,0.802887,1.0,0.890668
transfer,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668
efekti,0,82,1.0,0.00158,0.003154,0.802887,1.0,0.890668
laksana spy,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668
spy proses,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668
proses transfer,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668
transfer ilmu,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668
ilmu ajar,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668


In [9]:
# Corner scores for the Positives category, as computed by the corpus.
positive_corner_scores = corpus.get_corner_scores('Positives')
term_freq_df['pos_corner_score'] = positive_corner_scores

# Ten terms with the highest Positives corner score.
term_freq_df.sort_values(by='pos_corner_score', ascending=False).head(10)

Unnamed: 0_level_0,Negatives freq,Positives freq,pos_precision,pos_freq_pct,pos_hmean,pos_precision_normcdf,pos_freq_pct_normcdf,pos_scaled_f_score,pos_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ajar lebih,0,85,1.0,0.001638,0.00327,0.802887,1.0,0.890668,0.902731
transfer ilmu,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668,0.902731
transfer,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668,0.902731
laksana spy,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668,0.902731
spy proses,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668,0.902731
proses transfer,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668,0.902731
ilmu ajar,0,84,1.0,0.001618,0.003231,0.802887,1.0,0.890668,0.902731
ajar yang,0,83,1.0,0.001599,0.003193,0.802887,1.0,0.890668,0.902731
yang ajar,0,83,1.0,0.001599,0.003193,0.802887,1.0,0.890668,0.902731
efekti,0,82,1.0,0.00158,0.003154,0.802887,1.0,0.890668,0.902731


In [10]:
# Corner scores for the Negatives category, as computed by the corpus.
negative_corner_scores = corpus.get_corner_scores('Negatives')
term_freq_df['neg_corner_score'] = negative_corner_scores

# Ten terms with the highest Negatives corner score.
term_freq_df.sort_values(by='neg_corner_score', ascending=False).head(10)

Unnamed: 0_level_0,Negatives freq,Positives freq,pos_precision,pos_freq_pct,pos_hmean,pos_precision_normcdf,pos_freq_pct_normcdf,pos_scaled_f_score,pos_corner_score,neg_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
risiko,283,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
risiko covid,127,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
utama bila,120,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
mulai risiko,120,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
sakit berat,120,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
berat mati,120,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
mati utama,120,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
risiko sakit,120,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.06262,0.93738
bila kom,119,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.062621,0.937379
kom,119,0,0.0,0.0,0.0,0.098911,0.421641,0.160233,0.062621,0.937379


In [11]:
# Start again from a fresh term-frequency table and attach Scattertext's own
# scaled F-scores for both categories.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Negatives Score'] = corpus.get_scaled_f_scores('Negatives')
term_freq_df['Positives Score'] = corpus.get_scaled_f_scores('Positives')

# Rank the terms by each score and keep the first ten index labels.
ranked_by_positives = term_freq_df.sort_values(by='Positives Score', ascending=False)
ranked_by_negatives = term_freq_df.sort_values(by='Negatives Score', ascending=False)
top_positive_terms = list(ranked_by_positives.index[:10])
top_negative_terms = list(ranked_by_negatives.index[:10])

print("Top 10 Positives terms")
pprint(top_positive_terms)
print("Top 10 Negatives terms")
pprint(top_negative_terms)

Top 10 Positives terms
['proses transfer',
 'yang ajar',
 'transfer',
 'ajar yang',
 'spy proses',
 'sezuzurnya',
 'sezuzurnya bisa',
 'bisa cepat',
 'cepat cabut',
 'cabut rumah']
Top 10 Negatives terms
['risiko sakit',
 'berat mati',
 'kom',
 'risiko',
 'mulai risiko',
 'risiko covid',
 'sakit berat',
 'bila kom',
 'mati utama',
 'utama bila']


In [12]:
# Render the interactive Scattertext explorer for Positives vs. Negatives.
# Terms occurring fewer than 5 times are omitted; the document text is passed
# as per-document metadata.
html = produce_scattertext_explorer(corpus,
                                    category='Positives',
                                    category_name='Positives',
                                    not_category_name='Negatives',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.scale,
                                    metadata=convention_df['text'])

# Write the page as UTF-8 bytes. The `with` block guarantees the handle is flushed
# and closed before the IFrame below loads the file (the original left it open).
file_name = 'Conventions2012ScattertextScale.html'
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))

# Embed the generated page in the notebook output.
IFrame(src=file_name, width=1200, height=700)

