In [2]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Inject a CSS rule that sets the notebook's `.container` element to 98% of the
# page width (the rule is marked !important so it overrides the default width).
display(HTML("<style>.container { width:98% !important; }</style>"))

In [3]:
# Parser applied to every document before the corpus is built.
# The spaCy alternative is kept below, commented out; to use it, uncomment that
# line and comment out the scattertext assignment.
# nlp = spacy.load('en')
# Active choice: scattertext's regex-based whitespace/sentence parser.
nlp = st.whitespace_nlp_with_sentences

In [4]:
# Load the labelled documents. Downstream cells read the `text` and `target` columns.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
df = pd.read_csv(r"D:\UI\ANMEDSOS\Praktikum\KA\data\Visualization Relevant-Irrelevant Duplicate.csv", header=0, index_col=None, encoding='utf-8')

# Domain-specific stop words to strip from every document.
stop_words = ["sekolah","tatap","muka","kuliah","offline","kelas","emng",
              "bangt","aku","kalo","mau","aja","udah","jadi","yanto",
              "pas","sama","ba","on","un","Me","at","to",
              "is","sia","kaya","I","s","sla","dun","po","b","pro"
             ]

# Record which rows really have text BEFORE casting: astype(str) turns NaN into
# the literal string 'nan', after which a notnull() filter can never drop anything.
has_text = df['text'].notnull()
df['text'] = df['text'].astype(str)

# Pass 1: remove stop words as whole words, including ones glued to punctuation.
# regex=True is required: since pandas 2.0 the default is a literal match, which
# would silently leave the text unchanged.
pat = '|'.join(r"\b{}\b".format(re.escape(x)) for x in stop_words)
df['text'] = df['text'].str.replace(pat, '', regex=True)

# Pass 2: drop any remaining stop-word tokens and collapse repeated whitespace.
# A set gives O(1) membership tests (the original built one but never used it).
stop_word_set = set(stop_words)
df['text'] = df['text'].apply(
    lambda text: ' '.join(w for w in text.split() if w not in stop_word_set)
)

# NOTE(review): the original also evaluated
#   df.text.str.replace(r'\b(\w{1,4})\b', '')
# and discarded the result, so it never affected the data. It is intentionally
# not applied here, because removing all words of up to four characters would
# change every downstream result.

# Keep only rows that had text in the raw file. The explicit copy lets later
# cells add columns without writing into a slice of `df`.
df_new = df[has_text].copy()
convention_df = df_new

In [5]:
# Display the first row of the cleaned frame to check which columns it carries.
convention_df.iloc[0]

Unnamed: 0                                                    0
text          sch kata nih tahun ajar baru kalian takut gak ...
target                                                 Relevant
Name: 0, dtype: object

In [6]:
# Per-category document counts.
print("Document Count")
print(convention_df.groupby('target')['text'].count())

# Per-category word counts (whitespace-separated tokens). The original computed
# this value but never printed it, so the "Word Count" header had no output.
print("Word Count")
words_per_doc = convention_df['text'].str.split().str.len()
print(words_per_doc.groupby(convention_df['target']).sum())

# Parse every document with the configured `nlp` callable. assign() returns a
# new frame, so this neither writes into a filtered slice of the source frame
# nor depends on how often the cell is re-run.
convention_df = convention_df.assign(parsed=convention_df['text'].apply(nlp))

Document Count
target
Irrelevant    10063
Relevant       5041
Name: text, dtype: int64
Word Count


In [7]:
# Build the scattertext corpus from the pre-parsed documents: categories are
# read from the `target` column and parsed documents from the `parsed` column.
corpus_factory = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='target',
    parsed_col='parsed',
)
corpus = corpus_factory.build()

In [8]:
# Per-term frequency table; the columns used here are 'Relevant freq' and 'Irrelevant freq'.
term_freq_df = corpus.get_term_freq_df()
relevant_counts = term_freq_df['Relevant freq']
irrelevant_counts = term_freq_df['Irrelevant freq']

# Precision: share of a term's occurrences that fall in the Relevant category.
term_freq_df['rel_precision'] = relevant_counts / (relevant_counts + irrelevant_counts)
# Frequency: the term's share of all Relevant term occurrences.
term_freq_df['rel_freq_pct'] = relevant_counts / relevant_counts.sum()


def _precision_freq_hmean(row):
    """Harmonic mean of a row's precision and frequency; 0 unless both are positive."""
    precision = row['rel_precision']
    freq_pct = row['rel_freq_pct']
    if precision > 0 and freq_pct > 0:
        return hmean([precision, freq_pct])
    return 0


term_freq_df['rel_hmean'] = term_freq_df.apply(_precision_freq_hmean, axis=1)
# Ten terms with the highest harmonic mean.
term_freq_df.sort_values(by='rel_hmean', ascending=False).head(10)

Unnamed: 0_level_0,Relevant freq,Irrelevant freq,rel_precision,rel_freq_pct,rel_hmean
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
online,906,1240,0.422181,0.008802,0.017245
gue,593,777,0.432847,0.005761,0.011371
anak,545,672,0.447823,0.005295,0.010466
ken,521,442,0.541018,0.005062,0.01003
ajar,510,650,0.439655,0.004955,0.0098
bgt,408,494,0.452328,0.003964,0.007859
mulai,406,481,0.457723,0.003945,0.007822
lebih,385,405,0.487342,0.003741,0.007424
kangen,379,325,0.538352,0.003682,0.007314
banget,371,419,0.46962,0.003605,0.007154


In [9]:
def normcdf(x):
    """Evaluate a normal CDF at each value of `x`.

    The normal distribution is parameterised with `x`'s own mean and standard
    deviation, as returned by `x.mean()` and `x.std()`.
    """
    center = x.mean()
    spread = x.std()
    return norm.cdf(x, loc=center, scale=spread)
# Rescale precision and frequency through the normal CDF (see `normcdf`).
precision_cdf = normcdf(term_freq_df['rel_precision'])
freq_pct_cdf = normcdf(term_freq_df['rel_freq_pct'])
term_freq_df['rel_precision_normcdf'] = precision_cdf
term_freq_df['rel_freq_pct_normcdf'] = freq_pct_cdf

# Harmonic mean of the two rescaled columns, one value per term.
rescaled_pair = [term_freq_df['rel_precision_normcdf'], term_freq_df['rel_freq_pct_normcdf']]
term_freq_df['rel_scaled_f_score'] = hmean(rescaled_pair)

# Ten terms with the highest scaled F-score.
term_freq_df.sort_values(by='rel_scaled_f_score', ascending=False).head(10)

Unnamed: 0_level_0,Relevant freq,Irrelevant freq,rel_precision,rel_freq_pct,rel_hmean,rel_precision_normcdf,rel_freq_pct_normcdf,rel_scaled_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mulai risiko,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355
bila kom,119,0,1.0,0.001156,0.00231,0.921885,1.0,0.959355
mati utama,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355
berat mati,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355
sakit berat,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355
risiko sakit,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355
laksana spy,84,0,1.0,0.000816,0.001631,0.921885,1.0,0.959355
lebih efekti,82,0,1.0,0.000797,0.001592,0.921885,1.0,0.959355
spy proses,84,0,1.0,0.000816,0.001631,0.921885,1.0,0.959355
risiko covid,127,0,1.0,0.001234,0.002465,0.921885,1.0,0.959355


In [10]:
# Store the corpus's corner scores for the 'Relevant' category and show the
# ten terms that score highest.
relevant_corner = corpus.get_corner_scores('Relevant')
term_freq_df['rel_corner_score'] = relevant_corner
term_freq_df.sort_values(by='rel_corner_score', ascending=False).head(10)

Unnamed: 0_level_0,Relevant freq,Irrelevant freq,rel_precision,rel_freq_pct,rel_hmean,rel_precision_normcdf,rel_freq_pct_normcdf,rel_scaled_f_score,rel_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
risiko covid,127,0,1.0,0.001234,0.002465,0.921885,1.0,0.959355,0.94604
risiko sakit,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355,0.94604
mulai risiko,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355,0.94604
sakit berat,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355,0.94604
berat mati,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355,0.94604
mati utama,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355,0.94604
utama bila,120,0,1.0,0.001166,0.002329,0.921885,1.0,0.959355,0.94604
bila kom,119,0,1.0,0.001156,0.00231,0.921885,1.0,0.959355,0.94604
ilmu ajar,84,0,1.0,0.000816,0.001631,0.921885,1.0,0.959355,0.946038
spy proses,84,0,1.0,0.000816,0.001631,0.921885,1.0,0.959355,0.946038


In [11]:
# Store the corpus's corner scores for the 'Irrelevant' category and show the
# ten terms that score highest.
irrelevant_corner = corpus.get_corner_scores('Irrelevant')
term_freq_df['irr_corner_score'] = irrelevant_corner
term_freq_df.sort_values(by='irr_corner_score', ascending=False).head(10)

Unnamed: 0_level_0,Relevant freq,Irrelevant freq,rel_precision,rel_freq_pct,rel_hmean,rel_precision_normcdf,rel_freq_pct_normcdf,rel_scaled_f_score,rel_corner_score,irr_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
kubur,0,1641,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.100523,0.899477
dulunya,0,1641,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.100523,0.899477
kubur dulunya,0,1640,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.100523,0.899477
tugas tetep,0,81,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.100529,0.899471
tetep banyak,0,81,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.100529,0.899471
seru meski,0,79,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.10053,0.89947
meski tugas,0,78,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.10053,0.89947
online inget,0,78,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.10053,0.89947
inget nyata,0,78,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.10053,0.89947
nyata seru,0,78,0.0,0.0,0.0,0.21111,0.449898,0.287373,0.10053,0.89947


In [12]:
# Start again from a fresh frequency table and attach scattertext's own scaled
# F-scores for both categories.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Irrelevant Score'] = corpus.get_scaled_f_scores('Irrelevant')
term_freq_df['Relevant Score'] = corpus.get_scaled_f_scores('Relevant')


def _top_terms(score_column, how_many=10):
    """Return the `how_many` terms with the highest value in `score_column`."""
    ranked = term_freq_df.sort_values(by=score_column, ascending=False)
    return list(ranked.index[:how_many])


print("Top 10 Relevant terms")
pprint(_top_terms('Relevant Score'))
print("Top 10 Irrelevant terms")
pprint(_top_terms('Irrelevant Score'))

Top 10 Relevant terms
['risiko covid',
 'lebih efekti',
 'risiko sakit',
 'sakit berat',
 'berat mati',
 'mati utama',
 'utama bila',
 'efekti',
 'bila kom',
 'laksana spy']
Top 10 Irrelevant terms
['dulunya',
 'kubur',
 'kubur dulunya',
 'tugas tetep',
 'tetep banyak',
 'seru meski',
 'online inget',
 'inget nyata',
 'nyata seru',
 'meski tugas']


In [13]:
# Render the interactive scattertext explorer for Relevant vs. Irrelevant terms.
# Each document's text is passed as its metadata.
html = produce_scattertext_explorer(corpus,
                                    category='Relevant',
                                    category_name='Relevant',
                                    not_category_name='Irrelevant',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.scale,
                                    metadata=convention_df['text'])
file_name = 'Conventions2012ScattertextScale.html'
# Write through a context manager so the file is flushed and closed before the
# IFrame below loads it (the original left the handle open).
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)



In [None]:
# NOTE(review): this unexecuted cell repeats the stop-word cleaning that is
# performed right after the CSV is loaded; consider deleting it.
stop_words = ["sekolah","tatap","muka","kuliah","offline","kelas","emng",
              "bangt","aku","kalo","mau","aja","udah","jadi","yanto",
              "pas","sama","ba","on","un","Me","at","to",
              "is","sia","kaya","I","s","sla","dun","po","b","pro"
             ]

df['text'] = df['text'].astype(str)

# Remove stop words as whole words. regex=True is required: since pandas 2.0
# the default is a literal match, which would leave the text unchanged.
pat = '|'.join(r"\b{}\b".format(re.escape(x)) for x in stop_words)
df['text'] = df['text'].str.replace(pat, '', regex=True)

# Drop any remaining stop-word tokens and collapse repeated whitespace. The set
# gives O(1) membership tests (the original built one but never used it).
stop_word_set = set(stop_words)
df['text'] = df['text'].apply(
    lambda text: ' '.join(w for w in text.split() if w not in stop_word_set)
)