In [1]:
import sys
sys.path.append('../helper/')

import pandas as pd
import nltk
import collections
# from dataset_reader import read_tsv
from helper.dataset_reader import read_tsv

In [2]:
# Load the annotated test set; `read_tsv` is imported from the project's helper package.
data = read_tsv('../dataset/test-data-140422.tsv')

In [3]:
# `data` holds three values, unpacked here:
#   dt        - per-tweet pairs of [tokens, tags]
#   all_words - every token in the dataset
#   all_tags  - the tag of every token (paired one-to-one with all_words further down)
# NOTE: `all_tags` is re-assigned to a per-tweet list of tag lists later in this notebook.
dt, all_words, all_tags = data

In [12]:
# Peek at the first few tweets only; each entry is a [tokens, tags] pair.
# Slicing keeps the cell output short instead of dumping the whole dataset.
data[0][:3]

[[['Niat',
   'ngetest',
   'arus',
   ',',
   'sekarang',
   'malah',
   'bingung',
   'sendiri',
   '.'],
  ['ID', 'MIX-ID-EN', 'ID', 'O', 'ID', 'ID', 'ID', 'ID', 'O']],
 [['Kamvrettt',
   'nyoba',
   '⏰',
   'pake',
   'suara',
   'jeno',
   'malah',
   'meleyoutt',
   '🤣',
   'yg',
   'ada',
   'besok',
   'pagi',
   'bangun',
   'tidur',
   'cengar',
   'cengir',
   'mana',
   'pas',
   'ngetest',
   'full',
   'volume',
   'lagi',
   '😭'],
  ['ID',
   'ID',
   'O',
   'ID',
   'ID',
   'O',
   'ID',
   'ID',
   'O',
   'ID',
   'ID',
   'ID',
   'ID',
   'ID',
   'ID',
   'ID',
   'ID',
   'ID',
   'ID',
   'MIX-ID-EN',
   'EN',
   'ID',
   'ID',
   'O']],
 [['@nezuokochan',
   'Makanyaaa',
   'kan',
   'hhh',
   'tadi',
   'ngetest',
   'main',
   'udah',
   'ngalah',
   'jadi',
   'tank',
   'tapi',
   'sidenya',
   'malah',
   'ngefeed',
   'dan',
   'kalah',
   'pas',
   'ngecel',
   'ealaahhh',
   'jauh',
   'banget',
   'matchmaking',
   'nya',
   'sama',
   'musuh',
   '😭'

In [None]:
# Tweet-level table: one row per tweet, holding its token list and its tag list
df_tweet_tags = pd.DataFrame(data=dt, columns=['Tweets', 'Tags'])
df_tweet_tags.head(5)

In [5]:
# Mean number of tokens per tweet

all_tweets = df_tweet_tags['Tweets']
# each entry in the 'Tweets' column is a tweet's token list, so len() is its token count
sent_length = sum(len(tweet) for tweet in all_tweets)

avg_sent_length = sent_length / len(all_tweets)
print('Average tweet length: ', round(avg_sent_length, 2), 'tokens')

Average tweet length:  24.07 tokens


In [6]:
from collections import Counter

# distinct tags in the dataset, in order of first appearance
list(Counter(all_tags).keys())

['ID', 'MIX-ID-EN', 'O', 'EN', 'MIX-JV-EN', 'JV', 'MIX-ID-JV']

In [4]:
# Pair every token with its tag -> list of (token, tag) tuples.
# zip() silently stops at the shorter input, so compare the lengths first: a mismatch
# means the two sequences no longer describe the same tokens (for example because
# `all_tags` was re-assigned by another cell that ran earlier in this kernel session).
if len(all_words) != len(all_tags):
    raise ValueError(
        'all_words ({}) and all_tags ({}) differ in length'.format(
            len(all_words), len(all_tags)
        )
    )
word_tag = list(zip(all_words, all_tags))



In [8]:
# Token-level table: one row per (token, label) pair
df_wordtag = pd.DataFrame(data=word_tag, columns=['Token', 'Label'])
df_wordtag

Unnamed: 0,Token,Label
0,Niat,ID
1,ngetest,MIX-ID-EN
2,arus,ID
3,",",O
4,sekarang,ID
...,...,...
24302,apalagi,ID
24303,ini,ID
24304,video,EN
24305,call,EN


In [9]:
# total number of (token, tag) pairs in the dataset
len(word_tag)

24307

In [10]:
# Mean number of characters per token.
all_tokens = df_wordtag['Token']
token_length = sum(len(token) for token in all_tokens)

# Divide by the number of tokens that were actually summed. `word_tag` is a separate
# object and can get out of step with the dataframe when cells are re-run.
avg_token_length = token_length / len(all_tokens)
print('Average token length: ', round(avg_token_length, 2), 'characters')

Average token length:  4.6 characters


In [11]:
# Dataset info: corpus-level size statistics

num_tweets = len(dt)
num_token = df_wordtag.shape[0]
num_of_unique_token = df_wordtag['Token'].nunique()

# print each statistic as "<caption> <value>"
for caption, value in (
    ('Number of tweets: ', num_tweets),
    ('Number of tokens: ', num_token),
    ('Number of unique tokens: ', num_of_unique_token),
):
    print(caption, value)

Number of tweets:  1010
Number of tokens:  24307
Number of unique tokens:  7641


In [12]:
# How many tokens carry each label, most frequent first
print('Number of data per label')
df_wordtag['Label'].value_counts()

Number of data per label


ID           11139
O             5476
JV            4906
EN            1746
MIX-ID-EN      578
MIX-JV-EN      258
MIX-ID-JV      204
Name: Label, dtype: int64

In [13]:
import numpy as np
from helper.metrics import code_mix_index

# Tags passed to code_mix_index as the language tags (per the original note, NE and O
# tags are excluded).
cmi_lang_tags = ['ID', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']

# One CMI score per tweet; only the tag sequence of each [tokens, tags] pair is needed.
cmi_all = np.array([code_mix_index(tags, cmi_lang_tags) for _, tags in data[0]])

# Compute CMI at the corpus level by averaging the per-tweet values (as a percentage).
#   cmi       : all tweets, including those with a score of 0
#   cmi_mixed : only tweets with mixed language, i.e. a score above 0
cmi = np.average(cmi_all) * 100
cmi_nonzero = cmi_all[cmi_all > 0]
# An empty selection has no meaningful average; report nan explicitly in that case.
cmi_mixed = np.average(cmi_nonzero) * 100 if cmi_nonzero.size else float('nan')

print('CMI: ', round(cmi, 2))
print('CMI Mixed: ', round(cmi_mixed, 2))

CMI:  25.16
CMI Mixed:  26.25


In [14]:
# Split the per-tweet [tokens, tags] pairs into two parallel lists of lists.
# NOTE: this re-assigns `all_tokens` and `all_tags`, which earlier cells bound to
# token-level data; from here on both names hold one list per tweet.
all_tokens = [tokens for tokens, _ in data[0]]
all_tags = [tags for _, tags in data[0]]

In [15]:
# Collect every tag trigram (3 consecutive tags within one tweet) across the dataset.
# Tag sequences are read straight from `dt` (the per-tweet [tokens, tags] pairs), so the
# result does not depend on which value `all_tags` currently holds in the kernel.
all_ngrams = []
for _, taglist in dt:
    all_ngrams.extend(nltk.ngrams(taglist, 3))

In [16]:
# Frequency of each tag trigram over all tweets.
# Tag sequences come from `dt` (the per-tweet [tokens, tags] pairs), so this cell gives
# the same result regardless of which value `all_tags` currently holds in the kernel.
counts = collections.Counter()
for _, taglist in dt:
    counts.update(nltk.ngrams(taglist, 3))

In [17]:
# the 30 most frequent tag trigrams, as (trigram, count) pairs, most common first
mocom = counts.most_common(30)

In [18]:
# tabulate the most common trigrams: one row per trigram with its frequency
tag_transition_list = pd.DataFrame(mocom, columns=['Transition','Frequency'])

In [19]:
# display the tag-transition frequency table
tag_transition_list

Unnamed: 0,Transition,Frequency
0,"(ID, ID, ID)",5294
1,"(JV, JV, JV)",2041
2,"(O, ID, ID)",1413
3,"(ID, ID, O)",1380
4,"(ID, O, ID)",1017
5,"(O, JV, JV)",614
6,"(O, O, O)",609
7,"(JV, JV, O)",601
8,"(ID, O, O)",397
9,"(JV, O, JV)",386
