In [2]:
import sys
sys.path.insert(0, 'helper')
import pandas as pd
from dataset_reader import read_tsv
import nltk
import collections

In [3]:
# Load the tagged corpus with the project helper `read_tsv`.
# The result is a 3-element sequence that a later cell unpacks into
# (dt, all_words, all_tags).
data = read_tsv('raw dataset/all-tagged-280322-v2.tsv')
# Alternative input kept for reference: read_tsv('../dataset/new-tagged-1000.tsv')

In [4]:
# `data` holds three values; unpack them into separate names:
#   dt        - the per-tweet records (iterated elsewhere as (tokens, tags) pairs)
#   all_words - all words from the data
#   all_tags  - all tags from the data (zipped one-to-one with all_words further down)
dt, all_words, all_tags = data

In [5]:
# Build a two-column frame [Tweets | Tags] from the per-tweet records.
df_tweet_tags = pd.DataFrame(dt, columns=['Tweets','Tags'])

# A notebook cell only renders its final expression, so the bare
# `df_tweet_tags.head()` that used to sit here was evaluated and thrown away.
# It has been dropped; the cell reports the number of rows, as before.
len(df_tweet_tags)

3223

In [6]:
# Mean tweet length: total of len() over every entry in the 'Tweets' column,
# divided by the number of tweets.
all_tweets = df_tweet_tags['Tweets']
sent_length = sum(map(len, all_tweets))
avg_sent_length = sent_length / len(all_tweets)

print('Average tweet length: ', round(avg_sent_length, 2), 'tokens')

Average tweet length:  22.39 tokens


In [7]:
from collections import Counter

# Distinct tags in the dataset, in the order they are first encountered
# (iterating a Counter yields its keys in insertion order).
list(Counter(all_tags))

['ID', 'O', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']

In [8]:
# Pair each word with the tag at the same position, giving a list of
# (word, tag) tuples. zip() stops at the shorter input, so a length mismatch
# between all_words and all_tags would be truncated silently.
word_tag = list(zip(all_words, all_tags))
# Debug aid (left disabled): print(word_tag)

In [9]:
# Turn the (word, tag) tuples into a two-column frame: 'Token' and 'Label'.
df_wordtag = pd.DataFrame(word_tag, columns=['Token','Label'])
# Bare last expression so the notebook renders the frame.
df_wordtag

Unnamed: 0,Token,Label
0,Tumben,ID
1,xl,O
2,banter,ID
3,go,JV
4,download,EN
...,...,...
72164,tiba2,ID
72165,udah,ID
72166,pagi,ID
72167,wkwkwkkw,O


In [10]:
# Total number of (word, tag) pairs, i.e. tokens in the corpus.
len(word_tag)

72169

In [11]:
# Mean token length in characters: summed len() of every 'Token' value,
# divided by the number of (word, tag) pairs.
all_tokens = df_wordtag['Token']
token_length = sum(len(token) for token in all_tokens)
avg_token_length = token_length / len(word_tag)

print('Average token length: ', round(avg_token_length, 2), 'characters')

Average token length:  4.65 characters


In [12]:
# Corpus size summary: total tokens, distinct tokens, and tweets.
num_token = df_wordtag.shape[0]
num_of_unique_token = df_wordtag['Token'].nunique()
num_tweets = len(dt)

# Print each figure with its caption, one per line.
for caption, count in (
    ('Number of tokens: ', num_token),
    ('Number of unique tokens: ', num_of_unique_token),
    ('Number of tweets: ', num_tweets),
):
    print(caption, count)

Number of tokens:  72169
Number of unique tokens:  17557
Number of tweets:  3223


In [13]:
# Token count for each label; value_counts() lists the most frequent first.
print('Number of data per label')
df_wordtag['Label'].value_counts()

Number of data per label


ID           38005
O            17112
JV            8918
EN            4877
MIX-ID-EN     2106
MIX-ID-JV      626
MIX-JV-EN      525
Name: Label, dtype: int64

In [14]:
import numpy as np
from metrics import code_mix_index

# One code-mixing index (CMI) score per tweet. Only the language labels are
# passed to the metric; NE and O tags are excluded.
cmi_all = np.array([
    code_mix_index(tags, ['ID', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN'])
    for _, tags in data[0]
])

# Corpus-level CMI is the mean of the per-tweet scores, scaled to a percentage.
#   cmi       - averaged over every tweet, including those scoring 0
#   cmi_mixed - averaged only over tweets whose score is above 0
is_mixed = cmi_all > 0
cmi = np.mean(cmi_all) * 100
cmi_mixed = np.mean(cmi_all[is_mixed]) * 100

print('CMI: ', round(cmi, 2))
print('CMI Mixed: ', round(cmi_mixed, 2))

CMI:  23.22
CMI Mixed:  23.75


In [15]:
# Split the per-tweet (tokens, tags) records into two parallel lists,
# one entry per tweet.
# NOTE(review): this rebinds `all_tokens` (previously the 'Token' column) and
# `all_tags` (previously the tags unpacked from `data`) to per-tweet lists.
# Earlier cells that use those names will behave differently if re-run after
# this one; consider distinct names.
all_tokens = [tokens for tokens, _ in data[0]]
all_tags = [tags for _, tags in data[0]]

In [16]:
# Collect the tag trigrams of every tweet into one flat list.
# Trigrams are built per tweet, so none spans two tweets.
all_ngrams = [
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
]

In [17]:
# Frequency of each tag trigram, counted tweet by tweet.
counts = collections.Counter(
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
)

In [18]:
# The 30 most frequent tag trigrams as (trigram, count) pairs, most common first.
mocom = counts.most_common(30)

In [19]:
# Tabulate the top trigrams: one row per trigram with its frequency.
tag_transition_list = pd.DataFrame(mocom, columns=['Transition','Frequency'])

In [20]:
# Bare last expression so the notebook renders the transition table.
tag_transition_list

Unnamed: 0,Transition,Frequency
0,"(ID, ID, ID)",16898
1,"(O, ID, ID)",5194
2,"(ID, ID, O)",4929
3,"(ID, O, ID)",3984
4,"(JV, JV, JV)",2801
5,"(O, O, O)",1808
6,"(ID, O, O)",1465
7,"(ID, ID, EN)",1463
8,"(O, O, ID)",1360
9,"(O, JV, JV)",1146
