In [1]:
import pandas as pd
from helper.dataset_reader import read_tsv
import nltk
import collections

In [2]:
# Path of the tagged tweet corpus, relative to the notebook's working directory.
# Alternative input path kept for reference: '../dataset/new-tagged-1000.tsv'
DATASET_PATH = 'raw dataset/all-tagged-090422-merged.tsv'

# read_tsv is a project helper; its result is unpacked in the next cell.
data = read_tsv(DATASET_PATH)

In [3]:
# `data` (the result of read_tsv) holds three values; unpack them:
#   dt        - per-tweet records, iterated later as (words, tags) pairs
#   all_words - every word in the corpus (zipped with all_tags further down)
#   all_tags  - the tag of every word, parallel to all_words
dt, all_words, all_tags = data

In [4]:
# Convert the per-tweet records to a dataframe with columns [Tweets | Tags].
# Each element of `dt` is a (words, tags) pair, so every row is one tweet.
df_tweet_tags = pd.DataFrame(dt, columns=['Tweets','Tags'])

# Number of tweets. The former mid-cell `df_tweet_tags.head()` was removed:
# only the last expression of a cell is displayed, so its result was
# computed and silently thrown away.
len(df_tweet_tags)

4233

In [5]:
# Average tweet length: total size of all entries in the 'Tweets' column
# divided by the number of tweets.
all_tweets = df_tweet_tags['Tweets']
sent_length = sum(len(tweet_tokens) for tweet_tokens in all_tweets)
avg_sent_length = sent_length / len(all_tweets)

print('Average tweet length: ', round(avg_sent_length, 2), 'tokens')

Average tweet length:  22.79 tokens


In [6]:
from collections import Counter

# Distinct tags in the dataset, in order of first appearance
# (iterating a Counter yields its keys in insertion order).
list(Counter(all_tags))

['ID', 'NE', 'JV', 'EN', 'O', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']

In [7]:
# Pair every word with its tag: one (word, tag) tuple per token.
word_tag = [(word, tag) for word, tag in zip(all_words, all_tags)]

In [8]:
# Token-level dataframe: one row per (word, tag) pair.
df_wordtag = pd.DataFrame(
    data=word_tag,
    columns=['Token', 'Label'],
)
df_wordtag

Unnamed: 0,Token,Label
0,Tumben,ID
1,xl,NE
2,banter,ID
3,go,JV
4,download,EN
...,...,...
96471,apalagi,ID
96472,ini,ID
96473,video,EN
96474,call,EN


In [9]:
# Total number of (word, tag) pairs, i.e. tokens in the corpus.
len(word_tag)

96476

In [10]:
# Average token length: total character count over all tokens divided by
# the number of (word, tag) pairs.
all_tokens = df_wordtag['Token']
token_length = sum(len(tok) for tok in all_tokens)
avg_token_length = token_length / len(word_tag)

print('Average token length: ', round(avg_token_length, 2), 'characters')

Average token length:  4.64 characters


In [11]:
# Dataset summary: token count, vocabulary size and tweet count.
num_tweets = len(dt)
num_token = df_wordtag.shape[0]                  # one row per token
num_of_unique_token = df_wordtag.Token.nunique()

print('Number of tokens: ', num_token)
print('Number of unique tokens: ', num_of_unique_token)
print('Number of tweets: ', num_tweets)

Number of tokens:  96476
Number of unique tokens:  21551
Number of tweets:  4233


In [12]:
# How many tokens carry each label, most frequent label first.
print('Number of data per label')
df_wordtag['Label'].value_counts()

Number of data per label


ID           49129
O            19316
JV           13824
EN            6623
NE            3286
MIX-ID-EN     2685
MIX-ID-JV      830
MIX-JV-EN      783
Name: Label, dtype: int64

In [13]:
import numpy as np
from helper.metrics import code_mix_index

# Tags passed to code_mix_index as the language tags; NE and O are excluded.
language_tags = ['ID', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']

# One score per tweet, computed from that tweet's tag sequence only
# (the words themselves are not used).
cmi_all = np.array(
    [code_mix_index(tags, language_tags) for words, tags in data[0]]
)

# Corpus-level CMI is the average of the per-tweet scores, as a percentage.
#   cmi       - over every tweet, including those that scored 0
#   cmi_mixed - only over tweets with a score above 0 (language-mixed tweets)
is_mixed = cmi_all > 0
cmi = np.average(cmi_all) * 100
cmi_mixed = np.average(cmi_all[is_mixed]) * 100

print('CMI: ', round(cmi, 2))
print('CMI Mixed: ', round(cmi_mixed, 2))

CMI:  23.69
CMI Mixed:  24.35


In [14]:
# Split the per-tweet records into two parallel lists: the token sequence and
# the tag sequence of each tweet.
# NOTE: this rebinds `all_tokens` (previously df_wordtag['Token']) and
# `all_tags` (previously the flat tag list unpacked from `data`). From here on
# both are lists with one entry per tweet, so re-running earlier cells that
# use these names would see the per-tweet lists instead.
all_tokens = [tokens for tokens, _ in data[0]]
all_tags = [tags for _, tags in data[0]]

In [15]:
# Collect every tag trigram (n = 3) from every tweet's tag sequence into one
# flat list; n-grams never span two tweets.
all_ngrams = [
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
]

In [16]:
# Frequency of each tag trigram across all tweets.
counts = collections.Counter(
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
)

In [17]:
# The 30 most frequent tag trigrams as (trigram, count) pairs, highest first.
mocom = counts.most_common(n=30)

In [18]:
# Tabulate the most common tag trigrams: one row per trigram with its count.
tag_transition_list = pd.DataFrame(
    data=mocom,
    columns=['Transition', 'Frequency'],
)

In [19]:
# Display the table of the most frequent tag trigrams and their counts.
tag_transition_list

Unnamed: 0,Transition,Frequency
0,"(ID, ID, ID)",22186
1,"(O, ID, ID)",5774
2,"(ID, ID, O)",5347
3,"(JV, JV, JV)",4842
4,"(ID, O, ID)",4269
5,"(ID, ID, EN)",1815
6,"(O, O, O)",1723
7,"(O, JV, JV)",1590
8,"(JV, JV, O)",1475
9,"(ID, ID, MIX-ID-EN)",1427
