In [1]:
import sys
sys.path.append('../helper/')

import pandas as pd
import nltk
import collections
from dataset_reader import read_tsv
# from helper.dataset_reader import read_tsv

In [2]:
# Corpus analysed in this notebook. Alternative inputs used previously:
#   '../annotation agreement/merged-tagged-annotator2.tsv'
#   '../dataset/test-data-140422.tsv'
DATASET_PATH = '../dataset/all-merged-data-140422.tsv'

# read_tsv is a project helper; its result is unpacked into three values below.
data = read_tsv(DATASET_PATH)

In [3]:
# The reader's result unpacks into exactly three values:
#   dt        - per-tweet records; each record is later iterated as a (tokens, tags) pair
#   all_words - the tokens of the whole corpus as one flat sequence
#   all_tags  - the tags of the whole corpus as one flat sequence (zipped with all_words below)
dt, all_words, all_tags = data

In [4]:
# Build a per-tweet DataFrame with two columns, 'Tweets' (the token list)
# and 'Tags' (the matching tag list), then preview the first rows.
df_tweet_tags = pd.DataFrame(dt, columns=['Tweets','Tags'])
df_tweet_tags.head()

Unnamed: 0,Tweets,Tags
0,"[Tumben, xl, banter, go, download, ,, ,, downl...","[ID, O, JV, JV, EN, O, O, EN, ID, O, O, ID, ID..."
1,"[@myXL, @myXLCare, knp, xl, skr, jd, susah, si...","[O, O, ID, O, ID, ID, ID, EN, ID, O, ID, EN, I..."
2,"[Lak, download, nggawe, cl, iku, subuh, baru, ...","[JV, EN, JV, O, JV, ID, ID, ID, O, JV, JV, JV,..."
3,"[xl, ngebut, ., Banter, banget, ., Download, 0...","[O, ID, O, JV, JV, O, EN, O, ID, O, ID, O]"
4,"[Opone, banter, limite, mek, 512kbps, RT, @asl...","[JV, JV, MIX-JV-EN, JV, EN, EN, O, JV, O, EN, ..."


In [5]:
# Average tweet length, measured in tokens per tweet.

all_tweets = df_tweet_tags['Tweets']
sent_length = sum(len(tweet) for tweet in all_tweets)  # total token count over all tweets

avg_sent_length = sent_length / len(all_tweets)
print('Average tweet length: ', round(avg_sent_length, 2), 'tokens')

Average tweet length:  24.8 tokens


In [6]:
from collections import Counter

# Distinct tags in order of first appearance: iterating a Counter yields its
# keys in insertion order.
list(Counter(all_tags))

['ID', 'O', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']

In [7]:
# Pair every token with its tag.
# zip() silently stops at the shorter input, so first check that the two
# sequences line up. A mismatch would otherwise quietly drop tokens, and it
# also catches the case where all_tags has been rebound to a per-tweet list
# by a later cell and this cell is re-run out of order.
assert len(all_words) == len(all_tags), (
    'token/tag length mismatch: %d tokens vs %d tags'
    % (len(all_words), len(all_tags))
)
word_tag = list(zip(all_words, all_tags))

In [8]:
# One row per (token, tag) pair: the token text goes in 'Token', its tag in 'Label'.
# The bare last expression displays the frame.
df_wordtag = pd.DataFrame(word_tag, columns=['Token','Label'])
df_wordtag

Unnamed: 0,Token,Label
0,Tumben,ID
1,xl,O
2,banter,JV
3,go,JV
4,download,EN
...,...,...
133785,apalagi,ID
133786,ini,ID
133787,video,EN
133788,call,EN


In [9]:
# Total number of (token, tag) pairs in the corpus.
len(word_tag)

133790

In [10]:
# Average token length, measured in characters per token.
# The mean is taken over the same column that is summed (not over
# len(word_tag)), so numerator and denominator cannot drift apart if
# df_wordtag is ever filtered or rebuilt.
# NOTE: all_tokens is rebound to a per-tweet list in the n-gram section further down.
all_tokens = df_wordtag['Token']
token_length = sum(len(token) for token in all_tokens)  # total characters over all tokens

avg_token_length = token_length / len(all_tokens)
print('Average token length: ', round(avg_token_length, 2), 'characters')

Average token length:  4.57 characters


In [11]:
# Corpus-level size statistics.

num_tweets = len(dt)
num_token = df_wordtag.shape[0]
num_of_unique_token = df_wordtag['Token'].nunique()

# Print each statistic on its own line, label first.
for label, value in (
    ('Number of tweets: ', num_tweets),
    ('Number of tokens: ', num_token),
    ('Number of unique tokens: ', num_of_unique_token),
):
    print(label, value)

Number of tweets:  5394
Number of tokens:  133790
Number of unique tokens:  26889


In [12]:
# Token count and share of the corpus for every label.
# NOTE: `counts` is rebound to a trigram Counter in the n-gram section further down.
label_column = df_wordtag['Label']
counts = label_column.value_counts()
share = label_column.value_counts(normalize=True) * 100
percent = share.round(1).astype(str) + '%'
pd.DataFrame({'counts': counts, 'per': percent})

Unnamed: 0,counts,per
ID,67682,50.6%
O,30729,23.0%
EN,16082,12.0%
JV,14616,10.9%
MIX-ID-EN,2976,2.2%
MIX-ID-JV,916,0.7%
MIX-JV-EN,789,0.6%


In [13]:
import numpy as np
# NOTE(review): the first cell adds '../helper/' to sys.path and imports
# dataset_reader directly, whereas this import goes through the `helper`
# package — confirm both resolve on a fresh kernel.
from helper.metrics import code_mix_index

# Tags handed to code_mix_index as the language tags; 'O' is left out.
language_tags = ['ID', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']

# One score per tweet, computed from that tweet's tag sequence only.
cmi_all = np.array([code_mix_index(tags, language_tags) for _words, tags in data[0]])

# Corpus-level CMI is the mean of the per-tweet scores, scaled by 100.
#   cmi       : every tweet, including those that score 0
#   cmi_mixed : only tweets whose score is above 0
cmi = np.average(cmi_all) * 100
cmi_mixed = np.average(cmi_all[cmi_all > 0]) * 100

print('CMI: ', round(cmi, 2)) 
print('CMI Mixed: ', round(cmi_mixed, 2)) 

CMI:  24.31
CMI Mixed:  24.94


In [14]:
# Regroup the corpus per tweet: one token list and one tag list for each tweet.
# NOTE: this rebinds all_tokens and all_tags, which earlier cells use as flat
# per-token sequences; re-running those cells after this one feeds them
# different inputs.
all_tokens = [tokens for tokens, _tags in data[0]]
all_tags = [tags for _tokens, tags in data[0]]

In [15]:
# Collect every tag trigram, tweet by tweet, into one flat list.
all_ngrams = [
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
]

In [16]:
# Frequency of each tag trigram across all tweets.
# NOTE: `counts` previously held the per-label value counts; it is rebound here.
counts = collections.Counter(
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
)

In [17]:
# Keep the 30 most frequent trigrams as (trigram, count) pairs, most frequent first.
mocom = counts.most_common(30)

In [18]:
# Tabulate the (trigram, count) pairs: trigram in 'Transition', its count in 'Frequency'.
tag_transition_list = pd.DataFrame(mocom, columns=['Transition','Frequency'])

In [19]:
# Display the table of most frequent tag trigrams.
tag_transition_list

Unnamed: 0,Transition,Frequency
0,"(ID, ID, ID)",31980
1,"(O, ID, ID)",9092
2,"(ID, ID, O)",8833
3,"(ID, O, ID)",6749
4,"(EN, EN, EN)",5432
5,"(JV, JV, JV)",5153
6,"(O, O, O)",3414
7,"(ID, ID, EN)",2737
8,"(ID, O, O)",2500
9,"(O, O, ID)",2285
