In [1]:
# import sys
# sys.path.append('../helper/')

import pandas as pd
import nltk
import collections
from helper import cmi_calculation, dataset_reader

In [2]:
# Path to the tab-separated dataset, relative to this notebook.
filename = '../dataset/comlid-data-140422-v1.tsv'
# Load it with the project's dataset_reader helper; the result is unpacked in the next cell.
data = dataset_reader.read_tsv(filename)

In [3]:
# `data` bundles three values, unpacked here:
#   dt        - one (tokens, tags) pair per tweet
#   all_words - every token of the dataset in a single flat sequence
#   all_tags  - the tags, paired positionally with all_words
dt, all_words, all_tags = data

In [4]:
# Build a two-column frame with one row per tweet:
# 'Tweets' holds the token list, 'Tags' the matching tag list.
df_tweet_tags = pd.DataFrame(dt, columns=['Tweets','Tags'])
# Preview the first rows.
df_tweet_tags.head()

Unnamed: 0,Tweets,Tags
0,"[Tumben, xl, banter, go, download, ,, ,, downl...","[ID, O, JV, JV, EN, O, O, EN, ID, O, O, ID, ID..."
1,"[@myXL, @myXLCare, knp, xl, skr, jd, susah, si...","[O, O, ID, O, ID, ID, ID, EN, ID, O, ID, EN, I..."
2,"[Lak, download, nggawe, cl, iku, subuh, baru, ...","[JV, EN, JV, O, JV, ID, ID, ID, O, JV, JV, JV,..."
3,"[xl, ngebut, ., Banter, banget, ., Download, 0...","[O, ID, O, JV, JV, O, EN, O, ID, O, ID, O]"
4,"[Opone, banter, limite, mek, 512kbps, RT, @asl...","[JV, JV, MIX-JV-EN, JV, EN, EN, O, JV, O, EN, ..."


In [5]:
# Plain Python list holding one token list per tweet
sentences = df_tweet_tags['Tweets'].tolist()

In [6]:
# Preview only the first few tweets: echoing the whole list would print
# every token of every tweet, one per line.
sentences[:5]

[['Tumben',
  'xl',
  'banter',
  'go',
  'download',
  ',',
  ',',
  'download',
  'video',
  'tom',
  'jerry',
  'ya',
  'masuk',
  'kie',
  'https://t.co/SmrXmut7wk'],
 ['@myXL',
  '@myXLCare',
  'knp',
  'xl',
  'skr',
  'jd',
  'susah',
  'signal',
  'ya',
  ',',
  'dan',
  'download',
  'paling',
  'banter',
  '70kbps',
  '.',
  'Sangat',
  'disayangkan'],
 ['Lak',
  'download',
  'nggawe',
  'cl',
  'iku',
  'subuh',
  'baru',
  'banter',
  '.',
  'Lak',
  'sore-bengi',
  'lemot',
  "''",
  '@kecepoood',
  ':',
  'XL',
  'labil',
  'donlod',
  'munggah',
  'mudun',
  '😩',
  '"'],
 ['xl',
  'ngebut',
  '.',
  'Banter',
  'banget',
  '.',
  'Download',
  '0,7GB',
  'hanya',
  '26',
  'menit',
  '.'],
 ['Opone',
  'banter',
  'limite',
  'mek',
  '512kbps',
  'RT',
  '@aslisuroboyo:',
  'Nggawe',
  'XL',
  'unlimited',
  'streaming',
  'youtube',
  'karo',
  'download',
  'kenceng',
  '!',
  'Info',
  ':',
  'http://t.co/oXbsXMkMF1'],
 ['Lagi',
  'banter',
  'nih',
  'yank',
  '.',

In [7]:
# Average number of tokens per tweet
all_tweets = df_tweet_tags['Tweets']
sent_length = sum(len(tweet) for tweet in all_tweets)

avg_sent_length = sent_length / len(all_tweets)
print('Average tweet length: ', round(avg_sent_length, 2), 'tokens')

Average tweet length:  24.8 tokens


In [8]:
from collections import Counter

# Distinct tags in the order they first appear
# (iterating a Counter yields its keys in insertion order).
list(Counter(all_tags))

['ID', 'O', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']

In [9]:
# Pair every token with its tag, position by position, as a list of (token, tag) tuples.
# zip stops at the shorter input, so a length mismatch between the two would go unnoticed.
word_tag = list(zip(all_words, all_tags))

In [10]:
# One row per token: the token text ('Token') and its tag ('Label').
df_wordtag = pd.DataFrame(word_tag, columns=['Token','Label'])
df_wordtag

Unnamed: 0,Token,Label
0,Tumben,ID
1,xl,O
2,banter,JV
3,go,JV
4,download,EN
...,...,...
133786,apalagi,ID
133787,ini,ID
133788,video,EN
133789,call,EN


In [11]:
# Total number of (token, tag) pairs in the dataset.
len(word_tag)

133791

In [12]:
# Average number of characters per token
all_tokens = df_wordtag['Token']
token_length = sum(map(len, all_tokens))

avg_token_length = token_length / len(word_tag)
print('Average token length: ', round(avg_token_length, 2), 'characters')

Average token length:  4.57 characters


In [13]:
# Headline corpus statistics: tweet count, token count, vocabulary size
num_tweets = len(dt)
num_token = len(df_wordtag)
num_of_unique_token = df_wordtag['Token'].nunique()

for caption, value in (
    ('Number of tweets: ', num_tweets),
    ('Number of tokens: ', num_token),
    ('Number of unique tokens: ', num_of_unique_token),
):
    print(caption, value)

Number of tweets:  5394
Number of tokens:  133791
Number of unique tokens:  26889


In [14]:
# Number of tokens per label, and each label's share of all tokens.
# The count Series is called `label_counts` rather than `counts` because
# `counts` is the name of the tag-trigram Counter built further down;
# sharing one name for both made the later `counts.most_common(...)` cell
# fail whenever this cell had been re-run after it.
label_counts = df_wordtag['Label'].value_counts()
# Same breakdown as percentages, rounded to one decimal and rendered as text.
percent = df_wordtag['Label'].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
pd.DataFrame({'counts': label_counts, 'per': percent})

Unnamed: 0,counts,per
ID,67682,50.6%
O,30728,23.0%
EN,16108,12.0%
JV,14624,10.9%
MIX-ID-EN,2966,2.2%
MIX-ID-JV,884,0.7%
MIX-JV-EN,799,0.6%


In [15]:
# Tags handed to the CMI helper: every tag found in the data except 'O'.
labels = ['ID', 'JV', 'EN', 'MIX-JV-EN', 'MIX-ID-JV', 'MIX-ID-EN']
# The helper is given the file path, not the already loaded `data`.
# NOTE(review): presumably CMI is the Code-Mixing Index — confirm in helper/cmi_calculation.
cmi_calculation.calculate_cmi(filename, labels)

CMI:  24.31
CMI Mixed:  24.95


In [16]:
# Split the (tokens, tags) rows into two parallel lists, one entry per tweet.
# NOTE: this rebinds all_tokens and all_tags, which earlier cells bound to the
# 'Token' column and to the flat tag list respectively.
all_tokens = [row_tokens for row_tokens, _ in data[0]]
all_tags = [row_tags for _, row_tags in data[0]]

In [17]:
# Flatten the tag trigrams of every tweet into one list
all_ngrams = [
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
]

In [18]:
# Tally how often each tag trigram occurs across all tweets
counts = collections.Counter(
    trigram
    for taglist in all_tags
    for trigram in nltk.ngrams(taglist, 3)
)

In [19]:
# The 30 most frequent tag trigrams as (trigram, count) pairs, most frequent first.
mocom = counts.most_common(30)

In [20]:
# Tabulate the (trigram, count) pairs: one row per tag transition with its frequency.
tag_transition_list = pd.DataFrame(mocom, columns=['Transition','Frequency'])

In [21]:
# Display the tag-transition frequency table.
tag_transition_list

Unnamed: 0,Transition,Frequency
0,"(ID, ID, ID)",31978
1,"(O, ID, ID)",9092
2,"(ID, ID, O)",8831
3,"(ID, O, ID)",6747
4,"(EN, EN, EN)",5435
5,"(JV, JV, JV)",5146
6,"(O, O, O)",3411
7,"(ID, ID, EN)",2738
8,"(ID, O, O)",2495
9,"(O, O, ID)",2285
