# word collocation mining

In [1]:
import math

In [2]:
# Path to the tweet corpus; the cells below read it line by line
# (presumably one tweet per line -- confirm against the data file).
tweet_filepath = "tweets.txt"

In [3]:
# Total number of lines in the corpus file; used later as the
# normalising count for every probability estimate.
with open(tweet_filepath) as file:
    tweetcount = sum(1 for _ in file)
print(tweetcount)

1000000


In [4]:
def get_itemset_frequency(tweet_filepath):
    """Count, for every word, how many lines of the file contain it.

    Each line is split on whitespace.  Tokens that start with ``@`` or
    ``#`` are ignored, and a word repeated within one line is counted
    only once for that line.

    Args:
        tweet_filepath: path of the text file to read.

    Returns:
        dict mapping each remaining token to the number of lines in
        which it appears at least once.
    """
    ignored_prefixes = ("@", "#")
    line_counts = {}
    with open(tweet_filepath) as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            words = [tok for tok in tokens if not tok.startswith(ignored_prefixes)]
            # De-duplicate so a word contributes at most 1 per line.
            for word in set(words):
                line_counts[word] = line_counts.get(word, 0) + 1
    return line_counts

def get_bigram_frequency(tweet_filepath):
    """Count, for every adjacent word pair, how many lines contain it.

    Each line is split on whitespace and every pair of neighbouring
    tokens is considered in order.  A pair is dropped when either token
    starts with ``#`` or ``@``.  A pair repeated within one line is
    counted only once for that line.

    Args:
        tweet_filepath: path of the text file to read.

    Returns:
        dict mapping ``(first_word, second_word)`` tuples to the number
        of lines in which that ordered pair appears at least once.
    """
    ignored_prefixes = ("#", "@")
    pair_counts = {}
    with open(tweet_filepath) as handle:
        for raw_line in handle:
            tokens = raw_line.strip().split()
            pairs_in_line = [
                (left, right)
                for left, right in zip(tokens, tokens[1:])
                if not left.startswith(ignored_prefixes)
                and not right.startswith(ignored_prefixes)
            ]
            # De-duplicate so a pair contributes at most 1 per line.
            for pair in set(pairs_in_line):
                pair_counts[pair] = pair_counts.get(pair, 0) + 1
    return pair_counts

In [7]:
# Per-word counts: number of tweet lines in which each word appears.
itemset= get_itemset_frequency(tweet_filepath)

# Per-bigram counts: number of tweet lines in which each ordered pair of
# adjacent words appears.
bigram_freq = get_bigram_frequency(tweet_filepath)

In [6]:
# Show the ten words that occur in the largest number of tweet lines.
ranked_items = sorted(itemset.items(), key=lambda entry: entry[1], reverse=True)
for word, line_hits in ranked_items[:10]:
    print(word, line_hits)

i 161290
to 108698
a 108676
the 99682
you 86025
me 80856
my 69715
and 67236
is 58984
in 48043


In [8]:
# Number of distinct adjacent word pairs collected by get_bigram_frequency.
len(bigram_freq)

2198740

In [10]:
# Show the ten adjacent word pairs that occur in the most tweet lines.
ranked_bigrams = sorted(bigram_freq.items(), key=lambda entry: entry[1], reverse=True)
for pair, line_hits in ranked_bigrams[:10]:
    print(pair, line_hits)

('in', 'the') 10373
('i', "don't") 9304
('i', 'love') 9217
('to', 'be') 8429
('i', 'just') 7850
('i', 'have') 7103
('i', 'want') 6721
('if', 'you') 6626
('follow', 'me') 6053
('and', 'i') 6030


In [14]:
# Mutual-information contribution of a single contingency-table cell.
def mi_cell_formula_calculate(bigramfreq, freq_x, freq_y, totalcount):
    """Return one cell's term of the mutual-information sum.

    The term is ``p_xy * log(p_xy / (p_x * p_y))`` (natural log), where
    every probability is estimated as ``count / totalcount``.

    Args:
        bigramfreq: joint count for the cell.
        freq_x: marginal count of the first event.
        freq_y: marginal count of the second event.
        totalcount: number of observations the counts are normalised by.

    Returns:
        The cell's contribution, or 0 when any of the four counts is not
        positive.
    """
    # An empty cell contributes nothing.  Counts derived by subtraction
    # (e.g. total - freq_x - freq_y + joint) can also come out negative
    # on small corpora, and a zero marginal would divide by zero; treat
    # all of those as "no contribution" instead of raising.
    if bigramfreq <= 0 or freq_x <= 0 or freq_y <= 0 or totalcount <= 0:
        return 0
    p_xy = bigramfreq / totalcount
    p_x = freq_x / totalcount
    p_y = freq_y / totalcount
    return p_xy * math.log(p_xy / (p_x * p_y))

In [18]:
# Pointwise mutual information
def Pointwise_mutual_information_calculation(unigram, bigram, tweetcount, min_freq=100):
    """Compute the pointwise mutual information of each frequent bigram.

    PMI is ``log(p_xy / (p_x * p_y))`` (natural log), with every
    probability estimated as ``count / tweetcount``.

    Args:
        unigram: dict mapping a word to its count.
        bigram: dict mapping ``(x, y)`` word pairs to their count.
        tweetcount: total count used to normalise all frequencies.
        min_freq: bigrams whose count is below this value are skipped.
            Defaults to 100, the threshold previously hard-coded here.

    Returns:
        dict mapping each retained bigram to its PMI score.

    Raises:
        KeyError: if a word of a retained bigram is missing from
            ``unigram``.
    """
    pmi_per_bigram = {}
    for bigramset, bigramfreq in bigram.items():
        # Rare pairs give unreliable estimates, so drop them up front.
        if bigramfreq < min_freq:
            continue
        x, y = bigramset
        p_x = unigram[x] / tweetcount
        p_y = unigram[y] / tweetcount
        pmi_per_bigram[bigramset] = math.log(bigramfreq / tweetcount / (p_x * p_y))
    return pmi_per_bigram

# Chi-square
def Chi_square_calculation(unigram, bigram, tweetcount, min_freq=100):
    """Compute the chi-square statistic of each frequent bigram.

    A 2x2 table (x / not-x against y / not-y) is built for every
    bigram.  Observed cells are derived from the bigram count and the
    two word counts; expected cells are the product of the matching
    marginals divided by ``tweetcount``.  The statistic is the sum of
    ``(observed - expected) ** 2 / expected`` over the four cells.

    Args:
        unigram: dict mapping a word to its count.
        bigram: dict mapping ``(x, y)`` word pairs to their count.
        tweetcount: total count used to normalise all frequencies.
        min_freq: bigrams whose count is below this value are skipped.
            Defaults to 100, the threshold previously hard-coded here.

    Returns:
        dict mapping each retained bigram to its chi-square score.

    Raises:
        KeyError: if a word of a retained bigram is missing from
            ``unigram``.
    """
    chi2_per_bigram = {}
    for bigramset, bigramfreq in bigram.items():
        if bigramfreq < min_freq:
            continue
        x, y = bigramset
        freq_x = unigram[x]
        freq_y = unigram[y]
        freq_non_x = tweetcount - freq_x
        freq_non_y = tweetcount - freq_y

        # (observed, expected) for the cells xy, not-x&y, x&not-y,
        # not-x&not-y -- kept in this order so the running sum matches
        # the original left-to-right addition.
        cells = (
            (bigramfreq, freq_x * freq_y / tweetcount),
            (freq_y - bigramfreq, freq_non_x * freq_y / tweetcount),
            (freq_x - bigramfreq, freq_non_y * freq_x / tweetcount),
            (tweetcount - freq_x - freq_y + bigramfreq,
             freq_non_x * freq_non_y / tweetcount),
        )
        chi2 = 0.0
        for observed, expected in cells:
            # A zero expected count (e.g. a word present in every tweet)
            # would divide by zero; such a cell is left out of the sum.
            if expected == 0:
                continue
            chi2 += (observed - expected) ** 2 / expected
        chi2_per_bigram[bigramset] = chi2
    return chi2_per_bigram

# Mutual information
def mutual_information_calculation(unigram, bigram, tweetcount, min_freq=100):
    """Compute the mutual information of each frequent bigram.

    A 2x2 table (x / not-x against y / not-y) is built for every
    bigram, and the score is the sum of ``mi_cell_formula_calculate``
    over its four cells.

    Args:
        unigram: dict mapping a word to its count.
        bigram: dict mapping ``(x, y)`` word pairs to their count.
        tweetcount: total count used to normalise all frequencies.
        min_freq: bigrams whose count is below this value are skipped.
            Defaults to 100, the threshold previously hard-coded here.

    Returns:
        dict mapping each retained bigram to its mutual-information
        score.

    Raises:
        KeyError: if a word of a retained bigram is missing from
            ``unigram``.
    """
    mi_per_bigram = {}
    for bigramset, bigramfreq in bigram.items():
        if bigramfreq < min_freq:
            continue
        x, y = bigramset
        freq_x = unigram[x]
        freq_y = unigram[y]
        freq_non_x = tweetcount - freq_x
        freq_non_y = tweetcount - freq_y

        # Remaining joint counts, derived from the marginals.
        joint_non_x_y = freq_y - bigramfreq
        joint_x_non_y = freq_x - bigramfreq
        joint_non_x_non_y = tweetcount - freq_x - freq_y + bigramfreq

        # Explicit left-to-right addition keeps the floating-point
        # result identical to the original expression.
        mi_per_bigram[bigramset] = (
            mi_cell_formula_calculate(bigramfreq, freq_x, freq_y, tweetcount)
            + mi_cell_formula_calculate(joint_non_x_y, freq_non_x, freq_y, tweetcount)
            + mi_cell_formula_calculate(joint_x_non_y, freq_x, freq_non_y, tweetcount)
            + mi_cell_formula_calculate(joint_non_x_non_y, freq_non_x, freq_non_y, tweetcount)
        )
    return mi_per_bigram

In [16]:
# Score every sufficiently frequent bigram with each of the three
# association measures.  All three receive the same word counts, bigram
# counts and total line count.
pmi_tweet = Pointwise_mutual_information_calculation(itemset,bigram_freq,tweetcount)

chi_tweet = Chi_square_calculation(itemset,bigram_freq,tweetcount)

mi_tweet = mutual_information_calculation(itemset,bigram_freq,tweetcount)

In [17]:
# Rank the scored bigrams in pmi_tweet from highest to lowest PMI, keep
# the best 100 in pmi_top and print them.
print("**************Based on Pointwise mutual information**********")
pmi_ranked = sorted(pmi_tweet.items(), key=lambda entry: entry[1], reverse=True)
pmi_top = [pair for pair, _score in pmi_ranked[:100]]
for pair in pmi_top:
    print(pair)

**************Based on Pointwise mutual information**********
('a.jalan2', 'b.tepar')
('b.tepar', 'c.ngumpul2')
('pengen?', 'a.jalan2')
('peanut', 'butter')
('a.ya', 'b.gak')
('hannah', 'montana')
('hobby', 'lobby')
('harry', 'potter')
('taco', 'bell')
('gugu', 'morreu')
('warna', 'sepatu')
('terlatih', 'sakit')
('ashton', 'irwin')
('testing', '1404190823')
('testing', '1404191460')
('atau', 'maingame')
('nontonkartun', 'atau')
('24', 'horas,')
('dengerin', 'lagu')
('teen', 'wolf')
('ice', 'cream')
('calum', 'hood')
('restoring', 'awesome')
('michael', 'clifford')
('buenas', 'noches')
('boa', 'noite')
('social', 'media')
('wide', 'awake')
('buenos', 'dias')
('low', 'key')
('difference', 'between')
('sering', 'habis')
('udah', 'terlatih')
('awesome', 'cars')
('sakit', 'hati')
('je', 'vais')
('hobi', 'main')
('pengen', 'bilang')
('free', 'agency')
('ke', 'haters?')
('awkward', 'moment')
('puta', 'madre')
('luke', 'hemmings')
('free', 'agent')
('punya', 'sahabat')
('lagi', 'pengen?')
('ta

In [75]:
# For the 20 highest-PMI bigrams, print each member word together with
# the number of tweet lines it appears in.
pmi_top_words = [word for pair in pmi_top[:20] for word in pair]
for word in pmi_top_words:
    print("{}:".format(word), itemset[word])

b.tepar: 147
c.ngumpul2: 147
a.jalan2: 147
b.tepar: 147
pengen?: 159
a.jalan2: 147
peanut: 140
butter: 174
a.ya: 210
b.gak: 191
hannah: 205
montana: 135
hobby: 219
lobby: 161
harry: 399
potter: 114
taco: 351
bell: 273
gugu: 171
morreu: 376
warna: 412
sepatu: 136
terlatih: 234
sakit: 653
ashton: 664
irwin: 161
testing: 712
1404190823: 349
testing: 712
1404191460: 301
nontonkartun: 114
atau: 729
atau: 729
maingame: 114
24: 794
horas,: 406
dengerin: 288
lagu: 377
teen: 736
wolf: 663


In [67]:
# Rank the scored bigrams in chi_tweet from highest to lowest chi-square,
# keep the best 100 in chi_top and print them.
print("**************Based on Chi-square **********")
chi_ranked = sorted(chi_tweet.items(), key=lambda entry: entry[1], reverse=True)
chi_top = [pair for pair, _score in chi_ranked[:100]]
for pair in chi_top:
    print(pair)

**************Based on Chi-square **********
('b.tepar', 'c.ngumpul2')
('a.jalan2', 'b.tepar')
('pengen?', 'a.jalan2')
('luke', 'hemmings')
('a.ya', 'b.gak')
('teen', 'wolf')
('taco', 'bell')
('testing', '1404190823')
('hobby', 'lobby')
('24', 'horas,')
('peanut', 'butter')
('testing', '1404191460')
('hannah', 'montana')
('ice', 'cream')
('terlatih', 'sakit')
('michael', 'clifford')
('boa', 'noite')
('gugu', 'morreu')
('free', 'agency')
('harry', 'potter')
('buenas', 'noches')
('ashton', 'irwin')
('calum', 'hood')
('warna', 'sepatu')
('fall', 'asleep')
('social', 'media')
('nontonkartun', 'atau')
('atau', 'maingame')
('restoring', 'awesome')
('sakit', 'hati')
('dengerin', 'lagu')
('udah', 'terlatih')
('wide', 'awake')
('from', '5sos')
('happy', 'birthday')
('hemmings', 'from')
('buenos', 'dias')
('please', 'follow')
('awesome', 'cars')
('new', 'song')
('difference', 'between')
('looking', 'forward')
('noticias', 'las')
('watch', 'this!')
('each', 'other')
('awkward', 'moment')
('hobi',

In [76]:
# For the 20 highest chi-square bigrams, print each member word together
# with the number of tweet lines it appears in.
chi_top_words = [word for pair in chi_top[:20] for word in pair]
for word in chi_top_words:
    print("{}:".format(word), itemset[word])

b.tepar: 147
c.ngumpul2: 147
a.jalan2: 147
b.tepar: 147
pengen?: 159
a.jalan2: 147
luke: 3283
hemmings: 2440
a.ya: 210
b.gak: 191
teen: 736
wolf: 663
taco: 351
bell: 273
testing: 712
1404190823: 349
hobby: 219
lobby: 161
24: 794
horas,: 406
peanut: 140
butter: 174
testing: 712
1404191460: 301
hannah: 205
montana: 135
ice: 732
cream: 443
terlatih: 234
sakit: 653
michael: 1040
clifford: 378
boa: 816
noite: 604
gugu: 171
morreu: 376
free: 2419
agency: 719
harry: 399
potter: 114


In [68]:
# Rank the scored bigrams in mi_tweet from highest to lowest mutual
# information, keep the best 100 in mut_top and print them.
print("**************Based on Mutual information**********")
mi_ranked = sorted(mi_tweet.items(), key=lambda entry: entry[1], reverse=True)
mut_top = [pair for pair, _score in mi_ranked[:100]]
for pair in mut_top:
    print(pair)

**************Based on Mutual information**********
('i', 'a')
('i', 'the')
('luke', 'hemmings')
('you', 'i')
('a', 'a')
('me', 'i')
('is', 'i')
('to', 'a')
('please', 'follow')
('you', 'the')
('you', 'you')
('to', 'and')
('follow', 'me')
('and', 'to')
('going', 'to')
('it', 'i')
('go', 'buy')
('you', 'a')
('so', 'much')
('from', '5sos')
('i', 'so')
('me', 'the')
('right', 'now')
('me', 'you')
('hemmings', 'from')
('on', 'i')
('you', 'to')
('to', 'you')
('no', 'i')
('me', 'a')
('me', 'my')
('you', 'my')
('happy', 'birthday')
('new', 'song')
('and', 'me')
('a', 'lot')
('in', 'to')
('i', 'love')
('is', 'to')
('and', 'a')
('this', 'i')
('feel', 'like')
('is', 'me')
('is', 'you')
('voy', 'a')
('you', 'is')
('it', 'the')
('me', 'is')
('me', 'to')
('to', 'no')
('a', 'no')
('want', 'to')
('to', 'it')
('free', 'agency')
('i', 'hate')
('if', 'you')
('it', 'a')
('have', 'i')
('that', 'to')
('at', 'least')
('so', 'the')
('in', 'you')
('the', 'best')
('need', 'to')
('trying', 'to')
('wake', 'up')


In [77]:
# For the 20 highest mutual-information bigrams, print each member word
# together with the number of tweet lines it appears in.
mut_top_words = [word for pair in mut_top[:20] for word in pair]
for word in mut_top_words:
    print("{}:".format(word), itemset[word])

i: 161290
a: 108676
i: 161290
the: 99682
luke: 3283
hemmings: 2440
you: 86025
i: 161290
a: 108676
a: 108676
me: 80856
i: 161290
is: 58984
i: 161290
to: 108698
a: 108676
please: 9079
follow: 12510
you: 86025
the: 99682
you: 86025
you: 86025
to: 108698
and: 67236
follow: 12510
me: 80856
and: 67236
to: 108698
going: 9055
to: 108698
it: 45851
i: 161290
go: 19582
buy: 6080
you: 86025
a: 108676
so: 41260
much: 8250
from: 11501
5sos: 2972


In [69]:
# Count how many bigrams each pair of top-100 lists has in common.

count = sum(1 for pair in chi_top if pair in pmi_top)
print("Pointwise Mutual Information vs Chi-square:{}".format(count))

count = sum(1 for pair in mut_top if pair in pmi_top)
print("Pointwise Mutual Information vs Mutual information:{}".format(count))

count = sum(1 for pair in chi_top if pair in mut_top)
print("Mutual information vs Chi-square:{}".format(count))

Pointwise Mutual Information vs Chi-square:72
Pointwise Mutual Information vs Mutual information:4
Mutual information vs Chi-square:21
