# Collocations

In [None]:
#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string

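Data: the cells below load `cleanaarogya.csv`, whose rows are text about the Aarogya Setu app. The link https://www.kaggle.com/datafiniti/hotel-reviews/data points to a hotel-reviews dataset and does not appear to match the file loaded here (TODO: confirm the actual data source).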

In [None]:
#load reviews data
reviews = pd.read_csv('cleanaarogya.csv')

In [None]:
reviews.head(2)

Unnamed: 0,REVIEW
0,cowinapp not ios aarogyasetu user register moh...
1,corona warn app germany permanently show risk ...


Extract only the reviews...

In [None]:
comments = reviews['REVIEW']

## Preprocessing

In [None]:
#function to remove non-ascii characters
def _removeNonAscii(s): return "".join(i for i in s if ord(i)<128)

In [None]:
comments = comments.astype('str')

In [None]:
#remove non-ascii characters
comments = comments.map(lambda x: _removeNonAscii(x))

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#get stop words of all languages
STOPWORDS_DICT = {lang: set(nltk.corpus.stopwords.words(lang)) for lang in nltk.corpus.stopwords.fileids()}

In [None]:
#function to detect language based on # of stop words for particular language
def get_language(text):
    """Report whether `text` looks like English, judged by stop-word overlap.

    The lower-cased text is tokenized with nltk.wordpunct_tokenize and the set
    of distinct tokens is intersected with each language's stop-word set in
    STOPWORDS_DICT. The language with the largest overlap wins; on a tie
    (including the case where no stop word matches at all) the language that
    comes first in STOPWORDS_DICT is chosen.

    Returns True when the winning language is 'english', otherwise False.
    """
    tokens = set(nltk.wordpunct_tokenize(text.lower()))
    overlap_by_language = [
        (language, len(tokens & stop_set))
        for language, stop_set in STOPWORDS_DICT.items()
    ]
    # max() keeps the first entry among equal counts, so dictionary order breaks ties
    best_language = max(overlap_by_language, key=lambda pair: pair[1])[0]
    return best_language == 'english'

In [None]:
#filter for only english comments
eng_comments=comments[comments.apply(get_language)]

In [None]:
eng_comments.head()

0     cowinapp not ios aarogyasetu user register moh...
10    swarraj nadjanadika sunetrac registration aaro...
20    covidvaccine registration open senior citizen ...
21        sivismyname not function try aarogya setu app
23    rupagulab successfully small pox vaccine polio...
Name: REVIEW, dtype: object

In [None]:
#drop duplicates
# reassign instead of inplace=True: eng_comments came from boolean indexing on
# `comments`, and rebinding the name keeps the cell free of in-place mutation
eng_comments = eng_comments.drop_duplicates()

In [None]:
#load spacy
# 'en' was a spaCy 2.x shortcut link; spaCy 3 removed shortcut links, so the
# model is loaded by its package name.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (the default target of
# `python -m spacy download en`) - confirm if a different model was linked.
nlp = spacy.load('en_core_web_sm')

In [None]:
#function to clean and lemmatize comments
def clean_comments(text):
    """Strip punctuation from `text` and return its lemmas as a list of strings.

    Every character in string.punctuation, plus carriage return, tab and
    newline, is replaced by a single space. The result is run through the
    notebook-level spaCy pipeline `nlp` with the parser and NER components
    disabled, and the lemma_ of every token in the resulting doc is collected.
    """
    # character class: all ASCII punctuation plus \r, \t and \n
    strip_pattern = '[' + re.escape(string.punctuation) + '\\r\\t\\n]'
    spaced_text = re.sub(strip_pattern, " ", str(text))
    parsed = nlp(spaced_text, disable=['parser','ner'])
    return [tok.lemma_ for tok in parsed]

In [None]:
#apply function to clean and lemmatize comments
lemmatized = eng_comments.map(clean_comments)

In [None]:
#make sure to lowercase everything
lemmatized = lemmatized.map(lambda x: [word.lower() for word in x])

In [None]:
lemmatized.head()

0     [cowinapp, not, ios, aarogyasetu, user, regist...
10    [swarraj, nadjanadika, sunetrac, registration,...
20    [covidvaccine, registration, open, senior, cit...
21    [sivismyname, not, function, try, aarogya, set...
23    [rupagulab, successfully, small, pox, vaccine,...
Name: REVIEW, dtype: object

In [None]:
#turn all comments' tokens into one single list
unlist_comments = [item for items in lemmatized for item in items]

## Initialize NLTK's Bigrams/Trigrams Finder

In [None]:
bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

In [None]:
bigramFinder = nltk.collocations.BigramCollocationFinder.from_words(unlist_comments)
trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(unlist_comments)

## 1. Counting Frequencies of Adjacent Words
- Main idea: simply order by frequency
- Issues: raw counts are dominated by very frequent words, so pairs containing pronouns/articles/prepositions come up often
- Solution: filter for only adjectives and nouns

In [None]:
bigram_freq = bigramFinder.ngram_fd.items()

In [None]:
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)

In [None]:
bigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,bigram,freq
0,"(aarogya, setu)",10069
1,"(setu, app)",5673
2,"(app, not)",1368
3,"(aarogyasetu, app)",1028
4,"(download, aarogya)",959


In [None]:
bigramFreqTable[:10]

Unnamed: 0,bigram,freq
12,"(aarogya, setu)",10069
48,"(setu, app)",5673
64,"(app, not)",1368
70,"(aarogyasetu, app)",1028
574,"(download, aarogya)",959
13,"(setu, not)",734
2994,"(use, aarogya)",400
6,"(mohfw, india)",390
925,"(not, work)",371
540,"(not, aarogya)",367


In [None]:
#get english stopwords
en_stopwords = set(stopwords.words('english'))

In [None]:
#function to filter for ADJ/NN bigrams
def rightTypes(ngram):
    """Decide whether a bigram passes the adjective/noun + noun filter.

    Returns False when any element of `ngram` is one of the junk tokens
    '-pron-', '', ' ' or 't', or is present in en_stopwords. Otherwise the
    bigram is tagged with nltk.pos_tag and the result is True only when the
    first tag is an adjective or noun tag (JJ*, NN*) and the second tag is a
    noun tag (NN*).
    """
    junk_tokens = ('-pron-', '', ' ', 't')
    if any(junk in ngram for junk in junk_tokens):
        return False
    if any(token in en_stopwords for token in ngram):
        return False

    leading_tags = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    # the two words are tagged on their own, without their original sentence context
    tagged = nltk.pos_tag(ngram)
    return tagged[0][1] in leading_tags and tagged[1][1] in noun_tags

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]

In [None]:
filtered_bi[:10]

Unnamed: 0,bigram,freq
12,"(aarogya, setu)",10069
48,"(setu, app)",5673
70,"(aarogyasetu, app)",1028
574,"(download, aarogya)",959
2994,"(use, aarogya)",400
1291,"(app, httpst)",340
1600,"(contact, trace)",326
439,"(self, declaration)",298
3300,"(smart, phone)",291
440,"(declaration, form)",288


In [None]:
trigram_freq = trigramFinder.ngram_fd.items()

In [None]:
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [None]:
trigramFreqTable.head().reset_index(drop=True)

Unnamed: 0,trigram,freq
0,"(aarogya, setu, app)",5588
1,"(download, aarogya, setu)",959
2,"(setu, app, not)",836
3,"(aarogya, setu, not)",716
4,"(use, aarogya, setu)",399


In [None]:
trigramFreqTable[:10]

Unnamed: 0,trigram,freq
50,"(aarogya, setu, app)",5588
610,"(download, aarogya, setu)",959
67,"(setu, app, not)",836
12,"(aarogya, setu, not)",716
3414,"(use, aarogya, setu)",399
575,"(not, aarogya, setu)",364
3096,"(create, aarogya, setu)",274
466,"(self, declaration, form)",248
8159,"(setu, app, mandatory)",223
24631,"(setu, app, help)",207


In [None]:
def rightTypesTri(ngram):
    """Decide whether a trigram passes the adjective/noun filter.

    Returns False when any element of `ngram` is one of the junk tokens
    '-pron-', '', ' ', '  ' or 't', or is present in en_stopwords. Otherwise
    the trigram is tagged with nltk.pos_tag and the result is True only when
    both the first and the third tag are adjective or noun tags (JJ*, NN*).
    The middle word's tag is not checked.
    """
    junk_tokens = ('-pron-', '', ' ', '  ', 't')
    if any(junk in ngram for junk in junk_tokens):
        return False
    if any(token in en_stopwords for token in ngram):
        return False

    outer_tags = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    # the three words are tagged on their own, without their original sentence context
    tagged = nltk.pos_tag(ngram)
    return tagged[0][1] in outer_tags and tagged[2][1] in outer_tags

In [None]:
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [None]:
filtered_tri[:10]

Unnamed: 0,trigram,freq
50,"(aarogya, setu, app)",5588
610,"(download, aarogya, setu)",959
3414,"(use, aarogya, setu)",399
466,"(self, declaration, form)",248
8159,"(setu, app, mandatory)",223
24631,"(setu, app, help)",207
1080,"(install, aarogya, setu)",204
1761,"(contact, trace, app)",184
41401,"(breach, aarogya, setu)",181
1406,"(setu, app, httpst)",172


In [None]:
freq_bi = filtered_bi[:20].bigram.values

In [None]:
freq_tri = filtered_tri[:20].trigram.values

## 2. PMI

In [None]:
# Minimum-frequency cut-off of 20 applied to the bigram candidates before scoring.
# NOTE: this changes bigramFinder itself rather than returning a new finder, so
# every later score_ngrams call on it (PMI, t-test, chi-square, likelihood ratio)
# and any re-run of the frequency cells above works on the filtered candidates.
bigramFinder.apply_freq_filter(20)

In [None]:
bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)

In [None]:
bigramPMITable[:10]

Unnamed: 0,bigram,PMI
0,"(cyberbullye, cyberthreat)",13.573951
1,"(cyberattack, cyberbullye)",13.442707
2,"(cybercrime, cyberattack)",13.375593
3,"(tik, tok)",13.322413
4,"(chairman, empower)",13.243803
5,"(powerful, companion)",12.756815
6,"(niti, aayog)",12.689701
7,"(datasecurity, dataprivacy)",12.644341
8,"(architecture, indulge)",12.637798
9,"(dc, gurugram)",12.566862


In [None]:
# Minimum-frequency cut-off of 20 applied to the trigram candidates before scoring.
# NOTE: this changes trigramFinder itself rather than returning a new finder, so
# every later score_ngrams call on it (PMI, t-test, chi-square, likelihood ratio)
# and any re-run of the frequency cells above works on the filtered candidates.
trigramFinder.apply_freq_filter(20)

In [None]:
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [None]:
trigramPMITable[:10]

Unnamed: 0,trigram,PMI
0,"(cyberattack, cyberbullye, cyberthreat)",27.016658
1,"(cybercrime, cyberattack, cyberbullye)",26.949544
2,"(chairman, empower, group)",25.255875
3,"(cybersecurity, cybercrime, cyberattack)",25.254399
4,"(dataprivacy, cybersecurity, cybercrime)",24.456033
5,"(datasecurity, dataprivacy, cybersecurity)",24.452757
6,"(ravi, shankar, prasad)",24.423183
7,"(powerful, companion, protect)",23.356762
8,"(hardeep, singh, puri)",22.759892
9,"(data, datasecurity, dataprivacy)",22.676034


In [None]:
pmi_bi = bigramPMITable[:20].bigram.values

In [None]:
pmi_tri = trigramPMITable[:20].trigram.values

## 3. t-test

In [None]:
bigramTtable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.student_t)), columns=['bigram','t']).sort_values(by='t', ascending=False)

In [None]:
bigramTtable.head()

Unnamed: 0,bigram,t
0,"(aarogya, setu)",96.265038
1,"(setu, app)",70.045601
2,"(aarogyasetu, app)",28.682156
3,"(download, aarogya)",28.316578
4,"(app, not)",24.922853


In [None]:
filteredT_bi = bigramTtable[bigramTtable.bigram.map(lambda x: rightTypes(x))]

In [None]:
filteredT_bi[:10]

Unnamed: 0,bigram,t
0,"(aarogya, setu)",96.265038
1,"(setu, app)",70.045601
2,"(aarogyasetu, app)",28.682156
3,"(download, aarogya)",28.316578
6,"(contact, trace)",17.999391
8,"(self, declaration)",17.224211
9,"(smart, phone)",16.98141
10,"(declaration, form)",16.94279
12,"(use, aarogya)",15.795171
15,"(create, aarogya)",15.273181


In [None]:
trigramTtable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.student_t)), columns=['trigram','t']).sort_values(by='t', ascending=False)

In [None]:
trigramTtable.head()

Unnamed: 0,trigram,t
0,"(aarogya, setu, app)",74.540741
1,"(download, aarogya, setu)",30.861601
2,"(setu, app, not)",28.295935
3,"(aarogya, setu, not)",26.070304
4,"(use, aarogya, setu)",19.806456


In [None]:
filteredT_tri = trigramTtable[trigramTtable.trigram.map(lambda x: rightTypesTri(x))]

In [None]:
filteredT_tri.head(10)

Unnamed: 0,trigram,t
0,"(aarogya, setu, app)",74.540741
1,"(download, aarogya, setu)",30.861601
4,"(use, aarogya, setu)",19.806456
7,"(self, declaration, form)",15.74796
8,"(setu, app, mandatory)",14.806361
9,"(setu, app, help)",14.279039
10,"(install, aarogya, setu)",14.227459
11,"(contact, trace, app)",13.561768
13,"(breach, aarogya, setu)",13.412753
15,"(immunity, download, aarogya)",13.070519


In [None]:
t_bi = filteredT_bi[:20].bigram.values

In [None]:
t_tri = filteredT_tri[:20].trigram.values

## 4. Chi-Square

In [None]:
bigramChiTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.chi_sq)), columns=['bigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [None]:
bigramChiTable.head(20)

Unnamed: 0,bigram,chi-sq
0,"(tik, tok)",256086.0
1,"(aarogya, setu)",247330.347503
2,"(cyberbullye, cyberthreat)",243890.476116
3,"(laxman, rekha)",243788.157344
4,"(cybercrime, cyberattack)",223186.796208
5,"(cyberattack, cyberbullye)",222680.869361
6,"(chairman, empower)",193999.393514
7,"(powerful, companion)",186866.269501
8,"(adapt, saptapati)",181530.798425
9,"(dc, gurugram)",175944.609899


In [None]:
trigramChiTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.chi_sq)), columns=['trigram','chi-sq']).sort_values(by='chi-sq', ascending=False)

In [None]:
trigramChiTable.head(20)

Unnamed: 0,trigram,chi-sq
0,"(cyberattack, cyberbullye, cyberthreat)",2715530000.0
1,"(cybercrime, cyberattack, cyberbullye)",2592097000.0
2,"(cybersecurity, cybercrime, cyberattack)",840525600.0
3,"(chairman, empower, group)",801320900.0
4,"(ravi, shankar, prasad)",742378300.0
5,"(dataprivacy, cybersecurity, cybercrime)",483302400.0
6,"(datasecurity, dataprivacy, cybersecurity)",459244700.0
7,"(hardeep, singh, puri)",447456400.0
8,"(powerful, companion, protect)",290034600.0
9,"(within, laxman, rekha)",226884600.0


In [None]:
chi_bi = bigramChiTable[:20].bigram.values

In [None]:
chi_tri = trigramChiTable[:20].trigram.values

## 5. Likelihood

In [None]:
bigramLikTable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.likelihood_ratio)), columns=['bigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [None]:
bigramLikTable.head()

Unnamed: 0,bigram,likelihood ratio
0,"(aarogya, setu)",81453.592967
1,"(setu, app)",26958.375274
2,"(mohfw, india)",3924.261659
3,"(declaration, form)",3852.122694
4,"(contact, trace)",3682.483445


In [None]:
filteredLik_bi = bigramLikTable[bigramLikTable.bigram.map(lambda x: rightTypes(x))]

In [None]:
filteredLik_bi.head(10)

Unnamed: 0,bigram,likelihood ratio
0,"(aarogya, setu)",81453.592967
1,"(setu, app)",26958.375274
3,"(declaration, form)",3852.122694
4,"(contact, trace)",3682.483445
5,"(self, declaration)",3674.42057
6,"(download, aarogya)",3506.76597
7,"(aarogyasetu, app)",3236.842788
8,"(smart, phone)",3075.159953
9,"(social, distancing)",2964.571205
10,"(employee, respect)",2406.115714


In [None]:
trigramLikTable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.likelihood_ratio)), columns=['trigram','likelihood ratio']).sort_values(by='likelihood ratio', ascending=False)

In [None]:
trigramLikTable.head()

Unnamed: 0,trigram,likelihood ratio
0,"(aarogya, setu, app)",162652.821953
1,"(download, aarogya, setu)",127521.1415
2,"(create, aarogya, setu)",123798.910611
3,"(breach, aarogya, setu)",123248.319367
4,"(register, aarogya, setu)",123246.714057


In [None]:
filteredLik_tri = trigramLikTable[trigramLikTable.trigram.map(lambda x: rightTypesTri(x))]

In [None]:
filteredLik_tri.head(20)

Unnamed: 0,trigram,likelihood ratio
0,"(aarogya, setu, app)",162652.821953
1,"(download, aarogya, setu)",127521.1415
3,"(breach, aarogya, setu)",123248.319367
4,"(register, aarogya, setu)",123246.714057
5,"(use, aarogya, setu)",123210.776837
6,"(setu, aarogya, setu)",123205.639788
7,"(install, aarogya, setu)",123205.621565
8,"(aarogya, setu, aap)",122894.353022
9,"(aarogya, setu, rti)",122791.442335
11,"(instal, aarogya, setu)",122704.57041


In [None]:
lik_bi = filteredLik_bi[:20].bigram.values

In [None]:
lik_tri = filteredLik_tri[:20].trigram.values

## Bigram Comparison

In [None]:
bigramsCompare = pd.DataFrame([freq_bi, pmi_bi, t_bi, chi_bi, lik_bi]).T

In [None]:
# one label per measure; "With Filter" marks the columns built from the
# rightTypes-filtered tables (fixes the misspelled 'Likeihood' label)
bigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likelihood Ratio Test With Filter']

In [None]:
bigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(aarogya, setu)","(cyberbullye, cyberthreat)","(aarogya, setu)","(tik, tok)","(aarogya, setu)"
1,"(setu, app)","(cyberattack, cyberbullye)","(setu, app)","(aarogya, setu)","(setu, app)"
2,"(aarogyasetu, app)","(cybercrime, cyberattack)","(aarogyasetu, app)","(cyberbullye, cyberthreat)","(declaration, form)"
3,"(download, aarogya)","(tik, tok)","(download, aarogya)","(laxman, rekha)","(contact, trace)"
4,"(use, aarogya)","(chairman, empower)","(contact, trace)","(cybercrime, cyberattack)","(self, declaration)"
5,"(app, httpst)","(powerful, companion)","(self, declaration)","(cyberattack, cyberbullye)","(download, aarogya)"
6,"(contact, trace)","(niti, aayog)","(smart, phone)","(chairman, empower)","(aarogyasetu, app)"
7,"(self, declaration)","(datasecurity, dataprivacy)","(declaration, form)","(powerful, companion)","(smart, phone)"
8,"(smart, phone)","(architecture, indulge)","(use, aarogya)","(adapt, saptapati)","(social, distancing)"
9,"(declaration, form)","(dc, gurugram)","(create, aarogya)","(dc, gurugram)","(employee, respect)"


## Trigram Comparison

In [None]:
trigramsCompare = pd.DataFrame([freq_tri, pmi_tri, t_tri, chi_tri, lik_tri]).T

In [None]:
# one label per measure; "With Filter" marks the columns built from the
# rightTypesTri-filtered tables (fixes the misspelled 'Likeihood' label)
trigramsCompare.columns = ['Frequency With Filter', 'PMI', 'T-test With Filter', 'Chi-Sq Test', 'Likelihood Ratio Test With Filter']

In [None]:
trigramsCompare

Unnamed: 0,Frequency With Filter,PMI,T-test With Filter,Chi-Sq Test,Likeihood Ratio Test With Filter
0,"(aarogya, setu, app)","(cyberattack, cyberbullye, cyberthreat)","(aarogya, setu, app)","(cyberattack, cyberbullye, cyberthreat)","(aarogya, setu, app)"
1,"(download, aarogya, setu)","(cybercrime, cyberattack, cyberbullye)","(download, aarogya, setu)","(cybercrime, cyberattack, cyberbullye)","(download, aarogya, setu)"
2,"(use, aarogya, setu)","(chairman, empower, group)","(use, aarogya, setu)","(cybersecurity, cybercrime, cyberattack)","(breach, aarogya, setu)"
3,"(self, declaration, form)","(cybersecurity, cybercrime, cyberattack)","(self, declaration, form)","(chairman, empower, group)","(register, aarogya, setu)"
4,"(setu, app, mandatory)","(dataprivacy, cybersecurity, cybercrime)","(setu, app, mandatory)","(ravi, shankar, prasad)","(use, aarogya, setu)"
5,"(setu, app, help)","(datasecurity, dataprivacy, cybersecurity)","(setu, app, help)","(dataprivacy, cybersecurity, cybercrime)","(setu, aarogya, setu)"
6,"(install, aarogya, setu)","(ravi, shankar, prasad)","(install, aarogya, setu)","(datasecurity, dataprivacy, cybersecurity)","(install, aarogya, setu)"
7,"(contact, trace, app)","(powerful, companion, protect)","(contact, trace, app)","(hardeep, singh, puri)","(aarogya, setu, aap)"
8,"(breach, aarogya, setu)","(hardeep, singh, puri)","(breach, aarogya, setu)","(powerful, companion, protect)","(aarogya, setu, rti)"
9,"(setu, app, httpst)","(data, datasecurity, dataprivacy)","(immunity, download, aarogya)","(within, laxman, rekha)","(instal, aarogya, setu)"
