## 1. 計算語料庫特徵關鍵詞

In [1]:
import re, math
from collections import defaultdict
import nltk, pickle
import pprint
# Load the English Punkt tokenizer pickle shipped with NLTK; its tokenize()
# method is used further down to split each chapter's text into sentences.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

def words(text):
    """Return the lower-cased word and number tokens found in *text*.

    The text is lower-cased first; a token is then either a run of the
    characters a-z, apostrophe and hyphen, or a run of digits.  Everything
    else acts as a separator.
    """
    token_pattern = "([a-z'-]+|[0-9]+)"
    lowered = text.lower()
    return re.findall(token_pattern, lowered)

# Reference unigram counts read from 'count_1w.txt': every line holds a word
# and its count separated by a tab.  The context manager closes the file,
# which the earlier open(...).readlines() form never did.
# count_web1t maps word -> int count; a line that does not split into exactly
# two fields raises ValueError, as before.
with open('count_1w.txt') as count_file:
    count_web1t = {
        word: int(count)
        for word, count in (line.strip().split('\t') for line in count_file)
    }

# count_how_to[chapter number][word] -> occurrences of the word in the
# PHRASES part of that chapter.  Chapter numbers start at 1.
count_how_to = defaultdict(lambda: defaultdict(lambda: 0))

# Read the corpus through a context manager so the file handle is closed
# (the bare open(...).read() it replaces left that to the garbage collector).
with open('how.to.say.it.(raw).txt') as corpus_file:
    corpus_text = corpus_file.read()

phrases_marker = '\nPHRASES\n'
chapterno = 1
# The first and the last piece of the '<chapter>' split are skipped.
for chapter in corpus_text.split('<chapter>')[1:-1]:
    # Only the text following the PHRASES heading is counted; str.index
    # raises ValueError if a chapter has no such heading.
    phrases_text = chapter[chapter.index(phrases_marker) + len(phrases_marker):]
    sentences = sent_detector.tokenize(phrases_text)
    # Touch the chapter's entry before counting so that a chapter without
    # any token still gets an (empty) counter; the loops that follow walk
    # range(1, len(...)+1) and therefore need contiguous chapter numbers.
    chapter_counts = count_how_to[chapterno]
    for sentence in sentences:
        for word in words(sentence):
            chapter_counts[word] += 1
    chapterno += 1
    
def is_key(word, count, total):
    """Tell whether *word* counts as a keyword relative to count_web1t.

    The score is (log10(count) - log10(total)) minus
    (log10(count_web1t[word]) - 12); the word qualifies when the score is
    at least 1.  A word that is not in count_web1t never qualifies.

    NOTE(review): the constant 12 presumably is log10 of the size of the
    reference corpus behind count_web1t (about 10**12 tokens) -- confirm.

    Args:
        word: the candidate word.
        count: occurrences of the word in the chapter (must be > 0).
        total: total number of tokens in the chapter (must be > 0).

    Returns:
        True when the score is >= 1, False otherwise.
    """
    if word not in count_web1t:
        return False
    observed_log_freq = math.log10(count) - math.log10(total)
    reference_log_freq = math.log10(count_web1t[word]) - 12
    return observed_log_freq - reference_log_freq >= 1

# keywords[chapter number][word] -> count, for the words of that chapter that
# pass is_key() and occur more than 3 times.
keywords = defaultdict(lambda: defaultdict(lambda: 0))
# Walk every chapter number up to the highest one counted; len(count_how_to)
# would stop short if some chapter number had no entry.
for i in range(1, max(count_how_to, default=0) + 1):
    total = sum(count_how_to[i].values())
    keyword = [(word, count) for word, count in count_how_to[i].items()
               if is_key(word, count, total) and count > 3]
    # Most frequent first; the sort is stable, so ties keep counting order.
    keyword = sorted(keyword, key=lambda x: -x[1])
    # Create the chapter's entry even when no word qualifies, so that
    # len(keywords) equals the number of chapters for the code that later
    # iterates range(1, len(keywords)+1).
    chapter_keywords = keywords[i]
    for k, v in keyword:
        chapter_keywords[k] = v

In [2]:
# Display the keywords (word -> count) selected for chapter 1.
keywords[1]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'your': 23,
             'we': 19,
             'accept': 13,
             'me': 12,
             'pleasure': 10,
             'happy': 10,
             'pleased': 9,
             'dear': 9,
             'm': 7,
             's': 6,
             'able': 5,
             'sincerely': 5,
             'forward': 5,
             'discuss': 5,
             'say': 4,
             'thank': 4,
             'delighted': 4,
             'look': 4,
             'offer': 4,
             'much': 4,
             'invitation': 4,
             'thanks': 4,
             'enclosed': 4})

## 2. 計算各章的詞彙束（關鍵片語）
### 　各章關鍵片語的條件：該詞在某章的出現次數，超過它在所有出現章節中的平均次數
### 　例如 accept 出現在各章的次數 (1, 8), (2, 2), (15, 1)
### 　8 > (8+2+1)/3 所以 accept 是第１章的關鍵詞

In [3]:
import re
def words(text):
    """Return the word and number tokens found in *text*, keeping letter case.

    A token is either a run of ASCII letters, apostrophes and hyphens, or a
    run of digits; any other character separates tokens.
    """
    token_pattern = "([a-zA-Z'-]+|[0-9]+)"
    return re.findall(token_pattern, text)
def ngrams(tokens, n=4):
    """Return every run of *n* consecutive tokens, each joined with a space.

    Args:
        tokens: sequence of string tokens.
        n: length of each n-gram (default 4).

    Returns:
        A list of len(tokens) - n + 1 strings, in order of appearance; an
        empty list when there are fewer than *n* tokens.
    """
    # "+ 1" keeps the last window, the one that ends on the final token;
    # without it the closing n-gram of every token list was dropped.
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [4]:
# Chapter list as "NN. Title" entries separated by "; ".
chapters = '''01. Accept; 02. Confirm; 03. Adjust; 04. Advice; 05. Birthday; 06. Announce; 07. Apologize; 08. Apply; 09. Appointment; 10. Appreciate; 11. Late; 12. Collect; 13. Complaint; 14. Congrat; 15. Contract; 16. CoverLetters; 17. Credit; 18. Disagree; 19. ToEditor; 20. E-mail; 21. Employ; 22. Family; 23. Fax; 24. Follow-up; 25. RaiseFund; 26. Get-well; 27. Goodwill; 28. Holiday; 29. Instruct; 30. Introduce; 31. Invite; 32. Love; 33. Memos; 34. ToNeighbor; 35. Order; 36. Club; 37. Query; 38. Refer; 39. Refuse; 40. Report; 41. Request; 42. Respond; 43. Resume; 44. Sales; 45. Sensitive; 46. Sympathy; 47. Thank-you; 48. Travel; 49. Wedding; 50. Welcome'''
# Each entry becomes a two-item list such as ['01.', 'Accept'].
chapters = [entry.split() for entry in chapters.split('; ')]
# chaptername maps the chapter number (the "NN." part without its final
# character, as an int) to number and title glued together, e.g. 1 -> '01.Accept'.
chaptername = {int(number[:-1]): number + title for number, title in chapters}

# count_chapter[word][chapter number] -> keyword count of `word` in that
# chapter (0 where it is not a keyword); `word` is a lower-cased chapter title.
count_chapter = defaultdict(lambda: defaultdict(lambda: 0))

# Chapter numbers that have a keyword table, in ascending order.
chapter_numbers = sorted(keywords)

# appear_times[i] -> number of chapters in which the title of chapters[i] is
# a keyword.  The same figure is kept per word in appearances_by_word so the
# averaging below does not depend on two containers sharing an order.
appear_times = []
appearances_by_word = {}
for entry in chapters:
    chapt = entry[1].lower()
    appear_time = 0
    for j in chapter_numbers:
        if chapt in keywords[j]:
            count_chapter[chapt][j] = keywords[j][chapt]
            appear_time += 1
        else:
            count_chapter[chapt][j] = 0
    appear_times.append(appear_time)
    appearances_by_word[chapt] = appear_time

# chapter_total_counts[word] -> average count over the chapters where the
# word is a keyword; it stays at the plain total (0) when the word is a
# keyword nowhere.  Summing the stored values directly replaces the former
# loop over range(1, len(count_chapter)+1), which used the number of title
# words as if it were the number of chapters.
chapter_total_counts = defaultdict(lambda: 0)
for k, v in count_chapter.items():
    word_total = sum(v.values())
    appearances = appearances_by_word[k]
    if appearances != 0:
        chapter_total_counts[k] = word_total / appearances
    else:
        chapter_total_counts[k] = word_total

# A title word is a key of a chapter when its count there is strictly above
# its average; pairs are (chapter number, word), grouped by word in chapter
# list order and ascending by chapter number within a word.
chapt_key = [
    (chapter_no, word)
    for word, per_chapter in count_chapter.items()
    for chapter_no, cnts in per_chapter.items()
    if cnts > chapter_total_counts[word]
]

In [5]:
# Display the resulting (chapter number, keyword) pairs.
chapt_key

[(1, 'accept'),
 (2, 'confirm'),
 (10, 'appreciate'),
 (39, 'appreciate'),
 (41, 'appreciate'),
 (47, 'appreciate'),
 (12, 'credit'),
 (17, 'credit'),
 (28, 'family'),
 (46, 'family'),
 (5, 'love'),
 (22, 'love'),
 (32, 'love'),
 (35, 'order'),
 (40, 'report'),
 (49, 'wedding')]

## 3. 做各章關鍵詞的 cluster analysis （利用 Linggle 的 A and B 查詢）

In [6]:
from linggle import Linggle
from collections import defaultdict
import pprint

# Linggle client object; ngramcount() indexes it with a query string.
# NOTE(review): presumably this queries the Linggle web service -- confirm.
linggle = Linggle()

def ngramcount(query):
    """Look up *query* in the module-level ``linggle`` client and return the result.

    NOTE(review): judging by the output printed in this notebook, the result
    is a list of [ngram, count] pairs -- confirm against the Linggle API.
    """
    result = linggle[query]
    return result

# Distinct keywords taken from chapt_key.  dict.fromkeys removes duplicates
# while keeping first-seen order, so the printed list and the query built
# from it are identical on every run; list(set(...)) gave an order that
# changed with string hashing.
accept_words = list(dict.fromkeys(y for x, y in chapt_key))
print (accept_words)
print ()

# "A and B" query with all keywords joined by '/' on both sides.
# NOTE(review): '/' is presumably Linggle's "any of these words" operator -- confirm.
alternatives = '/'.join(accept_words)
and_grams = ngramcount('%s and %s'%(alternatives, alternatives))
pprint.pprint (and_grams)

['accept', 'credit', 'wedding', 'family', 'love', 'order', 'confirm', 'report', 'appreciate']

[['family and love', 30965],
 ['credit and credit', 29086],
 ['love and appreciate', 17201],
 ['love and love', 16494],
 ['love and family', 15699],
 ['report and credit', 13485],
 ['love and accept', 10694],
 ['family and family', 10223],
 ['order and credit', 9542],
 ['accept and love', 5243],
 ['appreciate and love', 4868],
 ['accept and appreciate', 4647],
 ['report and order', 3850],
 ['order and order', 3665],
 ['report and report', 2792],
 ['wedding and wedding', 2650],
 ['appreciate and accept', 1679],
 ['order and family', 1618],
 ['family and wedding', 1522],
 ['wedding and family', 1391],
 ['order and confirm', 1101],
 ['order and report', 844],
 ['accept and confirm', 780],
 ['family and order', 744],
 ['credit and report', 723],
 ['report and confirm', 577],
 ['report and accept', 559],
 ['order and accept', 520],
 ['love and wedding', 515],
 ['confirm and report', 402],
 ['accep