<h2>cc_n_gram_v2.ipynb</h2>
<br>Author: Frank Greco
<br>Create data and label dictionaries suitable for vectorization
<br>Example n-gram extraction
<br>Example k-means calculation
<br>Note: Input transcript files need to be stripped of annotations and foreign languages

In [1]:
#!/usr/bin/env python
# coding: utf-8
from __future__ import print_function

__source__ = "cc_n_gram_v2.py"
__author__ = "Frank J. Greco"
__copyright__ = "Copyright 2015-2018, Frank J. Greco"
__credits__ = []
__license__ = "GPL"
__version__ = "2"
__maintainer__ = "Frank J. Greco"
__email__ = ""
__status__ = "Development"

import json

from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import HashingVectorizer

import datetime

###########################################################
# Extract X (list of document texts) and y (list of tag lists) from text_dict
# Returns X and y
###########################################################

def extractXy(text_dict):
    """Split a document dictionary into parallel lists of texts and tag lists.

    Args:
        text_dict: mapping of document key -> record, where every record
            provides a 'text' entry and a 'tags' entry.

    Returns:
        (X, y): X holds each record's 'text' and y the matching 'tags',
        both in the iteration order of text_dict.

    Raises:
        KeyError: if a record lacks 'text' or 'tags'.
    """
    print("\n*** Begin extractXy ***\n")
    y = []
    X = []

    print("\n[Begin text_dict (key, doc[0:10], taglist)]\n")
    for k, record in text_dict.items():
        y.append(record['tags'])
        X.append(record['text'])
        print(k, record['text'][0:10], record['tags'])

    print("\n[End of text_dict]")

    print("\nReturning X,y (Here are first 10 characters of the first 3 X records and the assoicated y values:)\n")
    # Slice before zipping: on Python 3 zip() returns an iterator that cannot
    # be subscripted, so zip(X, y)[0:3] only works on Python 2.
    for a, b in zip(X[0:3], y[0:3]):
        print(a[0:10], b)
    print("\n*** End extractXy ***\n")
    return X, y

###########################################################
# encode_documents
# Returns the CountVectorizer-encoded matrix and an index -> n-gram lookup dict
###########################################################

def encode_documents(X, n_gram_lower=1, n_gram_upper=1):
    """Count-vectorize the documents in X over the requested n-gram range.

    Args:
        X: iterable of document strings handed to CountVectorizer.fit_transform.
        n_gram_lower: smallest n-gram size to extract.
        n_gram_upper: largest n-gram size to extract.

    Returns:
        (encoded_matrix, word_index): the matrix produced by fit_transform and
        a dict mapping each vocabulary column index back to its n-gram.
    """
    print("\n*** Begin encode_documents ***\n")

    # vectorizer = CountVectorizer()
    vectorizer = CountVectorizer(
        ngram_range=(n_gram_lower, n_gram_upper),
        token_pattern=r'\b\w+\b',
        min_df=1,
    )
    print(type(vectorizer))

    encoded_matrix = vectorizer.fit_transform(X)

    # vocabulary_ maps n-gram -> column index; invert it for index lookups.
    vocabulary = vectorizer.vocabulary_
    word_index = {vocabulary[term]: term for term in vocabulary.keys()}

    print("\n*** End encode_documents ***\n")

    return encoded_matrix, word_index


def print_encoded_matrix(encoded_matrix, word_index, col_start=82600, col_stop=82610):
    """Print diagnostic information about a sparse document/n-gram matrix.

    Prints the matrix type, its row count, the sparse representation of the
    first row, a dense view of columns [col_start, col_stop) for every row,
    the word_index entries for those columns, and the sum of all counts.

    Args:
        encoded_matrix: sparse matrix supporting row/column slicing,
            ``toarray()`` and ``sum()`` (e.g. a scipy CSR matrix).
        word_index: dict mapping column index -> n-gram string.
        col_start: first column (inclusive) of the window to display.
        col_stop: last column (exclusive) of the window to display.

    Returns:
        None. All results are written to stdout.
    """
    print("\n*** Begin print_encoded_matrix ***\n")

    print(type(encoded_matrix))

    # shape[0] is the row count; no need to densify the matrix to obtain it.
    print("\nlength encoded_matrix:", encoded_matrix.shape[0])

    print("\nsparce encoded_matrix[0]:\n")

    print(encoded_matrix[0])

    print("\nsparce encoded_matrix.toarray()[][{}:{}]\n".format(col_start, col_stop))

    # Densify only the requested column window rather than the whole matrix,
    # which can be very wide for n-gram vocabularies.
    for item in encoded_matrix[:, col_start:col_stop].toarray():
        print(item)

    for x in range(col_start, col_stop):
        if x in word_index:
            print(x, word_index[x])

    # Let the sparse matrix sum its stored values instead of looping over
    # every cell of a dense copy in Python.
    print('total n_grams:', encoded_matrix.sum())

############ n_gram_extractor ##############

def n_gram_extractor(text_dict, n_gram_lower=1, n_gram_upper=1):
    """Extract the n-grams of every document in text_dict.

    Args:
        text_dict: mapping of document key -> record with a 'text' entry.
        n_gram_lower: smallest n-gram size to extract.
        n_gram_upper: largest n-gram size to extract.

    Returns:
        (ng_list, reverse_index): ng_list[i] is the n-gram list of the i-th
        document visited and reverse_index[i] is that document's key.
    """
    print("\n*** Begin n_gram_extractor ***")

    # Only the analyzer callable is needed here; nothing is fitted.
    analyze = CountVectorizer(
        ngram_range=(n_gram_lower, n_gram_upper),
        token_pattern=r'\b\w+\b',
        min_df=1,
    ).build_analyzer()

    ng_list = []
    reverse_index = []
    ng_total = 0

    for position, (doc_key, record) in enumerate(text_dict.items()):
        document = record['text']
        grams = analyze(document)
        ng_list.append(grams)

        print("Excerpt:", document[0:20])
        print("Count: {} Doc Key: {}, n-gram len: {}, n-gram excerpt: {}\n".format(
            position, doc_key, len(grams), grams[0:10]))

        ng_total += len(grams)
        reverse_index.append(doc_key)

    print("ng_total: ", ng_total)

    print("\n*** End n_gram_extractor ***\n")

    return ng_list, reverse_index


###########################################################
# K-Means
# Performs K-Means cluster analysis on the encoded matrix
###########################################################

def KM(encoded_matrix, n_clusters=3):
    """Cluster the rows of encoded_matrix with scikit-learn's KMeans.

    Prints the fitted estimator, the cluster centers, the number of labels,
    the predictions and the labels along the way.

    Args:
        encoded_matrix: feature matrix passed to KMeans.fit / KMeans.predict.
        n_clusters: number of clusters to form.

    Returns:
        The result of KMeans.predict on encoded_matrix.
    """
    print("\nBegin KM\n")

    # Imported lazily, after the banner is printed.
    from sklearn.cluster import KMeans

    print("K-means calculation...")
    model = KMeans(n_clusters=n_clusters)

    print("fit")
    fitted = model.fit(encoded_matrix)
    print(fitted)

    print("centers")
    centers = model.cluster_centers_
    print(centers)

    label_count = len(model.labels_)
    print("len m", label_count)

    print("predict")
    prediction = model.predict(encoded_matrix)
    print(prediction)

    print("labels")
    print(model.labels_)
    print("\nEnd KM\n")

    return prediction

def main():
    """Run the demo pipeline: load, extract n-grams, encode, cluster, report.

    Reads the document dictionary from 'text_dict2.txt' (JSON), extracts and
    encodes bigrams, clusters the encoded matrix into three groups and prints
    the cluster assigned to each document key.
    """
    started = datetime.datetime.now()
    print('Start time: {:%Y-%m-%d %H:%M:%S}'.format(started))

    with open('text_dict2.txt') as data_file:
        text_dict = json.load(data_file)

    X, y = extractXy(text_dict)

    ng_list, reverse_index = n_gram_extractor(text_dict, n_gram_lower=2, n_gram_upper=2)

    print("\nreverse_index\n")
    print(reverse_index)

    encoded_matrix, word_index = encode_documents(X, n_gram_lower=2, n_gram_upper=2)

    print_encoded_matrix(encoded_matrix, word_index)

    prediction = KM(encoded_matrix, n_clusters=3)

    for row, cluster in enumerate(prediction):
        print(row, reverse_index[row], cluster)

    # NOTE(review): three clusters are requested above, but only cluster 0 is
    # separated here; clusters 1 and 2 both end up in s1 -- confirm intent.
    s0 = set()
    s1 = set()
    for position, pair in enumerate(zip(text_dict, prediction)):
        print(position, pair)
        doc_key, cluster = pair
        target = s0 if cluster == 0 else s1
        target.add(doc_key)

    print()
    print('s0:', s0)

    print()
    print('s1:', s1)

    finished = datetime.datetime.now()
    print('End time: {:%Y-%m-%d %H:%M:%S}'.format(finished))


if __name__ == "__main__":
    # Run the pipeline only when executed as a script, not on import.
    main()

Start time: 2018-03-25 20:50:20

*** Begin extractXy ***


[Begin text_dict (key, doc[0:10], taglist)]

216  Hi, welco [u'TW24', u'TW02', u'TW17', u'TW01', u'TW02', u'TW17', u'TW02']
217   ...so yo [u'TW08m', u'TW24TW08', u'TW18', u'TW01', u'TW02', u'TW01']
214  Write at  [u'TW19', u'TW14', u'TW09', u'TW05', u'TW02', u'TW01', u'TW04']
813  Two times [u'TW24TW02', u'TW23', u'TW13', u'TW02']
213  Does ever [u'TW05', u'TW02']
218  This is a [u'TW06', u'TW15', u'TW13', u'TW15', u'TW02', u'TW06', u'TW15', u'TW09', u'TW08', u'TW13', u'TW15', u'TW02']
132  Emotions. [u'TW08', u'TW06', u'TW08', u'TW16', u'TW01']
131   Inner ci [u'TW15', u'TW24TW08', u'TW15', u'TW15', u'TW15', u'TW08']
137   At the e [u'TW24TW06', u'TW13', u'TW02', u'TW05', u'TW17', u'TW02', u'TW01', u'TW13', u'TW21', u'TW18', u'TW05', u'TW18', u'TW11', u'TW02', u'TW17', u'TW06', u'TW20', u'TW01']
135   Well, yo [u'TW10', u'TW13']
134   Are you  [u'TW24TW17', u'TW24TW17', u'TW15', u'TW23', u'TW22', u'TW09', u'TW05', u'TW02', u'

  ...so yo [u'TW08m', u'TW24TW08', u'TW18', u'TW01', u'TW02', u'TW01']
 Write at  [u'TW19', u'TW14', u'TW09', u'TW05', u'TW02', u'TW01', u'TW04']

*** End extractXy ***


*** Begin n_gram_extractor ***
Excerpt:  Hi, welcome to the 
Count: 0 Doc Key: 216, n-gram len: 1172, n-gram excerpt: [u'hi welcome', u'welcome to', u'to the', u'the library', u'library media', u'media center', u'center at', u'at the', u'the entrance', u'entrance we']

Excerpt:   ...so you can't co
Count: 1 Doc Key: 217, n-gram len: 2048, n-gram excerpt: [u'so you', u'you can', u'can t', u't count', u'count it', u'it out', u'out completely', u'completely hmm', u'hmm so', u'so we']

Excerpt:  Write at the top of
Count: 2 Doc Key: 214, n-gram len: 442, n-gram excerpt: [u'write at', u'at the', u'the top', u'top of', u'of your', u'your paper', u'paper the', u'the color', u'color that', u'that you']

Excerpt:  Two times two--
 Ag
Count: 3 Doc Key: 813, n-gram len: 1986, n-gram excerpt: [u'two times', u'times two', u'two ag

Excerpt:   Megan, what quadra
Count: 38 Doc Key: 990, n-gram len: 1896, n-gram excerpt: [u'megan what', u'what quadrant', u'quadrant am', u'am i', u'i going', u'going to', u'to be', u'be centered', u'centered in', u'in quadrant']

Excerpt:   Okay, so, write yo
Count: 39 Doc Key: 120, n-gram len: 2041, n-gram excerpt: [u'okay so', u'so write', u'write your', u'your conjectures', u'conjectures down', u'down denominators', u'denominators are', u'are repeating', u'repeating actually', u'actually let']

Excerpt:  A lot of people tho
Count: 40 Doc Key: 262, n-gram len: 3408, n-gram excerpt: [u'a lot', u'lot of', u'of people', u'people thought', u'thought the', u'the mass', u'mass of', u'of the', u'the water', u'water in']

Excerpt:   Do not put any but
Count: 41 Doc Key: 122, n-gram len: 2703, n-gram excerpt: [u'do not', u'not put', u'put any', u'any button', u'button caps', u'caps in', u'in the', u'the center', u'center of', u'of the']

Excerpt:  Now, what you just 
Count: 42 Doc Key: 12, n

Excerpt:   We haven't talked 
Count: 127 Doc Key: 66, n-gram len: 2019, n-gram excerpt: [u'we haven', u'haven t', u't talked', u'talked about', u'about some', u'some things', u'things that', u'that we', u'we do', u'do when']

Excerpt:  
  Starter-stopper,
Count: 128 Doc Key: 68, n-gram len: 2413, n-gram excerpt: [u'starter stopper', u'stopper opposite', u'opposite side', u'side okay', u'okay everybody', u'everybody has', u'has their', u'their marbles', u'marbles how', u'how many']

Excerpt:   Well I can start. 
Count: 129 Doc Key: 176, n-gram len: 2141, n-gram excerpt: [u'well i', u'i can', u'can start', u'start i', u'i just', u'just kind', u'kind of', u'of wanted', u'wanted to', u'to follow']

Excerpt:   Okay let's get bus
Count: 130 Doc Key: 173, n-gram len: 1796, n-gram excerpt: [u'okay let', u'let s', u's get', u'get busy', u'busy so', u'so before', u'before we', u'we change', u'change the', u'the car']

Excerpt:   Read.
 I want to g
Count: 131 Doc Key: 171, n-gram len: 1662, n-gra

Count: 178 Doc Key: 42, n-gram len: 1771, n-gram excerpt: [u'guys ready', u'ready yes', u'yes okay', u'okay number', u'number one', u'one start', u'start give', u'give your', u'your idea', u'idea read']

Excerpt:  Okay. So, what's go
Count: 179 Doc Key: 43, n-gram len: 2146, n-gram excerpt: [u'okay so', u'so what', u'what s', u's going', u'going on', u'on with', u'with this', u'this group', u'group you', u'you go']

Excerpt:   Okay. Look at your
Count: 180 Doc Key: 40, n-gram len: 1706, n-gram excerpt: [u'okay look', u'look at', u'at your', u'your music', u'music we', u'we are', u'are doing', u'doing four', u'four parts', u'parts obviously']

Excerpt: Okay. But before you
Count: 181 Doc Key: 41, n-gram len: 1943, n-gram excerpt: [u'okay but', u'but before', u'before you', u'you check', u'check over', u'over what', u'what you', u'you need', u'need to', u'to make']

Excerpt:  Okay, I can see two
Count: 182 Doc Key: 1, n-gram len: 2166, n-gram excerpt: [u'okay i', u'i can', u'can see', u'


*** End encode_documents ***


*** Begin print_encoded_matrix ***

<class 'scipy.sparse.csr.csr_matrix'>

length encoded_matrix: 217

sparce encoded_matrix[0]:

  (0, 3857)	1
  (0, 122592)	1
  (0, 114122)	1
  (0, 129733)	1
  (0, 61047)	1
  (0, 112894)	1
  (0, 111832)	1
  (0, 66240)	1
  (0, 131713)	1
  (0, 56844)	1
  (0, 73735)	1
  (0, 58306)	1
  (0, 116335)	1
  (0, 99480)	1
  (0, 87266)	1
  (0, 85849)	1
  (0, 74753)	1
  (0, 77489)	1
  (0, 17482)	1
  (0, 62660)	1
  (0, 87773)	1
  (0, 2951)	1
  (0, 129726)	1
  (0, 74789)	2
  (0, 132259)	1
  :	:
  (0, 57968)	1
  (0, 20119)	1
  (0, 87299)	1
  (0, 78689)	1
  (0, 106955)	1
  (0, 98365)	3
  (0, 70669)	1
  (0, 98994)	1
  (0, 47260)	1
  (0, 8418)	1
  (0, 34536)	1
  (0, 24246)	1
  (0, 93534)	1
  (0, 83484)	1
  (0, 124040)	1
  (0, 33845)	1
  (0, 108363)	1
  (0, 12223)	1
  (0, 21198)	1
  (0, 66038)	1
  (0, 60178)	1
  (0, 108958)	5
  (0, 116815)	1
  (0, 124546)	1
  (0, 49153)	1

sparce encoded_matrix.toarray()[][82600:82610]

[0 0 0 0 0 0 0 0 0 0]

210 1364 0
211 1365 0
212 1362 0
213 1363 0
214 1360 0
215 806 1
216 804 1
0 (u'216', 0)
1 (u'217', 1)
2 (u'214', 0)
3 (u'813', 0)
4 (u'213', 1)
5 (u'218', 0)
6 (u'132', 1)
7 (u'131', 0)
8 (u'137', 0)
9 (u'135', 1)
10 (u'134', 1)
11 (u'166', 0)
12 (u'24', 0)
13 (u'26', 2)
14 (u'27', 1)
15 (u'20', 1)
16 (u'21', 0)
17 (u'1078', 1)
18 (u'1835', 1)
19 (u'1838', 1)
20 (u'1839', 1)
21 (u'28', 0)
22 (u'29', 2)
23 (u'1071', 1)
24 (u'289', 0)
25 (u'1373', 0)
26 (u'1372', 0)
27 (u'1375', 1)
28 (u'4', 0)
29 (u'1377', 1)
30 (u'161', 2)
31 (u'280', 0)
32 (u'812', 0)
33 (u'163', 0)
34 (u'285', 1)
35 (u'1069', 1)
36 (u'996', 0)
37 (u'981', 0)
38 (u'990', 0)
39 (u'120', 0)
40 (u'262', 2)
41 (u'122', 1)
42 (u'12', 2)
43 (u'265', 1)
44 (u'128', 0)
45 (u'129', 0)
46 (u'269', 1)
47 (u'268', 0)
48 (u'118', 1)
49 (u'1060', 1)
50 (u'1062', 0)
51 (u'51', 0)
52 (u'50', 0)
53 (u'53', 0)
54 (u'52', 1)
55 (u'828', 1)
56 (u'825', 1)
57 (u'297', 0)
58 (u'826', 1)
59 (u'1380', 0)
60 (u'293', 1)
61 (u'1382', 1)
62 (u