In [1]:
import pandas as pd
import numpy as np
from glob2 import glob
import re
import nltk
import os 
# NOTE(review): hard-coded absolute path on one user's machine. Every relative path used
# later in this notebook (the '*.txt' glob and the CSV output folders written by
# create_files) is resolved against this working directory.
os.chdir('/Users/gracelyons/Desktop/MSDS/Capstone/Transcripts/Teacher and Avatar/Temi Transcripts_02_16_23/')

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [2]:
# Every .txt transcript in the current working directory, in sorted filename order
transcripts = sorted(glob('*.txt'))

# Ordered hierarchy of content objects: index levels of the token table, coarsest to finest
OHCO = ['speaker_id', 'line_num', 'sent_num', 'token_num']

# File IDs whose teacher is labelled 'Speaker 1:' / 'Speaker 3:' in the transcript;
# create_files treats every other file's teacher as 'Speaker 2:'
speaker_1s = [218, 220, 314]
speaker_3s = [212, 325, 331, 335, 351, 258]

In [27]:
def create_files(trans_list, OHCO=OHCO, teacher_only = True, tokenizer = nltk.WhitespaceTokenizer(), 
                 ws = False, remove_blank_strings = True, add_stop_words = True, add_stems = True,
                 stems = [PorterStemmer(), SnowballStemmer('english'), LancasterStemmer()], 
                 alternate_rank = False, bag = 'speaker_id', count_method = 'n', 
                 tf_method = 'sum', tf_norm_k = 0.5, idf_method = 'standard', save_work = False):
    
    """
    Build the corpus tables (library, docs, tokens, vocab, bag-of-words, TFIDF) from transcript files.

    This function will return six dataframes: 
        1. library: speaker_ids and corresponding file names
        2. docs: speaker_ids, line_nums, and a string of text for that line
        3. tokens: token broken out by speaker_id, line_num, sent_num, and token_num, it also has the part of speech 
        4. vocab: tokens aggregated with rank, frequency, stems, and some other metrics
        5. bow (bag-of-words): term_ids, with the bag levels, counts, tf and tfidf 
        6. tfidf (matrix): bag-by-term TFIDF matrix used for future analysis
        
    Potential Arguments:
        1. trans_list: list of transcript file names; the part of each name before the first '_' 
                    (with 'th' removed) must be the integer file ID
        2. OHCO: ordered hierarchy of content objects; the first two levels must be 'speaker_id' and 'line_num'
        3. teacher_only: do you only want just the teacher's text or teacher and avatar; default is True.
                    The teacher's speaker label is looked up in the module-level lists speaker_1s / speaker_3s
                    (any file ID in neither list is treated as 'Speaker 2:')
        4. tokenizer: tokenizer used when ws is True; default is nltk.WhitespaceTokenizer()
        5. ws: if True tokenize sentences with `tokenizer`, otherwise with nltk.word_tokenize; default is False
        6. remove_blank_strings: remove tokens whose cleaned term is an empty string; default is True
        7. add_stop_words: add identification of stop words to vocab table; default is True
        8. add_stems: add stems to vocab table; default is True
        9. stems: list of stemmers used to stem words in the vocab table;
                    default is [PorterStemmer(), SnowballStemmer('english'), LancasterStemmer()]
        10. alternate_rank: if True, terms with the same frequency share a rank (dense rank); 
                    default is False (every term gets its own rank, ordered by frequency)
        11. bag: bag to group the terms together to create the bag of words and TFIDF; default is 'speaker_id'
                    must be one of the levels in OHCO
        12. count_method: count used to build the TFIDF, number of occurrences ('n') or presence flag ('c'); 
                    default is 'n'
        13. tf_method: ways to compute the term frequency; options are 'sum', 'max', 'log', 'raw', 'double_norm', 
                    or 'binary'; default is 'sum' (also used, with a printed notice, for unknown values)
        14. tf_norm_k: only needed if tf_method is set to 'double_norm'; default is 0.5
        15. idf_method: method to create the inverse document frequency; options are 'standard', 'max', 
                    and 'smooth'; default is 'standard' (also used, with a printed notice, for unknown values)
        16. save_work: save library, docs, tokens, bow and tfidf as csvs under 'teacher_only/' or 
                    'teacher_and_avatar/' in the working directory; need to set working directory using 
                    os.chdir before running the function; default is False 

    Raises:
        ValueError: if bag is not a level of OHCO, or count_method is not 'n' or 'c'
    """
    
    # Fail fast on arguments that would otherwise only blow up after the (slow) tokenizing step
    ohco_levels = list(OHCO)
    if bag not in ohco_levels:
        raise ValueError('bag must be one of {}; got {!r}'.format(ohco_levels, bag))
    if count_method not in ('n', 'c'):
        raise ValueError("count_method must be 'n' or 'c'; got {!r}".format(count_method))
    
    my_lib = []
    my_doc = []
    
    def tag_sentence(sentence):
        # Tokenize one sentence and attach a part-of-speech tag to every token
        if ws:
            words = tokenizer.tokenize(sentence)
        else:
            words = nltk.word_tokenize(sentence)
        return pd.Series(nltk.pos_tag(words))
    
    for trans_file in trans_list:
        
        # Get ID from filename
        speaker_id = int(trans_file.split('_')[0].replace('th', ''))
        print("File ID:", speaker_id)
        
        # Import file; columns are separated by a run of 23 spaces
        df = pd.read_csv(trans_file, engine = 'python', delimiter = '                       ', header = None) # 23 spaces
        df.columns = ['Speaker', 'Timestamp', 'line_str']
        df.line_str = df.line_str.str.strip()
        df['speaker_id'] = speaker_id
        df.index.name = 'line_num'
        
        # filter by speaker number to get only the teachers speaking
        if teacher_only:
            if speaker_id in speaker_1s:
                teacher_label = 'Speaker 1:'
            elif speaker_id in speaker_3s:
                teacher_label = 'Speaker 3:'
            else: 
                teacher_label = 'Speaker 2:'
            df = df[df.Speaker == teacher_label]
        
        # Group to one row per (speaker_id, line_num)
        df = df.groupby(OHCO[:2]).line_str.apply(lambda x: '\n'.join(x)).to_frame()
        
        # Set index (same two levels that were used for the grouping)
        df.index.names = OHCO[:2]

        # Register
        my_lib.append((speaker_id, trans_file))
        my_doc.append(df)

    docs = pd.concat(my_doc)
    library = pd.DataFrame(my_lib, columns=['speaker_id', 'book_file']).set_index('speaker_id')
    
    print('Tokenizing')
    # Lines to Sentences
    tokens = docs.line_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x))).stack().to_frame('sent_str')
    
    # Sentences to Tokens
    tokens = tokens.sent_str.apply(tag_sentence).stack().to_frame('pos_tuple')
    
    # Grab info from the (token, pos) tuple
    tokens['pos'] = tokens.pos_tuple.apply(lambda x: x[1])
    tokens['token_str'] = tokens.pos_tuple.apply(lambda x: x[0])
    
    # Add index and do some cleaning
    tokens.index.names = OHCO
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by default,
    # which would silently leave punctuation and underscores in place
    tokens['term_str'] = tokens['token_str'].str.lower().str.replace(r'[\W_]', '', regex = True)
    if remove_blank_strings:
        # copy so the column assignments below do not write into a slice of the unfiltered frame
        tokens = tokens[tokens.term_str != ''].copy()
    
    # Reduce and extract vocabulary from tokens table 
    print('creating vocab')
    # Naming the counts and the index explicitly keeps the result independent of the
    # pandas version (value_counts changed its default names in pandas 2.0)
    vocab = tokens.term_str.value_counts().rename('n').rename_axis('term_str').sort_index().reset_index()
    vocab.index.name = 'term_id'
    vocab['num'] = vocab.term_str.str.match(r"\d+").astype('int')
    
    # Add stopwords
    if add_stop_words:
        stop_words = set(nltk.corpus.stopwords.words('english'))
        vocab['stop'] = vocab.term_str.isin(stop_words).astype('int')
        
    # Add stems
    # Used Porter, Snowball, and Lancaster stemmers for this but there are other options
    if add_stems:
        for stem in stems:
            # e.g. PorterStemmer -> 'porter'; used as the column suffix
            str_stem = type(stem).__name__.split('Stemmer')[0].lower()
            vocab['stemmer_' + str_stem] = vocab.term_str.apply(stem.stem)
    
    # update token and vocab tables
    tokens['term_id'] = tokens.term_str.map(vocab.reset_index().set_index('term_str').term_id)
    # most frequent part-of-speech tag observed for each term
    vocab['pos_max'] = tokens.groupby(['term_id', 'pos']).size().unstack().idxmax(axis = 1)
    
    if alternate_rank:
        # dense rank: terms with the same frequency share a rank, rank 1 = highest frequency
        vocab['term_rank'] = vocab.n.rank(method = 'dense', ascending = False).astype('int')
    else:
        # one rank per term, in descending order of frequency
        vocab = vocab.sort_values('n', ascending = False).reset_index()
        vocab.index.name = 'term_rank'
        vocab = vocab.reset_index().set_index('term_id')
        vocab['term_rank'] = vocab['term_rank'] + 1
        
    vocab['p'] = vocab.n/vocab.n.sum() # prior, or marginal, probability of a term
    vocab['h'] = vocab.p * np.log2(1/vocab.p) # self entropy of each word
    
    # Create BOW
    print('creating bag-of-words')
    print('bag is: ' + bag)
    # a bag is identified by every OHCO level down to and including the requested one
    bag_levels = ohco_levels[:ohco_levels.index(bag) + 1]
    
    # default bag is speaker
    bow = tokens.groupby(bag_levels + ['term_id']).term_id.count().to_frame('n')
    bow['c'] = bow.n.astype('bool').astype('int')
    
    # Create TFIDF
    print('creating TFIDF')
    # bag-by-term count matrix; absent (bag, term) pairs become 0
    dtcm = bow[count_method].unstack().fillna(0).astype('int')
    
    print('tf method: ', tf_method)
    # work on the transpose (terms x bags) so column-wise reductions are per bag
    counts_t = dtcm.T
    if tf_method == 'sum':
        tf = counts_t / counts_t.sum()
    elif tf_method == 'max':
        tf = counts_t / counts_t.max()
    elif tf_method == 'log':
        tf = np.log10(1 + counts_t)
    elif tf_method == 'raw':
        tf = counts_t
    elif tf_method == 'double_norm':
        tf = counts_t / counts_t.max()
        # masking with tf > 0 leaves NaN (not tf_norm_k) for terms absent from a bag
        tf = tf_norm_k + (1 - tf_norm_k) * tf[tf > 0]
    elif tf_method == 'binary':
        tf = counts_t.astype('bool').astype('int')
    else:
        print('tf method selected is not supported. default used is "sum"')
        tf = counts_t / counts_t.sum()
    tf = tf.T
    
    # number of bags each term appears in, and total number of bags
    doc_freq = dtcm[dtcm > 0].count()
    n_docs = dtcm.shape[0]
    
    print('idf method: ', idf_method)
    if idf_method == 'standard':
        idf = np.log10(n_docs / doc_freq)
    elif idf_method == 'max':
        idf = np.log10(doc_freq.max() / doc_freq)
    elif idf_method == 'smooth':
        idf = np.log10((1 + n_docs) / (1 + doc_freq)) + 1
    else:
        print('idf method not supported. default used is "standard"')
        idf = np.log10(n_docs / doc_freq)
        
    tfidf = tf * idf
    
    # update vocab and bow (values align on the (bag levels, term_id) / term_id indexes)
    bow['tf'] = tf.stack()
    bow['tfidf'] = tfidf.stack()
    
    vocab['df'] = doc_freq
    vocab['idf'] = idf
    vocab['tfidf_mean'] = tfidf[tfidf > 0].mean().fillna(0)
    vocab['tfidf_sum'] = tfidf.sum()
    vocab['tfidf_median'] = tfidf[tfidf > 0].median().fillna(0)
    vocab['tfidf_max'] = tfidf.max()
    
    # save work as csvs 
    if save_work:
        print('saving work')
        out_dir = 'teacher_only' if teacher_only else 'teacher_and_avatar'
        # exist_ok avoids the FileExistsError a second run would hit on an existing folder
        os.makedirs(out_dir, exist_ok = True)
        # NOTE(review): vocab is returned but not written to disk here -- confirm that is intended
        library.to_csv(os.path.join(out_dir, 'LIB.csv'))
        docs.to_csv(os.path.join(out_dir, 'DOC.csv'))
        tokens.to_csv(os.path.join(out_dir, 'TOKEN.csv'))
        bow.to_csv(os.path.join(out_dir, 'BOW.csv'))
        tfidf.to_csv(os.path.join(out_dir, 'TFIDF.csv'))
    
    print("Done.")
    return library, docs, tokens, vocab, bow, tfidf

In [28]:
# Build all six tables with the default settings (teacher text only, bag = 'speaker_id');
# save_work = True also writes CSVs into 'teacher_only/' under the current working directory.
LIB, DOC, TOKEN, VOCAB, BOW, TFIDF = create_files(transcripts, save_work = True)

File ID: 201
File ID: 202
File ID: 203
File ID: 204
File ID: 205
File ID: 206
File ID: 207
File ID: 208
File ID: 209
File ID: 210
File ID: 211
File ID: 212
File ID: 213
File ID: 214
File ID: 215
File ID: 216
File ID: 217
File ID: 218
File ID: 219
File ID: 220
File ID: 221
File ID: 222
File ID: 223
File ID: 224
File ID: 225
File ID: 226
File ID: 227
File ID: 228
File ID: 229
File ID: 229
File ID: 301
File ID: 302
File ID: 303
File ID: 304
File ID: 305
File ID: 306
File ID: 307
File ID: 308
File ID: 309
File ID: 310
File ID: 311
File ID: 312
File ID: 313
File ID: 314
File ID: 315
File ID: 316
File ID: 317
File ID: 318
File ID: 319
File ID: 320
File ID: 321
File ID: 322
File ID: 323
File ID: 324
File ID: 325
File ID: 326
File ID: 327
File ID: 328
File ID: 329
File ID: 330
File ID: 331
File ID: 332
File ID: 333
File ID: 334
File ID: 335
File ID: 336
File ID: 337
File ID: 338
File ID: 339
File ID: 340
File ID: 341
File ID: 342
File ID: 343
File ID: 344
File ID: 345
File ID: 346
File ID: 347

  tokens['term_str'] = tokens['token_str'].str.lower().str.replace('[\W_]', '')


creating vocab
creating bag-of-words
bag is: speaker_id
creating TFIDF
tf method:  sum
idf method:  standard
saving work
Done.


In [29]:
# Ten random rows of the document table; no random_state is passed, so rows differ on each run
DOC.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,line_str
speaker_id,line_num,Unnamed: 2_level_1
305,30,"Good stuff. You know, I think it's very cool t..."
203,3,"So much. It's new. All right, so let's go to o..."
344,23,Thank you.
301,39,"So. Okay, good. Uh, Mina, do you have any ways..."
319,38,This entire movie? Maybe you could share the p...
208,45,All right. So when we wanna use our materials ...
229,41,"We'll write your, write your idea down later a..."
327,45,"Mina. We'll, why don't you wait until we're do..."
341,25,"Yes, I agree. I think that's a great norm. Mm-..."
323,42,Yes. Thank you.


In [30]:
# First ten rows of the document table (one row per speaker_id / line_num)
DOC.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,line_str
speaker_id,line_num,Unnamed: 2_level_1
201,1,Good morning class. I'm Ms. Tachi. How are you...
201,3,Good. That's good to hear. So today we're gonn...
201,5,"Yeah. Good. Emily, Carlos, you guys Nice. So w..."
201,7,"Okay. Um, Emily, um, after small group, we can..."
201,9,"No worries. Yeah, we can talk about it. Um, Ca..."
201,11,"Um, so Carlos wants to share and one of our ex..."
201,13,Oh yeah. That's a great idea. I I love an idea...
201,15,"Yeah, of course. Um, after small group we can ..."
201,17,Yeah. And if anyone else wants to go over scie...
201,19,"Uh, so can someone share one of the expectatio..."


In [31]:
# First rows of the library table: speaker_id -> source transcript file name
LIB.head()

Unnamed: 0_level_0,book_file
speaker_id,Unnamed: 1_level_1
201,201_1.24.20_S_SC.txt
202,202_1.30.20_S_SC.txt
203,203_1.30.20_S_SC.txt
204,204_1.30.20_S_SC.txt
205,205_1.30.20_S_SC.txt


In [32]:
# First ten rows of the token table, indexed by the four OHCO levels
TOKEN.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,term_id
speaker_id,line_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
201,1,0,0,"(Good, JJ)",JJ,Good,good,542
201,1,0,1,"(morning, NN)",NN,morning,morning,828
201,1,0,2,"(class, NN)",NN,class,class,231
201,1,1,0,"(I, PRP)",PRP,I,i,621
201,1,1,1,"('m, VBP)",VBP,'m,m,764
201,1,1,2,"(Ms., NNP)",NNP,Ms.,ms,838
201,1,1,3,"(Tachi, NNP)",NNP,Tachi,tachi,1298
201,1,2,0,"(How, WRB)",WRB,How,how,613
201,1,2,1,"(are, VBP)",VBP,are,are,86
201,1,2,2,"(you, PRP)",PRP,you,you,1545


In [33]:
# Ten random rows of the token table; no random_state is passed, so rows differ on each run
TOKEN.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,term_id
speaker_id,line_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
216,27,1,29,"(that, DT)",DT,that,that,1333
323,17,0,1,"(you, PRP)",PRP,you,you,1545
205,7,0,0,"(Hey, NNP)",NNP,Hey,hey,594
304,23,0,31,"(to, TO)",TO,to,to,1371
229,59,5,0,"(So, RB)",RB,So,so,1215
212,16,3,18,"(at, IN)",IN,at,at,102
346,33,6,5,"(take, VB)",VB,take,take,1301
326,9,1,0,"(Uh, NNP)",NNP,Uh,uh,1416
222,13,0,0,"(Thank, NNP)",NNP,Thank,thank,1331
223,23,4,5,"(expectation, NN)",NN,expectation,expectation,437


In [34]:
# First ten rows of the vocabulary table; with alternate_rank = False it is ordered by term_rank
VOCAB.head(10)

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,stemmer_porter,stemmer_snowball,stemmer_lancaster,pos_max,p,h,df,idf,tfidf_mean,tfidf_sum,tfidf_median,tfidf_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1479,1,we,1745,0,1,we,we,we,PRP,0.035873,0.172224,88,0.004907,0.000175,0.015432,0.000174,0.000423
1545,2,you,1585,0,1,you,you,you,PRP,0.032584,0.160954,88,0.004907,0.000162,0.014214,0.000163,0.00028
1333,3,that,1553,0,1,that,that,that,DT,0.031926,0.158644,88,0.004907,0.000157,0.013844,0.000152,0.0003
1371,4,to,1524,0,1,to,to,to,TO,0.03133,0.156533,88,0.004907,0.000152,0.013417,0.000154,0.000289
1215,5,so,998,0,1,so,so,so,RB,0.020516,0.115037,89,0.0,0.0,0.0,0.0,0.0
621,6,i,985,0,1,i,i,i,PRP,0.020249,0.113922,88,0.004907,9.9e-05,0.008718,0.000103,0.000218
1132,7,s,960,0,1,s,s,s,VBZ,0.019735,0.111762,89,0.0,0.0,0.0,0.0,0.0
62,8,and,950,0,1,and,and,and,CC,0.01953,0.110893,88,0.004907,9.6e-05,0.00844,8.6e-05,0.000222
1335,9,the,759,0,1,the,the,the,DT,0.015603,0.09365,88,0.004907,7.5e-05,0.006582,7.1e-05,0.000182
920,10,our,699,0,1,our,our,our,PRP$,0.01437,0.087954,85,0.019971,0.000294,0.025014,0.000276,0.000949


In [35]:
# Ten random rows of the vocabulary table; no random_state is passed, so rows differ on each run
VOCAB.sample(10)

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,stemmer_porter,stemmer_snowball,stemmer_lancaster,pos_max,p,h,df,idf,tfidf_mean,tfidf_sum,tfidf_median,tfidf_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
687,1504,jla,1,0,0,jla,jla,jla,NNP,2.1e-05,0.00032,1,1.94939,0.002829,0.002829,0.002829,0.002829
397,758,engage,3,0,0,engag,engag,eng,VB,6.2e-05,0.000862,3,1.472269,0.002644,0.007932,0.002778,0.003017
1207,526,sleep,6,0,0,sleep,sleep,sleep,NN,0.000123,0.001602,5,1.25042,0.002737,0.013684,0.002134,0.005062
623,142,idea,69,0,0,idea,idea,ide,NN,0.001418,0.013421,35,0.405322,0.001404,0.049139,0.001267,0.003943
464,221,feel,36,0,0,feel,feel,feel,VB,0.00074,0.007697,23,0.587662,0.001694,0.038962,0.001209,0.007255
534,286,glad,23,0,0,glad,glad,glad,JJ,0.000473,0.005223,18,0.694118,0.001571,0.028283,0.001425,0.004059
558,60,guys,180,0,0,guy,guy,guy,VBP,0.0037,0.029892,55,0.209027,0.001338,0.073593,0.001018,0.006307
744,151,listen,62,0,0,listen,listen,list,VB,0.001275,0.012256,40,0.34733,0.001014,0.040557,0.000814,0.002882
818,893,missing,2,0,0,miss,miss,miss,VBG,4.1e-05,0.000599,2,1.64836,0.002511,0.005023,0.002511,0.002685
1221,90,some,127,0,1,some,some,som,DT,0.002611,0.022404,61,0.16406,0.000644,0.039278,0.000457,0.003242


In [36]:
# First ten rows of the bag-of-words table (one row per speaker_id / term_id pair)
BOW.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,c,tf,tfidf
speaker_id,term_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
201,3,4,1,0.007782,3.8e-05
201,8,6,1,0.011673,0.000293
201,16,5,1,0.009728,0.008833
201,21,1,1,0.001946,0.001008
201,30,3,1,0.005837,0.002515
201,43,4,1,0.007782,0.000318
201,51,1,1,0.001946,0.001261
201,61,1,1,0.001946,0.000615
201,62,17,1,0.033074,0.000162
201,64,1,1,0.001946,0.003793


In [37]:
# Ten random rows of the bag-of-words table; no random_state is passed, so rows differ on each run
BOW.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,c,tf,tfidf
speaker_id,term_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
338,1508,3,1,0.00487,0.000122
339,1489,4,1,0.006369,0.000625
319,1545,22,1,0.037479,0.000184
318,728,3,1,0.004451,0.005566
310,625,4,1,0.008677,0.001127
201,918,2,1,0.003891,0.000245
329,1342,1,1,0.002165,0.000542
221,1212,1,1,0.001912,0.003727
317,909,1,1,0.001379,0.002689
352,1225,1,1,0.001629,0.001021


In [38]:
# First ten rows of the TFIDF matrix (rows = speaker_id, columns = term_id)
TFIDF.head(10)

term_id,0,1,2,3,4,5,6,7,8,9,...,1544,1545,1546,1547,1548,1549,1550,1551,1552,1553
speaker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201,0.0,0.0,0.0,3.8e-05,0.0,0.0,0.0,0.0,0.000293,0.0,...,0.0,0.000162,0.0,0.000612,0.002148,0.001936,0.0,0.0,0.0,0.0
202,0.0,0.0,0.0,2.9e-05,0.0,0.0,0.0,0.0,7.5e-05,0.0,...,0.0,0.000109,0.0,0.000374,0.0,0.0,0.0,0.0,0.0,0.0
203,0.0,0.0,0.0,6.6e-05,0.0,0.0,0.0,0.0,0.000149,0.0,...,0.0,0.000139,0.0,0.000841,0.0,0.0,0.0,0.0,0.0,0.0
204,0.0,0.0,0.0,5.5e-05,0.0,0.0,0.0,0.0,0.000283,0.0,...,0.0,0.000267,0.0,0.001062,0.0,0.0,0.0,0.0,0.0,0.0
205,0.0,0.0,0.0,4.3e-05,0.0,0.0,0.0,0.0,0.000221,0.0,...,0.0,0.000224,0.0,0.000663,0.0,0.0,0.0,0.0,0.0,0.0
206,0.0,0.0,0.0,5e-05,0.0,0.002817,0.0,0.0,0.000363,0.0,...,0.001807,0.000149,0.0,0.000636,0.0,0.0,0.0,0.0,0.0,0.0
207,0.0,0.0,0.0,4.8e-05,0.0,0.0,0.0,0.0,0.000645,0.0,...,0.0,0.000165,0.0,0.000497,0.0,0.0,0.0,0.0,0.0,0.0
208,0.0,0.0,0.0,8.2e-05,0.0,0.0,0.0,0.0,0.000379,0.0,...,0.0,8.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209,0.003995,0.003995,0.003995,5e-05,0.0,0.0,0.0,0.0,0.000103,0.0,...,0.0,0.000121,0.003995,0.0,0.0,0.0,0.0,0.0,0.0,0.0
210,0.0,0.0,0.0,8.9e-05,0.0,0.0,0.0,0.0,0.000175,0.0,...,0.0,0.000198,0.0,8.8e-05,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Ten random rows of the TFIDF matrix; no random_state is passed, so rows differ on each run
TFIDF.sample(10)

term_id,0,1,2,3,4,5,6,7,8,9,...,1544,1545,1546,1547,1548,1549,1550,1551,1552,1553
speaker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
315,0.0,0.0,0.0,7.8e-05,0.0,0.0,0.0,0.0,0.000477,0.0,...,0.0,0.000233,0.0,0.000199,0.0,0.0,0.0,0.0,0.0,0.0
349,0.0,0.0,0.0,9e-05,0.0,0.0,0.0,0.0,4.2e-05,0.0,...,0.0,0.00018,0.0,0.00126,0.0,0.0,0.0,0.0,0.0,0.0
319,0.0,0.0,0.0,7.5e-05,0.0,0.0,0.0,0.0,0.000214,0.0,...,0.00213,0.000184,0.0,0.000214,0.0,0.0,0.0,0.0,0.0,0.0
215,0.0,0.0,0.0,8.4e-05,0.0,0.0,0.0,0.0,0.000351,0.0,...,0.0,0.000191,0.0,0.000195,0.0,0.0,0.002092,0.0,0.0,0.0
318,0.0,0.0,0.0,0.000138,0.0,0.0,0.0,0.0,0.000149,0.0,...,0.0,0.000182,0.0,0.000653,0.0,0.0,0.0,0.0,0.0,0.0
308,0.0,0.0,0.0,7e-05,0.0,0.0,0.0,0.0,0.000203,0.0,...,0.0,0.000169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
221,0.0,0.0,0.0,7.5e-05,0.0,0.0,0.0,0.001736,4.8e-05,0.0,...,0.0,0.000244,0.0,0.000722,0.0,0.0,0.0,0.0,0.0,0.0
227,0.0,0.0,0.0,2.5e-05,0.0,0.0,0.0,0.0,0.000475,0.0,...,0.0,0.000194,0.0,0.000432,0.0,0.0,0.0,0.0,0.0,0.0
359,0.0,0.0,0.0,0.000138,0.0,0.0,0.0,0.0,0.000384,0.0,...,0.0,0.0002,0.0,0.000642,0.0,0.0,0.0,0.0,0.0,0.0
314,0.0,0.0,0.0,7.2e-05,0.0,0.0,0.0,0.0,0.000274,0.0,...,0.0,0.000152,0.0,0.000229,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# (number of bags, number of distinct terms) in the TFIDF matrix
TFIDF.shape

(89, 1554)