# Playing around with text analysis

Playing around with loops, regex, basic pandas/numpy functions, tokenization

In [48]:
#download June 26 dem debate youtube transcript
text = open("June 26 Dem debate full text youtube2.txt").read()

text

'LESTER HOLT:\nGood evening, everyone. I\'m Lester Holt, and welcome to the first Democratic debate to the 2020 race for president.\nSAVANNAH GUTHRIE:\nHi, I\'m Savannah Guthrie. And tonight, it\'s our first chance to see these candidates go head to head on stage together.\nWe\'ll be joined in our questioning time by our colleagues, Jose Diaz-Balart, Chuck Todd, and Rachel Maddow.\nHOLT:\nVoters are trying to nail down where the candidates stand on the issues, what sets them apart, and which of these presidential hopefuls has what it takes.\nGUTHRIE:\nWell, now it\'s time to find out.\nANNOUNCER:\nTonight, round one. New Jersey Senator Cory Booker. Former Housing Secretary Julian Castro. New York City Mayor Bill De Blasio. Former Maryland Congressman John Delaney. Hawaii Congresswoman Tulsi Gabbard. Washington Governor Jay Inslee. Minnesota Senator Amy Klobuchar. Former Texas Congressman Beto O\'Rourke. Ohio Congressman Tim Ryan. And Massachusetts Senator Elizabeth Warren.\nFrom NBC Ne

In [49]:
#As you can see there is a line break '\n'. Let's remove these
text = text.replace("\n"," ")

In [50]:
#import our tools
import re
import numpy as np
import pandas as pd
"""
Using regex's flexible rules, looks for capitalised named, regardless of whether
the name is hypenated (Diaz-Balart), or is a  single word (Warren) or double word (Elizabeth Warren),
or more (Bill De Blasio), or includes an special symbol (O'Rourke) 
- although this last filter doesn't seem to work
"""
pattern = re.compile(r"[\(]?([A-Z]?\w+\s?[A-Z]?\w+\s?-?'?[A-Z]\w+)[\)]?:")
matches = pattern.finditer(text)

In [51]:
#Set empty arrays to the name of the speaker and the text position when they start talking

name = []
pos = []
posnum = []
for match in matches:
    name.append(match.group(1))
    pos.append(match.span())
    start,end = match.span()
    
pos = pd.DataFrame(pos)
name = pd.DataFrame(name)
text[pos[1][0]:pos[0][1]]
text[pos[1][1]:pos[0][2]]
pos[1][2]
pos[0][3]

536

In [52]:
pos.head()


Unnamed: 0,0,1
0,0,12
1,129,146
2,375,380
3,536,544
4,578,588


In [53]:
""" This part is not pretty. Using the previous positions, a new array will
identifies the text position where the speaker starts/stops speaking 
"""
pos['speech_loc_start'] = pos[1]+1
pos['speech_loc_end'] = pos[0].shift(-1)+1
pos['speech_loc_end'].iloc[-1] = len(text)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [54]:
pos.head()

Unnamed: 0,0,1,speech_loc_start,speech_loc_end
0,0,12,13,130.0
1,129,146,147,376.0
2,375,380,381,537.0
3,536,544,545,579.0
4,578,588,589,1109.0


In [55]:
speech = []

#Using the positions, saves speeches in the order spoken to the speech df
for index, row in pos.iterrows():
     speech.append(text[int(row['speech_loc_start']):int(row['speech_loc_end'])])
speech = pd.DataFrame(speech)

In [56]:
"""Create new df called transcript
The index indicates order of the conversation, name indicates speaker, 
speech indicates words spoken during that segemment
"""
speech.columns = ['speech']
name.columns = ['name']
transcript = name['name']
transcript = pd.concat([name,speech],axis=1)

transcript.head()

Unnamed: 0,name,speech
0,LESTER HOLT,"Good evening, everyone. I'm Lester Holt, and w..."
1,SAVANNAH GUTHRIE,"Hi, I'm Savannah Guthrie. And tonight, it's ou..."
2,HOLT,Voters are trying to nail down where the candi...
3,GUTHRIE,"Well, now it's time to find out. A"
4,ANNOUNCER,"Tonight, round one. New Jersey Senator Cory Bo..."


In [57]:
# Let's take a look at who spoke the most
speak_count = transcript['name'].value_counts()
speak_count

TODD                72
HOLT                40
MADDOW              36
GUTHRIE             31
DELANEY             27
DIAZ-BALART         26
ROURKE              25
CASTRO              21
RYAN                21
BOOKER              17
WARREN              15
GABBARD             14
KLOBUCHAR           13
DE BLASIO           12
INSLEE               8
UNKNOWN              6
ELIZABETH WARREN     1
CORY BOOKER          1
AMY KLOBUCHAR        1
BILL DE BLASIO       1
ANNOUNCER            1
BETO O'ROURKE        1
TULSI GABBARD        1
JAY INSLEE           1
LESTER HOLT          1
JULIÁN CASTRO        1
JOHN DELANEY         1
SAVANNAH GUTHRIE     1
Name: name, dtype: int64

In [58]:
"""
The way the transcript is written, the first time a candidate is introduces, their full names are used;
last names are use subsequently. Let's clean this up
"""
speak_count = pd.DataFrame(speak_count.reset_index())
speak_count.columns = ['name','count']

In [59]:
#cleans up names by setting all names to last name
transcript['name_clean'] = transcript['name']

for index, row in transcript.iterrows():
    if len(transcript['name'][index].split()) == 2:
        transcript['name_clean'].iloc[index] = transcript['name'].iloc[index].split()[1]
    if len(transcript['name'][index].split()) == 3:
        transcript['name_clean'].iloc[index] = transcript['name'].iloc[index].split()[1:2]

In [60]:
cleaned = pd.Series(transcript['name_clean'])
#cleaned.value_counts()


In [61]:
for name in transcript['name']:
    print(name.split())

['LESTER', 'HOLT']
['SAVANNAH', 'GUTHRIE']
['HOLT']
['GUTHRIE']
['ANNOUNCER']
['HOLT']
['GUTHRIE']
['DIAZ-BALART']
['HOLT']
['GUTHRIE']
['ELIZABETH', 'WARREN']
['GUTHRIE']
['WARREN']
['GUTHRIE']
['AMY', 'KLOBUCHAR']
['GUTHRIE']
['BETO', "O'ROURKE"]
['ROURKE']
['GUTHRIE']
['ROURKE']
['GUTHRIE']
['ROURKE']
['GUTHRIE']
['ROURKE']
['GUTHRIE']
['ROURKE']
['GUTHRIE']
['CORY', 'BOOKER']
['GUTHRIE']
['BOOKER']
['GUTHRIE']
['BOOKER']
['GUTHRIE']
['GUTHRIE']
['WARREN']
['GUTHRIE']
['HOLT']
['JULIÁN', 'CASTRO']
['HOLT']
['TULSI', 'GABBARD']
['DIAZ-BALART']
['BILL', 'DE', 'BLASIO']
['DIAZ-BALART']
['JOHN', 'DELANEY']
['DIAZ-BALART']
['JAY', 'INSLEE']
['DIAZ-BALART']
['RYAN']
['DIAZ-BALART']
['RYAN']
['DIAZ-BALART']
['WARREN']
['DIAZ-BALART']
['WARREN']
['HOLT']
['KLOBUCHAR']
['HOLT']
['WARREN']
['HOLT']
['ROURKE']
['HOLT']
['ROURKE']
['HOLT']
['ROURKE']
['DE', 'BLASIO']
['ROURKE']
['DE', 'BLASIO']
['ROURKE']
['DE', 'BLASIO']
['ROURKE']
['DE', 'BLASIO']
['DELANEY']
['HOLT']
['GABBARD']
['HOLT']
['G

In [62]:
type(transcript['name'])

pandas.core.series.Series

In [63]:
transcript['name'].value_counts()

TODD                72
HOLT                40
MADDOW              36
GUTHRIE             31
DELANEY             27
DIAZ-BALART         26
ROURKE              25
CASTRO              21
RYAN                21
BOOKER              17
WARREN              15
GABBARD             14
KLOBUCHAR           13
DE BLASIO           12
INSLEE               8
UNKNOWN              6
ELIZABETH WARREN     1
CORY BOOKER          1
AMY KLOBUCHAR        1
BILL DE BLASIO       1
ANNOUNCER            1
BETO O'ROURKE        1
TULSI GABBARD        1
JAY INSLEE           1
LESTER HOLT          1
JULIÁN CASTRO        1
JOHN DELANEY         1
SAVANNAH GUTHRIE     1
Name: name, dtype: int64

In [64]:
type(transcript['name'])

pandas.core.series.Series

In [65]:
transcript['name_clean'].apply(tuple).value_counts()

(T, O, D, D)                         72
(H, O, L, T)                         41
(M, A, D, D, O, W)                   36
(G, U, T, H, R, I, E)                32
(D, E, L, A, N, E, Y)                28
(D, I, A, Z, -, B, A, L, A, R, T)    26
(R, O, U, R, K, E)                   25
(C, A, S, T, R, O)                   22
(R, Y, A, N)                         21
(B, O, O, K, E, R)                   18
(W, A, R, R, E, N)                   16
(G, A, B, B, A, R, D)                15
(K, L, O, B, U, C, H, A, R)          14
(B, L, A, S, I, O)                   12
(I, N, S, L, E, E)                    9
(U, N, K, N, O, W, N)                 6
(A, N, N, O, U, N, C, E, R)           1
(DE,)                                 1
(O, ', R, O, U, R, K, E)              1
Name: name_clean, dtype: int64

In [66]:
transcript[transcript['name_clean']=="O'ROURKE"]

Unnamed: 0,name,speech,name_clean
16,BETO O'ROURKE,This economy has got to work for everyone. And...,O'ROURKE


In [67]:
transcript['name_clean'] = transcript['name_clean'].replace("O'ROURKE","ROURKE")
candidate = ['ROURKE','CASTRO','RYAN','BOOKER','WARREN','GABBARD','KLOBUCHAR','BLASIO','INSLEE']

In [68]:
transcript[transcript['name_clean'] == 'WARREN']['speech']

10                         Thank you. Good to be here. G
12     So I think of it this way. Who is this economy...
34     So the way I understand this, it's there is wa...
51     So we've had an industrial policy in the Unite...
53                                     We can do this. H
57     So, yes. I'm with Bernie on Medicare for all. ...
83     ... point, though, and that is that the insura...
84       It's time for us to make families come first. I
94     I would make certain that every woman has acce...
96     We now have an America where most people suppo...
194    So, in this period of time that I have been ru...
196    What I think we need to do is we need to treat...
198                 We need to fight for our children. T
228                                              I do. T
230    I do. We are democracy. And the way a democrac...
393    Thank you. It's a great honor to be here. Neve...
Name: speech, dtype: object

In [69]:
"""for name in candidate:
    speech_dict = [transcript[transcript['name_clean'] == name]['speech']]
    print(speech_dict)
"""

"for name in candidate:\n    speech_dict = [transcript[transcript['name_clean'] == name]['speech']]\n    print(speech_dict)\n"

In [70]:
candidate = pd.DataFrame(candidate)
candidate.columns = ['Candidate Name']
candidate
' '.join(candidate['Candidate Name'].tolist())

'ROURKE CASTRO RYAN BOOKER WARREN GABBARD KLOBUCHAR BLASIO INSLEE'

In [71]:
'''
for name in candidate['Candidate Name']:
    speech_dict = [transcript[transcript['name_clean'] == name]['speech']]
    ' '.join([transcript[transcript['name_clean'] == name]['speech']].tolist())
'''

"\nfor name in candidate['Candidate Name']:\n    speech_dict = [transcript[transcript['name_clean'] == name]['speech']]\n    ' '.join([transcript[transcript['name_clean'] == name]['speech']].tolist())\n"

In [72]:
warren = transcript[transcript['name_clean'] == 'WARREN']

In [73]:
def speech_concat(name):
    
    speech = ''
    
    for rows in transcript[transcript['name_clean'] == name]['speech']:
        speech = speech + rows 

    return speech

In [74]:
candidate['Speech'] = candidate['Candidate Name'].apply(speech_concat)

In [75]:
candidate

Unnamed: 0,Candidate Name,Speech
0,ROURKE,This economy has got to work for everyone. And...
1,CASTRO,"Thank you very much for that question, Lester...."
2,RYAN,"Yes, I believe you can, but, first, let's say ..."
3,BOOKER,I don't think I disagree. I think we have a se...
4,WARREN,Thank you. Good to be here. GSo I think of it ...
5,GABBARD,"First of all, let's recognize the situation we..."
6,KLOBUCHAR,"Well, first, the economy. We know that not eve..."
7,BLASIO,"Wait, wait, wait. Congressman O'Rourke, Congre..."
8,INSLEE,"Well, I'm a little bit surprised. I think plan..."


In [76]:
def speech_analysis(name):
    speech = candidate[candidate['Candidate Name']== name]['Speech'].item().split(' ')
    print('Word spoken: ', len(speech))

    unique_words = set(speech)
    print('Unique words: ', len(unique_words))

In [77]:
for names in candidate['Candidate Name']:
    print(names)
    speech_analysis(names)
    print('')

ROURKE
Word spoken:  1585
Unique words:  695

CASTRO
Word spoken:  1646
Unique words:  689

RYAN
Word spoken:  1406
Unique words:  584

BOOKER
Word spoken:  2222
Unique words:  785

WARREN
Word spoken:  1580
Unique words:  619

GABBARD
Word spoken:  1262
Unique words:  518

KLOBUCHAR
Word spoken:  1628
Unique words:  646

BLASIO
Word spoken:  978
Unique words:  450

INSLEE
Word spoken:  886
Unique words:  373



In [78]:
Rourke = candidate[candidate['Candidate Name']== 'ROURKE']['Speech'].item().split(' ')
unique_words = set(Rourke)

word_histogram = dict.fromkeys(unique_words, 0)
for word in Rourke:
    word_histogram[word] = word_histogram[word]+ 1
    
df_histo = unique_words
for word in Rourke:
    df_hist[word] = word_histogram[word]+ 1

In [79]:
word_histogram

{'': 24,
 '$2': 1,
 '$5': 1,
 '(CROSSTALK)': 1,
 '--': 5,
 '10': 1,
 '2': 1,
 '2,000': 1,
 '2.3': 1,
 '2016,': 1,
 '254': 1,
 '27': 1,
 '28': 1,
 '40.': 1,
 'A': 2,
 'Act': 1,
 'America': 1,
 'America,': 1,
 'American': 1,
 'Americans': 1,
 'And': 8,
 'As': 2,
 'Bree.': 1,
 'But': 3,
 'C...': 1,
 'CAnd': 1,
 'CBut': 1,
 "CHere's": 1,
 'CIf': 1,
 'CWe': 1,
 'Celsius,': 1,
 'Central': 1,
 'Congress,': 1,
 'D...': 2,
 'DLet': 1,
 'DOur': 1,
 "DThat's": 1,
 'DTonight': 1,
 'DWe': 1,
 'Democrats': 1,
 'Douglas': 1,
 'Dreamers': 1,
 'Fe': 1,
 'Florida,': 1,
 'For': 1,
 'G...': 2,
 'GI': 1,
 'GMy': 1,
 "GThat's": 1,
 'HNo.': 1,
 'HWe': 2,
 'Hart,': 1,
 "He's": 2,
 'Her': 1,
 'Houston,': 1,
 'I': 12,
 "I'll": 1,
 "I'm": 1,
 'If': 2,
 'In': 4,
 'Iowa.': 1,
 'Iran': 1,
 "It's": 1,
 'Junction': 1,
 'Junction,': 1,
 'Korea': 1,
 'Laredo,': 1,
 'Many': 1,
 'Marcel,': 1,
 'Marjory': 1,
 'Medicare.': 3,
 'Miami,': 1,
 'Missouri': 1,
 'North': 1,
 "O'RRight": 1,
 'Oscar.': 1,
 'Our': 1,
 'PACs,': 1,
 

In [80]:
word_histogram2 = {k: v for k, v in word_histogram.items() if v > 3}

In [81]:
import plotly
from plotly.offline import iplot, init_notebook_mode
from plotly import tools
import plotly.graph_objs as go
init_notebook_mode(connected=True)
 
trace = {'type': 'bar', 'x': list(unique_words), 'y': list(word_histogram2.values())}
 
plotly.offline.iplot({'data': [trace]})

In [82]:
df_hist = ''
df_hist = pd.DataFrame.from_dict(word_histogram,orient='index')
df_hist.columns=['freq']
df_hist

Unnamed: 0,freq
,24
universal,2
chance,1
people,3
2,1
my,1
doesn't,1
achieving,1
victim,1
"insured,",1


In [95]:
# print(listofTuples)

In [90]:
# dict((x, y) for x, y in listofTuples)

In [91]:
# dfObj = pd.DataFrame(listofTuples, columns=['word', 'freq'])

In [92]:
# dfObj = dfObj.head(50)

In [93]:
# trace = {'type': 'bar', 'x': dfObj['word'], 'y': dfObj['freq']}
 
# plotly.offline.iplot({'data': [trace]})

In [94]:
# articles = ['a','an','and','the']

**Another way of tokenization**

In [None]:
import nltk
nltk.download()
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
example_sent = "This is a sample sentence, showing off the stop words filtration."
  
stop_words = set(stopwords.words('english')) 
  
word_tokens = word_tokenize(example_sent) 
  
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
  
filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
        filtered_sentence.append(w) 

print(word_tokens) 
print(filtered_sentence) 