In [1]:
import glob, os

In [2]:
os.chdir("TXT")
files = glob.glob("*.txt")

In [3]:
import pandas as pd
import re

In [4]:
documents = []

for file in files:
    
    with open(file, "r", encoding='utf-8') as f:
        text = ''.join(line for line in f)
    
    text = text.replace('-\n', '')
    text = text.split('\nf\n')
    text = [re.sub('\s+', ' ', item).strip() for item in text]
    
    documents.append(text)

In [6]:
df = pd.DataFrame(columns=['date', 'title', 'congressman', 'speech'])

In [None]:
for count, text in enumerate(documents):

    for subject in text:
        split = re.split('(Mr?s?\. [A-Z]+\. )', subject)
    
        title = getSubject(split[0])
        date = files[count][5:-4]
        
        speaker_quote = [(date, title, split[i],split[i+1]) for i in range(1,len(split),2)]
        
        frame = pd.DataFrame(speaker_quote, columns=['date', 'title', 'congressman', 'speech'])
        #print(len(frame))
        if len(frame) != 0:
            df = pd.concat([df, frame])
        
    #print(files[count])

In [8]:
df['date'] = pd.to_datetime(df['date'])

In [9]:
df

Unnamed: 0,date,title,congressman,speech
0,1995-01-05,REPORT ON CONTRACT WITH AMERICA,Mr. BOEHNER.,"Mr. Speaker, we have 99 days left in our pledg..."
0,1995-01-05,THE NEED FOR REAL LOBBYING REFORM,Mrs. KENNELLY.,"Mr. Speaker, a famous Hartford resident once s..."
0,1995-01-05,PROMISES MADE AND KEPT,Mr. SOLOMON.,"Mr. Speaker, yesterday was a great success. It..."
0,1995-01-05,SUPPORT URGED FOR LOBBYIST GIFT BAN,Ms. DELAURO.,"Mr. Speaker, Democrats were proud to join Repu..."
0,1995-01-05,A HISTORIC DAY FOR CONGRESS,Mr. FOLEY.,"Mr. Speaker, what a proud day for a freshman f..."
1,1995-01-05,A HISTORIC DAY FOR CONGRESS,Mr. RICHARDSON.,"Mr. Speaker, yesterday belonged to the Republi..."
0,1995-01-05,TRUST AND FAITH IN CONGRESS BEING RESTORED,Mr. JONES.,"Mr. Speaker, I wish to express my thanks to th..."
1,1995-01-05,TRUST AND FAITH IN CONGRESS BEING RESTORED,Mr. EDWARDS.,"Mr. Speaker, yesterday this House passed bipar..."
0,1995-01-05,INTRODUCTION OF THE SENIOR CITIZEN,Mr. BUNNING.,"Mr. Speaker, yesterday, I, along with my colle..."
0,1995-01-05,GIVING THE GOVERNMENT BACK TO THE AMERICAN PEOPLE,Mr. TIAHRT.,"Mr. Speaker, yesterday was a triumph for the A..."


In [10]:
df_group = df.groupby([df.date.dt.year, 'congressman'])['speech'].apply(lambda x: ' '.join(x)).reset_index()
df_group['len'] = df_group['speech'].str.len()
df_group

Unnamed: 0,date,congressman,speech,len
0,1995,M. A.,"RANDALL, 000–00–0000 KAREN S. RASMUSSEN, 000–0...",70769
1,1995,M. BEASLEY.,"STATE OF NEW HAMPSHIRE, OFFICE OF THE GOVERNOR...",14906
2,1995,M. BROWNER.,,2
3,1995,M. C.,Bourlas distributed to R. Seligman et al.— Ana...,41623
4,1995,M. E.,"PETERSON, 000–00–0000 THOMAS W. PITTMAN, 000–0...",88316
5,1995,M. H.,"DAVIS, 000–00–0000 STEVEN E. DICHIARA, 000–00–...",4759
6,1995,M. HON.,BILL EMERSON OF MISSOURI IN THE HOUSE OF REPRE...,173
7,1995,M. HUFFSTUTLER.,,0
8,1995,M. I.,THE NATIONAL ENDOWMENT FOR THE ARTS AND THE NA...,153452
9,1995,M. J.,"ROBILLARD, 000–00–0000 BRIAN E. ROBINSON, 000–...",104177


###########################################################################################

Work with a smaller dataframe of two politicians as proof of concept:

In [10]:
import nltk
from operator import itemgetter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def getTokens(text):
    pattern = re.compile('([^\s\w]|_)+')
    raw = pattern.sub('', text).lower()
    
    tokens = nltk.word_tokenize(raw)    
    tokens = [w for w in tokens if not w in stop_words] 
    #tokens = [ps.stem(w) for w in tokens]
    
    return(tokens)

In [12]:
def getBigrams(tokens):
    #bigrams
    bgs = nltk.bigrams(tokens)
    fdist = nltk.FreqDist(bgs)
    
    sortBigram = sorted(fdist.items(), key = itemgetter(1), reverse = True)
    stringBigram = [' '.join(bigram[0]) for bigram in sortBigram]
    
    return(stringBigram[:25])

In [65]:
top100 = df_group.groupby('congressman')['len'].agg('sum').sort_values(ascending = False).iloc[:100].index.values

In [66]:
df_group[ df_group['congressman'].isin(top100)]

Unnamed: 0,date,congressman,speech,len
30,1995,Mr. AKAKA.,"Mr. President, today I am introducing the Emer...",667560
49,1995,Mr. BAUCUS.,"Six or seven minutes. Mr. President, I rise i...",1088691
60,1995,Mr. BIDEN.,"Mr. President, I also re member something Benj...",1698041
65,1995,Mr. BINGAMAN.,"Mr. President, I am pleased today to join the ...",1064752
74,1995,Mr. BOND.,"Mr. President, I would like to announce that t...",853473
86,1995,Mr. BROWN.,"Mr. President, I rise in strong support of the...",783048
98,1995,Mr. BYRD.,"Mr. President, the distinguished Senator from ...",2870351
106,1995,Mr. CARDIN.,"Mr. Speaker, I rise today to pay tribute to P...",184174
125,1995,Mr. COBURN.,"H.R. 393: H.R. 593: Mr. Chairman, I rise in ...",86295
127,1995,Mr. COHEN.,"I will take 5 minutes. Mr. President, first l...",1701132


In [67]:
#df2 = df_group[ (df_group['congressman'] == 'Mr. BOEHNER. ') | (df_group['congressman'] == 'Mr. REID. ')]
df2 = df_group[ df_group['congressman'].isin(top100)]

In [68]:
df2['tokens'] = df2['speech'].map(getTokens)
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,congressman,speech,len,tokens
30,1995,Mr. AKAKA.,"Mr. President, today I am introducing the Emer...",667560,"[mr, president, today, introducing, emergency,..."
49,1995,Mr. BAUCUS.,"Six or seven minutes. Mr. President, I rise i...",1088691,"[six, seven, minutes, mr, president, rise, sup..."
60,1995,Mr. BIDEN.,"Mr. President, I also re member something Benj...",1698041,"[mr, president, also, member, something, benja..."
65,1995,Mr. BINGAMAN.,"Mr. President, I am pleased today to join the ...",1064752,"[mr, president, pleased, today, join, distingu..."
74,1995,Mr. BOND.,"Mr. President, I would like to announce that t...",853473,"[mr, president, would, like, announce, small, ..."
86,1995,Mr. BROWN.,"Mr. President, I rise in strong support of the...",783048,"[mr, president, rise, strong, support, congres..."
98,1995,Mr. BYRD.,"Mr. President, the distinguished Senator from ...",2870351,"[mr, president, distinguished, senator, iowa, ..."
106,1995,Mr. CARDIN.,"Mr. Speaker, I rise today to pay tribute to P...",184174,"[mr, speaker, rise, today, pay, tribute, peter..."
125,1995,Mr. COBURN.,"H.R. 393: H.R. 593: Mr. Chairman, I rise in ...",86295,"[hr, 393, hr, 593, mr, chairman, rise, opposit..."
127,1995,Mr. COHEN.,"I will take 5 minutes. Mr. President, first l...",1701132,"[take, 5, minutes, mr, president, first, let, ..."


In [69]:
df2['token str'] = df2['tokens'].str.join(' ')
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,date,congressman,speech,len,tokens,token str
30,1995,Mr. AKAKA.,"Mr. President, today I am introducing the Emer...",667560,"[mr, president, today, introducing, emergency,...",mr president today introducing emergency petro...
49,1995,Mr. BAUCUS.,"Six or seven minutes. Mr. President, I rise i...",1088691,"[six, seven, minutes, mr, president, rise, sup...",six seven minutes mr president rise support co...
60,1995,Mr. BIDEN.,"Mr. President, I also re member something Benj...",1698041,"[mr, president, also, member, something, benja...",mr president also member something benjamin di...
65,1995,Mr. BINGAMAN.,"Mr. President, I am pleased today to join the ...",1064752,"[mr, president, pleased, today, join, distingu...",mr president pleased today join distinguished ...
74,1995,Mr. BOND.,"Mr. President, I would like to announce that t...",853473,"[mr, president, would, like, announce, small, ...",mr president would like announce small busines...
86,1995,Mr. BROWN.,"Mr. President, I rise in strong support of the...",783048,"[mr, president, rise, strong, support, congres...",mr president rise strong support congressional...
98,1995,Mr. BYRD.,"Mr. President, the distinguished Senator from ...",2870351,"[mr, president, distinguished, senator, iowa, ...",mr president distinguished senator iowa man fo...
106,1995,Mr. CARDIN.,"Mr. Speaker, I rise today to pay tribute to P...",184174,"[mr, speaker, rise, today, pay, tribute, peter...",mr speaker rise today pay tribute peter hammen...
125,1995,Mr. COBURN.,"H.R. 393: H.R. 593: Mr. Chairman, I rise in ...",86295,"[hr, 393, hr, 593, mr, chairman, rise, opposit...",hr 393 hr 593 mr chairman rise opposition amen...
127,1995,Mr. COHEN.,"I will take 5 minutes. Mr. President, first l...",1701132,"[take, 5, minutes, mr, president, first, let, ...",take 5 minutes mr president first let commend ...


In [70]:
df2['bigrams'] = df2['tokens'].map(getBigrams)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [71]:
bigram_list = list(set().union(*list(df2['bigrams'].values)))
bigram_list

['proposed concurrent',
 'today mr',
 'lionel hampton',
 'staff sergeant',
 'poverty rate',
 'bill 1072',
 'native women',
 'million mom',
 'purposes committee',
 'yeas nays',
 'outpatient payments',
 'customs border',
 'c n',
 'operation maintenance',
 'care reform',
 'et al',
 'california ms',
 'objects cultural',
 '2003 0753',
 'mr stark',
 '2 pm',
 'vladimir putin',
 'provided amount',
 'bill 3457',
 'independent contractor',
 'tribute sgm',
 'employing offices',
 'question resolution',
 'tribal government',
 'congress assembled',
 'presidents designee',
 'authorized hazardous',
 'dr rupps',
 'tax administration',
 'billionaire president',
 'affected unit',
 'sgm layman',
 'intermodal surface',
 'title ii',
 'senate armed',
 'packers stockyards',
 'office head',
 'black history',
 'good measure',
 'hr 3080',
 'wage insurance',
 'time gentleman',
 'indigenous peoples',
 'motor carrier',
 'might part',
 'reduced amount',
 'trump shutdown',
 'claim benefits',
 'reduce class',
 'secret

In [72]:
len(bigram_list)

7339

In [73]:
for bigram in bigram_list:
    df2[bigram] = df2['token str'].str.count(bigram)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [79]:
df2

Unnamed: 0,date,congressman,speech,len,tokens,token str,bigrams,proposed concurrent,today mr,lionel hampton,...,xi rules,state contract,3 years,department defense,collect taxes,janice rogers,description request,meat loaf,barton true,flood control
30,1995,Mr. AKAKA.,"Mr. President, today I am introducing the Emer...",667560,"[mr, president, today, introducing, emergency,...",mr president today introducing emergency petro...,"[united states, mr president, air tour, 000000...",0,0,0,...,0,0,10,7,1,0,0,0,0,0
49,1995,Mr. BAUCUS.,"Six or seven minutes. Mr. President, I rise i...",1088691,"[six, seven, minutes, mr, president, rise, sup...",six seven minutes mr president rise support co...,"[mr president, united states, presiding office...",0,3,0,...,0,0,9,15,0,0,0,0,0,3
60,1995,Mr. BIDEN.,"Mr. President, I also re member something Benj...",1698041,"[mr, president, also, member, something, benja...",mr president also member something benjamin di...,"[united states, mr president, presiding office...",0,4,0,...,0,0,23,15,2,0,0,0,0,0
65,1995,Mr. BINGAMAN.,"Mr. President, I am pleased today to join the ...",1064752,"[mr, president, pleased, today, join, distingu...",mr president pleased today join distinguished ...,"[mr president, presiding officer, new mexico, ...",0,4,1,...,0,0,15,70,0,0,0,0,0,0
74,1995,Mr. BOND.,"Mr. President, I would like to announce that t...",853473,"[mr, president, would, like, announce, small, ...",mr president would like announce small busines...,"[mr president, small business, presiding offic...",0,0,0,...,0,1,6,4,0,0,0,0,0,2
86,1995,Mr. BROWN.,"Mr. President, I rise in strong support of the...",783048,"[mr, president, rise, strong, support, congres...",mr president rise strong support congressional...,"[mr president, united states, presiding office...",0,1,0,...,0,0,6,3,0,0,0,0,0,3
98,1995,Mr. BYRD.,"Mr. President, the distinguished Senator from ...",2870351,"[mr, president, distinguished, senator, iowa, ...",mr president distinguished senator iowa man fo...,"[mr president, balanced budget, united states,...",0,5,0,...,0,0,25,65,1,0,0,0,0,0
106,1995,Mr. CARDIN.,"Mr. Speaker, I rise today to pay tribute to P...",184174,"[mr, speaker, rise, today, pay, tribute, peter...",mr speaker rise today pay tribute peter hammen...,"[mr chairman, united states, mr speaker, brig ...",0,0,0,...,0,0,1,1,0,0,0,0,0,0
125,1995,Mr. COBURN.,"H.R. 393: H.R. 593: Mr. Chairman, I rise in ...",86295,"[hr, 393, hr, 593, mr, chairman, rise, opposit...",hr 393 hr 593 mr chairman rise opposition amen...,"[mr chairman, health care, mr speaker, chairma...",0,0,0,...,0,0,1,0,0,0,0,0,0,0
127,1995,Mr. COHEN.,"I will take 5 minutes. Mr. President, first l...",1701132,"[take, 5, minutes, mr, president, first, let, ...",take 5 minutes mr president first let commend ...,"[health care, united states, fiscal year, info...",0,0,0,...,0,1,18,33,6,0,0,0,0,0


In [89]:
write = df2.reset_index(drop = True)
write = write.drop(['speech', 'len', 'len', 'tokens', 'token str', 'bigrams'], axis = 1)
write

Unnamed: 0,date,congressman,proposed concurrent,today mr,lionel hampton,staff sergeant,poverty rate,bill 1072,native women,million mom,...,xi rules,state contract,3 years,department defense,collect taxes,janice rogers,description request,meat loaf,barton true,flood control
0,1995,Mr. AKAKA.,0,0,0,0,0,0,0,0,...,0,0,10,7,1,0,0,0,0,0
1,1995,Mr. BAUCUS.,0,3,0,0,1,0,0,0,...,0,0,9,15,0,0,0,0,0,3
2,1995,Mr. BIDEN.,0,4,0,0,0,0,0,0,...,0,0,23,15,2,0,0,0,0,0
3,1995,Mr. BINGAMAN.,0,4,1,0,5,0,0,0,...,0,0,15,70,0,0,0,0,0,0
4,1995,Mr. BOND.,0,0,0,0,0,0,0,0,...,0,1,6,4,0,0,0,0,0,2
5,1995,Mr. BROWN.,0,1,0,0,0,0,0,0,...,0,0,6,3,0,0,0,0,0,3
6,1995,Mr. BYRD.,0,5,0,0,1,0,0,0,...,0,0,25,65,1,0,0,0,0,0
7,1995,Mr. CARDIN.,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
8,1995,Mr. COBURN.,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,1995,Mr. COHEN.,0,0,0,0,0,0,0,0,...,0,1,18,33,6,0,0,0,0,0


In [88]:
politicians = write.iloc[:,:2]
politicians

Unnamed: 0,date,congressman
0,1995,Mr. AKAKA.
1,1995,Mr. BAUCUS.
2,1995,Mr. BIDEN.
3,1995,Mr. BINGAMAN.
4,1995,Mr. BOND.
5,1995,Mr. BROWN.
6,1995,Mr. BYRD.
7,1995,Mr. CARDIN.
8,1995,Mr. COBURN.
9,1995,Mr. COHEN.


In [98]:
politicians[['congressman']].drop_duplicates(subset = ['congressman']).to_csv("party.csv")

In [91]:
politician_bigrams = write.iloc[:,2:]
politician_bigrams

Unnamed: 0,proposed concurrent,today mr,lionel hampton,staff sergeant,poverty rate,bill 1072,native women,million mom,purposes committee,yeas nays,...,xi rules,state contract,3 years,department defense,collect taxes,janice rogers,description request,meat loaf,barton true,flood control
0,0,0,0,0,0,0,0,0,18,0,...,0,0,10,7,1,0,0,0,0,0
1,0,3,0,0,1,0,0,0,6,13,...,0,0,9,15,0,0,0,0,0,3
2,0,4,0,0,0,0,0,0,3,7,...,0,0,23,15,2,0,0,0,0,0
3,0,4,1,0,5,0,0,0,3,11,...,0,0,15,70,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,5,24,...,0,1,6,4,0,0,0,0,0,2
5,0,1,0,0,0,0,0,0,3,20,...,0,0,6,3,0,0,0,0,0,3
6,0,5,0,0,1,0,0,0,0,40,...,0,0,25,65,1,0,0,0,0,0
7,0,0,0,0,0,0,0,0,9,0,...,0,0,1,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,8,8,...,0,1,18,33,6,0,0,0,0,0


In [93]:
# politicians.to_csv("politicians.csv")
# politician_bigrams.to_csv("politician_bigrams.csv")

##############################################################################################

Retooling with trigrams:

In [22]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
import nltk
from operator import itemgetter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [24]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
pattern = re.compile('([^\s\w]|_)+')

def getTrigram(text):
    raw = pattern.sub('', text).lower()
    
    tokens = nltk.word_tokenize(raw)    
    tokens = [w for w in tokens if not w in stop_words] 
    #tokens = [ps.stem(w) for w in tokens]

    bgs = nltk.trigrams(tokens)
    fdist = nltk.FreqDist(bgs)
    
    sortTrigram = sorted(fdist.items(), key = itemgetter(1), reverse = True)
    sortTrigram = [trigram[0] for trigram in sortTrigram]
    
    return(fdist, sortTrigram[:25])

In [25]:
top100 = df_group.groupby('congressman')['len'].agg('sum').sort_values(ascending = False).iloc[:100].index.values

In [26]:
#df3 = df_group[ (df_group['congressman'] == 'Mr. BOEHNER. ') | (df_group['congressman'] == 'Mr. REID. ')]
df3 = df_group[ df_group['congressman'].isin(top100)]

df3['trigram'] = df3['speech'].map(getTrigram)
df3['top trigram'] = df3['trigram'].str[1]
df3['trigram'] = df3['trigram'].str[0]
df3

Unnamed: 0,date,congressman,speech,len,trigram,top trigram
30,1995,Mr. AKAKA.,"Mr. President, today I am introducing the Emer...",667560,"{('mr', 'president', 'today'): 9, ('president'...","[(commercial, air, tour), (comparative, risk, ..."
49,1995,Mr. BAUCUS.,"Six or seven minutes. Mr. President, I rise i...",1088691,"{('six', 'seven', 'minutes'): 1, ('seven', 'mi...","[(bill, joint, resolution), (states, local, go..."
60,1995,Mr. BIDEN.,"Mr. President, I also re member something Benj...",1698041,"{('mr', 'president', 'also'): 1, ('president',...","[(united, states, code), (congressional, recor..."
65,1995,Mr. BINGAMAN.,"Mr. President, I am pleased today to join the ...",1064752,"{('mr', 'president', 'pleased'): 2, ('presiden...","[(verdate, aug, 31), (aug, 31, 2005), (2008, j..."
74,1995,Mr. BOND.,"Mr. President, I would like to announce that t...",853473,"{('mr', 'president', 'would'): 8, ('president'...","[(ask, unanimous, consent), (po, 00000, frm), ..."
86,1995,Mr. BROWN.,"Mr. President, I rise in strong support of the...",783048,"{('mr', 'president', 'rise'): 26, ('president'...","[(ask, unanimous, consent), (without, objectio..."
98,1995,Mr. BYRD.,"Mr. President, the distinguished Senator from ...",2870351,"{('mr', 'president', 'distinguished'): 15, ('p...","[(balanced, budget, amendment), (per, year, lo..."
106,1995,Mr. CARDIN.,"Mr. Speaker, I rise today to pay tribute to P...",184174,"{('mr', 'speaker', 'rise'): 10, ('speaker', 'r...","[(000000000, united, states), (united, states,..."
125,1995,Mr. COBURN.,"H.R. 393: H.R. 593: Mr. Chairman, I rise in ...",86295,"{('hr', '393', 'hr'): 1, ('393', 'hr', '593'):...","[(mr, chairman, yield), (sexually, transmitted..."
127,1995,Mr. COHEN.,"I will take 5 minutes. Mr. President, first l...",1701132,"{('take', '5', 'minutes'): 2, ('5', 'minutes',...","[(health, care, fraud), (new, budget, authorit..."


In [27]:
trigram_list = list(set().union(*list(df3['top trigram'].values)))
trigram_list

[('time', 'bring', 'conclusion'),
 ('said', 'state', 'senate'),
 ('mr', 'president', 'presiding'),
 ('medical', 'child', 'support'),
 ('great', 'lakesseaway', 'system'),
 ('provisions', 'better', 'care'),
 ('action', 'eastern', 'montana'),
 ('mr', 'kline', 'minnesota'),
 ('today', 'express', 'support'),
 ('hr', '2466', 'rollcall'),
 ('controlled', 'chair', 'ranking'),
 ('tea', '21', 'air'),
 ('members', 'committee', 'enjoy'),
 ('information', 'technology', 'training'),
 ('bipartisan', 'reform', 'act'),
 ('wanted', 'come', 'back'),
 ('mr', 'speaker', 'came'),
 ('authorized', 'hazardous', 'fuels'),
 ('said', 'going', 'shortage'),
 ('speaker', 'today', 'rise'),
 ('new', 'york', 'giants'),
 ('ith', 'ig', 'e'),
 ('act', 'grant', 'additional'),
 ('2nd', 'infantry', 'division'),
 ('illinois', 'mr', 'kirk'),
 ('enzi', 'concurrent', 'resolution'),
 ('packers', 'stockyards', 'act'),
 ('services', 'independent', 'consultant'),
 ('yield', 'gentleman', 'oklahoma'),
 ('proposed', 'constitutional', '

In [28]:
len(trigram_list)

8717

In [29]:
def getCount(series, trigram):
    try:
        return(series[trigram])
    except:
        return(0)

In [None]:
for count, trigram in enumerate(trigram_list):
    colname = ' '.join(trigram)
    df3[colname] = df3['trigram'].apply(getCount, trigram = trigram)
    #print(count)

In [31]:
write = df3.reset_index(drop = True)
write = write.drop(['speech', 'len', 'trigram', 'top trigram'], axis = 1)

In [32]:
politicians_trigram = write.iloc[:,:2]
politician_trigrams_count = write.iloc[:,2:]

In [33]:
# politicians_trigram.to_csv("politicians_trigram.csv")
# politician_trigrams_count.to_csv("politician_trigrams_count.csv")

In [None]:
df3