In [None]:
# you may need to install textblob (pip install textblob)
from textblob import TextBlob
import nltk  # textblob uses this

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
 
import pandas as pd
import numpy as np

import os
import itertools
import re

from IPython.display import display, display_html, HTML

In [None]:
# run once if you haven't before
# Fetches the NLTK 'punkt' and 'stopwords' resources. The 'stopwords' corpus is
# the one read through `nltk.corpus.stopwords`, which a later cell imports.
# NOTE(review): 'punkt' is presumably needed by TextBlob's tokenizer - confirm.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
# extract zip file

import zipfile

# Unpack the archive into the current working directory. The context manager
# closes the archive even if extraction fails part-way through.
with zipfile.ZipFile('feinstein.zip', 'r') as zip_ref:
    zip_ref.extractall(".")

In [None]:
# read in each file
# `pr` maps file name -> full text of that file.
pr = {}
skipped_files = []

# sorted() makes the document order (and therefore the row order of everything
# built from `pr`) reproducible; os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir('Feinstein/')):
    path = os.path.join('Feinstein', filename)
    if not os.path.isfile(path):
        continue  # sub-directories cannot be read as text
    try:
        # The handle is called `fh`, not `df`, because `df` is the name the
        # rest of the notebook uses for the main DataFrame.
        with open(path, 'r', encoding='utf-8') as fh:
            pr[filename] = fh.read()
    except UnicodeDecodeError:
        # Still best-effort, but remember what was dropped instead of hiding it.
        skipped_files.append(filename)

if skipped_files:
    print('Skipped {} file(s) that are not valid UTF-8: {}'.format(
        len(skipped_files), skipped_files))

In [None]:
# Number of files whose text was read into `pr`.
len(pr)

In [None]:
# One row per file: the index holds the file name, the `text` column the raw contents.
df = (
    pd.DataFrame({"filename": list(pr.keys()), "text": list(pr.values())})
    .set_index('filename')
)

In [None]:
# Preview the first few rows of the file-name-indexed frame.
df.head()

In [None]:
# Wrap every document's text in a TextBlob and keep it alongside the raw text.
df['blob'] = df['text'].map(TextBlob)

In [None]:
# Bag-of-words term counts, with NLTK's English stop words removed.
vectorizer = CountVectorizer(stop_words=stopwords.words("english"))
doc_term_preview = vectorizer.fit_transform(df.text)

# Show the dimensions and a small corner of the matrix rather than densifying
# all of it: a dense copy needs (documents x vocabulary) cells of memory.
print( doc_term_preview.shape )
print( doc_term_preview[:5, :10].todense() )

# vocabulary_ maps each term to its column index; print a sample, not the
# whole dictionary, to keep the cell output readable.
print( dict(itertools.islice(vectorizer.vocabulary_.items(), 20)) )

In [None]:
# Document-term count matrix: one row per row of `df`, one column per vocabulary term.
X = vectorizer.fit_transform(df['text'])

In [None]:
# Row 0 of X, i.e. the term counts for the first document in `df`.
X[0]

In [None]:
# Column index of each term in the fitted vocabulary (prints None when a term is absent).
for term in ('education', 'schools', 'teach'):
    print(vectorizer.vocabulary_.get(term))

In [None]:
# Terms that define the "education" topic.
education = ['education','schools','teach']

# Look up each term's column in X, then show just those columns as a dense matrix.
education_cols = [vectorizer.vocabulary_.get(term) for term in education]
X[:, education_cols].todense()

In [None]:
# Per-document counts of each education term. The rows are labelled with
# df.index (the frame whose text column X was built from) rather than with a
# fresh listing of `pr`, so the labels cannot drift out of step with X's rows.
educ_topic = pd.DataFrame(
    X[:, [vectorizer.vocabulary_.get(x) for x in education]].todense(),
    columns=education,
    index=df.index,
)

In [None]:
# (number of documents, number of education terms)
educ_topic.shape

In [None]:
# Preview the per-document term counts.
educ_topic.head()

In [None]:
# Total number of occurrences of 'education' across all documents.
educ_topic.education.sum()

In [None]:
# For each term, the number of documents in which it appears at least once.
(educ_topic > 0).sum()

In [None]:
# Presence/absence version of the counts: 1 if the term occurs in the document, else 0.
educ_binary = educ_topic.gt(0).astype('int')

In [None]:
# File names of the documents that contain at least one of the education terms.
has_education_term = educ_binary.sum(axis=1) > 0
educ_binary.index[has_education_term]

In [None]:
def highlight(text, terms):
    '''Return `text` as an HTML object with every occurrence of `terms` wrapped in <mark>.

    Parameters
    ----------
    text : str
        Plain-text document to render.
    terms : iterable of str
        Terms to highlight (a list, a pandas Index, ...). They are matched
        literally and case-insensitively; empty terms are ignored.

    Returns
    -------
    IPython.display.HTML
        The text with HTML special characters escaped, matches wrapped in
        <mark>...</mark>, each newline turned into two <br> tags, and every
        '$' backslash-escaped.
    '''
    import html  # local import keeps this cell runnable on its own

    # Materialise first: `terms` may be a pandas Index, whose truth value is ambiguous.
    literal_terms = [str(term) for term in terms if str(term)]

    if literal_terms:
        # Longest first, so 'teacher' wins over 'teach' at the same position.
        literal_terms.sort(key=len, reverse=True)
        # re.escape: a term is text to find, not a regular expression.
        pattern = r'(' + '|'.join(re.escape(term) for term in literal_terms) + r')'
        # With one capturing group, re.split alternates non-match / match pieces.
        # Matching on the raw text and escaping each piece afterwards means a
        # term can never match inside an HTML entity such as '&amp;'.
        pieces = re.split(pattern, text, flags=re.IGNORECASE)
        rendered = []
        for position, piece in enumerate(pieces):
            safe_piece = html.escape(piece, quote=False)
            if position % 2:
                rendered.append('<mark>' + safe_piece + '</mark>')
            else:
                rendered.append(safe_piece)
        text = ''.join(rendered)
    else:
        # No terms: an empty alternation would match at every position.
        text = html.escape(text, quote=False)

    text = text.replace('\n', '<br><br>')
    # NOTE(review): presumably escapes '$' so the notebook does not treat it as
    # a math delimiter - confirm.
    text = text.replace('$', '\\$')
    return HTML(text)

In [None]:
# Render one document with the education terms highlighted. The file name is
# hard-coded, so this raises KeyError if that file was not loaded into `pr`.
display(highlight(pr['8Dec2006Feinstein451.txt'], educ_binary.columns))

In [None]:
# Cross-tabulate documents by whether they contain 'education' (rows) and 'schools' (columns).
pd.crosstab(educ_binary.education, educ_binary.schools)

In [None]:
# Co-occurrence table, with row/column totals, for every pair of education terms.
for row_term, col_term in itertools.combinations(educ_binary.columns, 2):
    pair_table = pd.crosstab(educ_binary[row_term], educ_binary[col_term], margins=True)
    print(pair_table)
    print()

In [None]:
#https://buhrmann.github.io/tfidf-analysis.html
# Vocabulary terms ordered by column index, so features[i] names column i of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

# Re-weight the raw counts in X into tf-idf scores.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
Xtf = tfidf_transformer.fit_transform(X)

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    Parameters
    ----------
    row : 1-D numpy array
        One score per feature.
    features : sequence of str
        Feature names; features[i] is the name for row[i].
    top_n : int
        Maximum number of features to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'tfidf', ordered from highest score to lowest.
        The two columns are present even when no rows are selected.
    '''
    # argsort is ascending, so reverse it before taking the first top_n indices.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works for an empty selection;
    # assigning .columns afterwards raises a length-mismatch error in that case.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row)

    Parameters
    ----------
    Xtr : matrix supporting row indexing and .toarray()
        Document-by-feature score matrix.
    features : sequence of str
        Feature names, one per column of Xtr.
    row_id : int
        Row of Xtr (document) to inspect.
    top_n : int
        Maximum number of features to return.

    Returns
    -------
    Whatever top_tfidf_feats returns for that row.
    '''
    # ravel() always yields a 1-D vector. np.squeeze would collapse a
    # single-feature row to a 0-d array, which cannot be indexed afterwards.
    row = Xtr[row_id].toarray().ravel()
    return top_tfidf_feats(row, features, top_n)

In [None]:
# top words for a specific document
# (row 0 of the tf-idf matrix, limited to the 10 highest-scoring terms)
top_feats_in_doc(Xtf, features, 0, 10)

In [None]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Parameters
    ----------
    Xtr : matrix supporting row indexing and .toarray()
        Document-by-feature score matrix.
    features : sequence of str
        Feature names, one per column of Xtr.
    grp_ids : list, numpy array, pandas Series/Index, or None
        Row indices of the documents to average over. None, or an empty
        collection, means "use every row" (an empty list has always behaved
        this way, so that is kept for backward compatibility).
    min_tfidf : float
        Scores below this value are treated as 0 before averaging.
    top_n : int
        Maximum number of features to return.

    Returns
    -------
    Whatever top_tfidf_feats returns for the vector of per-feature means.
    '''
    # `if grp_ids:` raises "truth value is ambiguous" for numpy arrays and
    # pandas objects, so test for None and emptiness explicitly.
    if grp_ids is not None and len(grp_ids) > 0:
        D = Xtr[np.asarray(grp_ids)].toarray()
    else:
        D = Xtr.toarray()

    # D is a fresh dense copy, so zeroing small scores does not modify Xtr.
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [None]:
# Positional row number of each document, used to index rows of the tf-idf matrix.
df['row'] = np.arange(df.shape[0])

In [None]:
# top words averaged across topical documents
# (documents containing at least one of the education terms)
topical_mask = educ_binary.sum(axis=1) > 0
topical_rows = list(df.loc[topical_mask, 'row'])
top_mean_feats(Xtf, features, topical_rows)

In [None]:
# Same summary for the documents that contain none of the education terms.
non_topical_mask = educ_binary.sum(axis=1) == 0
non_topical_rows = list(df.loc[non_topical_mask, 'row'])
top_mean_feats(Xtf, features, non_topical_rows)

In [None]:
# for more complicated terms, you can use regex
import re

In [None]:
# Flag documents whose text contains any education term as a substring, ignoring case.
substring_pattern = r'|'.join(education)
df['regex_topic'] = df['text'].str.contains(substring_pattern, flags=re.IGNORECASE)
df['regex_topic'].sum()

In [None]:
# Flag documents using the vectorizer's term counts instead, for comparison with the regex flag.
df['word_topic'] = educ_binary.sum(axis=1).gt(0)
df['word_topic'].sum()

In [None]:
# enforce word boundaries
# Every term is wrapped in \b...\b so that only whole-word occurrences match.
whole_word_pattern = r'\b'+r'\b|\b'.join(education)+r'\b'
df['regex_topic'] = df['text'].str.contains(whole_word_pattern, flags=re.IGNORECASE)
df['regex_topic'].sum()

In [None]:
# partial terms
# Word stems match as substrings, so inflected forms containing a stem are caught too.
stem_pattern = r'|'.join(['school','educ','teach'])
df['regex_topic'] = df['text'].str.contains(stem_pattern, flags=re.IGNORECASE)
df['regex_topic'].sum()