In [215]:
import numpy as np
import tensorflow as tf
import pandas as pd
import nltk
nltk.download('punkt')
import sklearn as sk

WORD_VECTOR_PATH = "data/wordVectors.txt"
VOCAB_PATH = "data/vocab.txt"
DATA_PATH = "data/primary_debates.csv"

[nltk_data] Downloading package punkt to /Users/ezshen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [216]:
def load_and_preprocess_data(in_file):
    df = pd.read_csv(in_file, quotechar='"', delimiter=",")

    # filter out bad speakers
    df = df[(df.Speaker != 'AUDIENCE') & (df.Speaker != 'OTHER') & (df.Speaker != 'CANDIDATES') & (df.Speaker != 'QUESTION')]
    
    # split text into sentences
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    df = pd.concat([pd.Series(row.Party, tokenizer.tokenize(row.Text)) for _, row in df.iterrows()]).reset_index()
    df = df.rename(index=str, columns={'index': 'Text', 0: 'Party'})
    
    for i, row in df.iterrows():
        if row.Text[-1] == '.': # get rid of periods and make lowercase
            row.Text = row.Text.lower()[:-1]
        else: 
            row.Text = row.Text.lower()
            
        if row.Party == 'Republican': # Democratic = 0, Republican = 1
            row.Party = 1 
        else: row.Party = 0 

    return df.Text.values.astype(str), df.Party.values.astype(int)

In [217]:
x, y = load_and_preprocess_data(DATA_PATH)


In [220]:
from string import punctuation
all_text = ''.join([c for c in x if c not in punctuation])
text = all_text.split('\n')

all_text = ' '.join(text)
words = all_text.split()
len(words)

468030

In [225]:
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

vocab_ints = []
for each in text:
    vocab_ints.append([vocab_to_int[word] for word in each.split()])
len(vocab_ints[0])

468030

In [228]:
sent_lens = Counter([len(x) for x in vocab_ints])
print sent_lens
print("Zero-length reviews: {}".format(sent_lens[0]))
print("Maximum review length: {}".format(max(sent_lens)))

Counter({468030: 1})
Zero-length reviews: 0
Maximum review length: 468030
