# Wordcount

Note: The Oath Keepers data set contains ~250,000 lines. Reading it takes a couple of minutes, so be patient. Processing the whole data set takes much longer, so this example uses only a subset of the data file to test the analysis flow.


In [1]:
import pandas as pd

In [2]:
# prepare cleanup function
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
# Module-level resources shared by clean().
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' corpora
# to be installed locally (e.g. via nltk.download) -- confirm.
# English stop words as a set, for fast membership tests.
stop = set(stopwords.words('english'))
# Every ASCII punctuation character from string.punctuation, as a set.
exclude = set(string.punctuation)
# Lemmatizer instance reused for every word.
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalize one document into a space-separated string of lemmas.

    Each whitespace-separated token of the lower-cased text is processed as
    follows:

    1. it is dropped if it is a stop word (module-level ``stop``);
    2. every punctuation character (module-level ``exclude``) is deleted;
    3. what remains is lemmatized with ``lemma.lemmatize``.

    Stop words are matched on the raw token and on the token with leading and
    trailing punctuation trimmed, so that tokens such as "the," or "(and" are
    filtered too. Testing only the raw token let them slip through and
    reappear as "the"/"and" once the punctuation was removed.

    Args:
        doc: Text of a single document (str).

    Returns:
        str: The surviving lemmas joined by single spaces ("" if none remain).
    """
    edge_punct = "".join(exclude)
    kept = []
    for token in doc.lower().split():
        # Keep the raw-token test as well: a stop-word entry may itself
        # contain punctuation (e.g. an apostrophe).
        if token in stop or token.strip(edge_punct) in stop:
            continue
        bare = "".join(ch for ch in token if ch not in exclude)
        # Tokens made up solely of punctuation vanish entirely.
        if bare:
            kept.append(lemma.lemmatize(bare))
    return " ".join(kept)

In [3]:
# read data
# Load the pickled object stored in the file "nationalOathKeepers" (a relative
# path, so it is resolved against the current working directory). The cells
# below access it as a DataFrame with a 'post_content' column.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle("nationalOathKeepers")

In [4]:
# clean up
# Run clean() on the text of every post and split the normalized string on
# whitespace, so each row of 'post_clean' holds that post's list of tokens.
df["post_clean"] = [clean(text).split() for text in df["post_content"]]

In [5]:
# Print the top words in the RAW input data (df["post_content"]), i.e. the
# text as loaded, before clean() is applied: stop words, capitalisation and
# attached punctuation are all still present in these counts.
import collections
ncount = 500
# Feed the tokens to the Counter post by post instead of first joining the
# whole column into one huge string; the counts and their order are the same.
collections.Counter(
    word for post in df["post_content"] for word in post.split()
).most_common(ncount)

[('the', 1582989),
 ('to', 1006634),
 ('and', 842466),
 ('of', 818602),
 ('a', 641531),
 ('in', 478128),
 ('is', 472041),
 ('I', 464949),
 ('that', 405641),
 ('for', 311807),
 ('Reply', 253953),
 ('on', 247566),
 ('Re:', 245411),
 ('be', 237161),
 ('are', 235115),
 ('it', 232941),
 ('have', 228500),
 ('you', 227037),
 ('by', 215665),
 ('not', 204554),
 ('with', 202607),
 ('this', 185492),
 ('as', 185019),
 ('we', 164909),
 ('The', 161592),
 ('or', 156052),
 ('will', 155513),
 ('they', 155421),
 ('was', 150820),
 ('from', 137370),
 ('our', 136107),
 ('at', 135493),
 ('my', 120143),
 ('all', 116001),
 ('but', 113599),
 ('their', 108424),
 ('an', 104104),
 ('what', 103805),
 ('would', 101001),
 ('who', 100285),
 ('your', 96304),
 ('do', 95367),
 ('has', 94495),
 ('can', 86762),
 ('he', 82433),
 ('if', 80122),
 ('about', 78623),
 ('out', 78091),
 ('one', 74884),
 ('people', 71751),
 ('just', 71436),
 ('his', 70118),
 ('up', 68966),
 ('Posted', 68717),
 ('Originally', 67992),
 ('so', 66949)