<h1 align="center"><font color="0066FF" size=110>Babel Powered Jupyter Notebook</font></h1>


In [1]:
%matplotlib inline
import pickle
import pprint as pp
import re
from collections import Counter

import matplotlib as mpln
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from scipy import stats

# Let long text columns (post titles) display without being truncated.
pd.set_option("display.max_colwidth", 1000)



In [1]:
# Read the US data collected by craigcrawler.
usa_raw = pd.read_csv("data/us.csv", index_col=0)
post_count_total_raw = len(usa_raw)
# Non-null title counts per state / region. groupby returns these ordered by
# label, NOT by count.
post_count_by_state_raw = usa_raw.groupby("state").count()["title"]
post_count_by_region_raw = usa_raw.groupby("region").count()["title"]

# idxmax() yields the label with the highest count; index[0] would only be the
# first label in groupby order because the counts above are not sorted.
print(("\n{0:,} total posts extracted from {3:,} regions over {4} " +
       "states. The most popular\nstate was {1}, and the most " +
       "popular region was, surprisingly, {2}.").format(post_count_total_raw,
                                                        post_count_by_state_raw.idxmax(),
                                                        post_count_by_region_raw.idxmax(),
                                                        len(post_count_by_region_raw),
                                                        len(post_count_by_state_raw)))




In [1]:
# Geography lookup table: maps census geography ids to display names.
# The geography id is also the index used by the census table.
GEO_NAME = "GEO.display-label"
GEO_KEY = "GEO.id"

# Skip the first data row (presumably the column-description row -- confirm),
# keep only the display-name column and the first 52 entries.
state_keys = (
    pd.read_csv("data/census/DEC_10_DP_G001_with_ann.csv")
    .iloc[1:]
    .set_index(GEO_KEY)
    .filter([GEO_NAME])
    .iloc[:52]
)
state_keys = state_keys.loc[state_keys[GEO_NAME] != "Puerto Rico"]



In [1]:
  # Column ids in the census table. Only two of the hundreds available matter here.
  TOT_NUM_ID = "HD01_S001" # total number key
  TOT_PER_ID = "HD02_S001" # total percent key

  # Load the census table, keep the total-number column and attach the state
  # names; the right join keeps exactly the geographies listed in state_keys.
  census = (
      pd.read_csv("data/census/DEC_10_DP_DPDP1_with_ann.csv")
      .iloc[1:]
      .set_index(GEO_KEY)
      .filter([TOT_NUM_ID])
      .join(state_keys, how="right")
  )
  census.columns = ["population", "state"]
  census = census.set_index("state")
    
  def correct_stat(s):
      """
      Parse a population string into an int.

      Some entries carry a parenthesised annotation after the number,
      e.g. "25145561(r48514)"; everything from the "(" onwards is ignored.
      A string that starts with "(" is passed to int() unchanged.
      """
      head, paren, _ = s.partition("(")
      if paren and head:
          return int(head)
      return int(s)

  # Convert every population entry to a plain int.
  census["population"] = census["population"].apply(correct_stat)

  # District of Columbia is left out of the state-level table.
  census = census.drop("District of Columbia", axis=0)



In [1]:
  import requests
  from scrapy import Selector

  # Scrape the 2016 national results table from uselectionatlas.org.
  atlas_url = "http://uselectionatlas.org/RESULTS/data.php?year=2016&datatype=national&def=1&f=1&off=0&elect=0"
  atlas_response = requests.get(atlas_url, timeout=30)
  # Fail loudly on an HTTP error instead of parsing an error page into empty columns.
  atlas_response.raise_for_status()
  atlas_source = atlas_response.text
  select = Selector(text=atlas_source).xpath('//*[@id="datatable"]/tbody/tr')

  # Vote counts are formatted with thousands separators, e.g. "1,234,567".
  convert = lambda s: int(s.replace(',', ''))
  # Build real lists (not lazy map objects) so the positional fix-up below and
  # the DataFrame construction work on both Python 2 and Python 3.
  vote_names = [str(name) for name in select.xpath('td[3]/a/text()').extract()]
  # Correct name for DC
  # NOTE(review): relies on DC being the ninth row of the scraped table -- confirm.
  vote_names[8] = "District of Columbia"
  clinton_votes = [convert(v) for v in select.xpath('td[17]/text()').extract()]
  trump_votes = [convert(v) for v in select.xpath('td[18]/text()').extract()]

  gen_votes = pd.DataFrame({"clinton": clinton_votes, "trump": trump_votes}, index=vote_names)

  # "trumpism" = Trump's share of the combined Clinton + Trump vote.
  trump_favor = pd.DataFrame(gen_votes["trump"]/gen_votes.sum(axis=1), columns=["trumpism"], index=vote_names)
  voting = gen_votes.join(trump_favor).sort_values("trumpism", ascending=False)
  voting = voting.drop("District of Columbia")

  # for pretty printing: five highest, a separator row, five lowest (lowest first)
  voting_space = pd.DataFrame([["------", "------", "------"]],index=["*SPACE*"], columns=voting.columns)
  pd.concat([voting[:5], voting_space, voting[-5:].sort_values("trumpism")])



In [1]:
#+END_SRC

#+RESULTS:



In [1]:
  # Confirm all expected regions and states present
  assert len(usa_raw["state"].unique()) == 52 # expected number of states
  assert len(usa_raw["region"].unique()) == 416  # expected number of regions

  # Confirm that there are no posts without regions/states. Not all CL
  # regions have subregions, so it's okay for null subregions.
  assert len(usa_raw[usa_raw["state"].isnull()].index) == 0
  assert len(usa_raw[usa_raw["region"].isnull()].index) == 0

  # Find regions/subregions for which there are no posts
  postless_regions = usa_raw[usa_raw["title"].isnull()]
  postless_regions_times = usa_raw[usa_raw["date"].isnull()]

  # not actually an effective test, but good enough
  assert len(postless_regions) == len(postless_regions_times)

  # Report success only after every assertion above has actually run.
  print("Data tests... \n\nAssertions Passed\n\n")

  print(("{0:,} regions/subregions over {1} states without " +
         "any posts.").format(len(postless_regions), postless_regions["state"].nunique()))



In [1]:
# Remove rows lacking a title or a date (regions that had no posts).
usa = usa_raw.dropna(axis=0, how="any", subset=["title", "date"])
assert len(usa_raw) - len(usa) == len(postless_regions)

# Exclude the "Territories" bucket (Guam, Puerto Rico) and District of Columbia.
usa = usa[~usa["state"].isin(["Territories", "District of Columbia"])]



In [1]:
# Every state with posts must appear in the census table and vice versa.
# The length comparison additionally catches duplicate labels in the census
# index. (The closing parenthesis of len() was misplaced before, so that half
# of the check compared nothing and was always truthy.)
assert set(usa.state.unique()) == set(census.index) and len(usa.state.unique()) == len(census.index)

print("Census data complete")



In [1]:
# Every state with posts must appear in the voting table and vice versa.
# The length comparison additionally catches duplicate labels in the voting
# index. (The closing parenthesis of len() was misplaced before, so that half
# of the check compared nothing and was always truthy.)
assert set(usa.state.unique()) == set(voting.index) and len(usa.state.unique()) == len(voting.index)

print("Voting data complete")



In [1]:
  # Number of posts per state, busiest states first.
  patronage = pd.DataFrame(
      usa.groupby('state').size(), columns=["patronage"]
  ).sort_values(by="patronage", ascending=False)

  print("Top ten most frequented states:\n{}".format(patronage.head(10)))



In [1]:
    # Posts per resident for every state present in both tables.
    cl_by_state = patronage.join(census, how="inner")
    usage = cl_by_state["patronage"] / cl_by_state["population"].astype(float)

    # Min-max rescale: the lowest-usage state maps to 0, the highest to 1.
    usage_weighted = (usage - usage.min()) / (usage.max() - usage.min())
    weighted_usage = pd.DataFrame(usage_weighted, columns=["usage"])

    # Counts, population and rescaled usage side by side, heaviest users first.
    state_usage = pd.concat([cl_by_state, weighted_usage], axis=1)
    state_usage = state_usage.sort_values("usage", ascending=False)




In [1]:
  # Separator row for the compact top-5 / bottom-5 display below.
  state_usage_space = pd.DataFrame(
      [["------"] * 3], index=["*SPACE*"], columns=state_usage.columns)

  # Five heaviest users, the separator, then the five lightest (lightest first).
  pd.concat([
      state_usage.head(5),
      state_usage_space,
      state_usage.tail(5).sort_values("usage"),
  ])



In [1]:
# Define pat here so this cell does not depend on a later cell having been
# executed first (it was previously used before any cell assigned it).
pat = state_usage.sort_values("patronage", ascending=True)
x = np.arange(len(pat))

# Population per state, ordered by ascending post count.
plt.bar(x, pat.population)



In [1]:
# States ordered by ascending post count; x is their positional axis.
pat = state_usage.sort_values("patronage", ascending=True)
x = np.arange(len(pat))

# Minimal chrome: hide the top/right spines and keep ticks on the bottom/left.
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

plt.xlabel("Usage", fontsize=16)
plt.ylabel("States", fontsize=16)

# Histogram of the rescaled usage values. Uses pat (defined above) rather than
# `states`, which is only created in a later cell; the missing comma between
# the arguments (a SyntaxError) is also restored.
plt.hist(pat.usage,
         color="#3F5D7D", bins=15)



In [1]:
# Three exploratory plots of population vs. Craigslist activity per state.

# 1) Rescaled usage, with states ordered by ascending population.
plt.bar(x, pat.sort_values("population").usage)

# 2) Side-by-side bars on two y-axes: population (left) and post count (right).
fig = plt.figure() # Create matplotlib figure

ax = fig.add_subplot(111) # Create matplotlib axes
ax2 = ax.twinx() # Create another axes that shares the same x-axis as ax.

width = 0.4

# position=1 / position=0 shift the two bar series to either side of each tick.
pat.population.plot(kind='bar', color='red', ax=ax, width=width, position=1)
pat.patronage.plot(kind='bar', color='blue', ax=ax2, width=width, position=0)

ax.set_ylabel('population')
# NOTE(review): the series drawn on ax2 is pat.patronage, but the axis is labelled 'usage' -- confirm which was intended.
ax2.set_ylabel('usage')

# 3) All columns of pat as grouped bars; every bar after the first len(pat)
# patches is re-bound to the secondary axis' data transform.
ax = pat.plot(kind="bar")
ax2 = ax.twinx()
for r in ax.patches[len(pat):]:
    r.set_transform(ax2.transData)
ax2.set_ylim(0, 2);






In [1]:
# State populations in ascending order, plotted against position.
plt.plot(x, state_usage["population"].sort_values().values)



In [1]:
# Drop the five most populous states (not only California) before plotting.
p1 = state_usage.sort_values("population", ascending=False)[5:]

# NOTE(review): bars are placed at raw population values with the default bar
# width -- a scatter plot may have been intended; confirm.
plt.bar(p1["population"], p1["usage"])



In [1]:
# Attach each post's state-level voting figures, then keep only the posts whose
# title matches "trump" (inner join with the find_strs result).
# NOTE(review): find_strs is defined in a later cell, so this cell fails on a
# fresh Restart & Run All -- confirm the intended cell order.
post_politics = usa.join(voting, on="state").join(find_strs("trump"), how="inner")



In [1]:
# Usage and voting figures per state, lowest usage first, capped at 50 rows.
states = (
    state_usage
    .join(voting, how="left")
    .sort_values("usage")
    .iloc[:50]
)
# Overlaid distributions of rescaled usage and Trump vote share.
plt.hist([states["usage"], states["trumpism"]], bins=30)



In [1]:
# Pairwise correlations between the activity measures and Trump vote share
# (filter silently skips any listed column that is absent).
print(states.filter(items=["patronage", "usage", "normalized", "trumpism"]).corr())



In [1]:
# Stop words: common English words (plus a few tokens such as "re", "s", "t")
# that post_words() removes when called with no_pop=True.
# NOTE(review): "and" appears twice and "I" is the only capitalised entry.
pop_english_words = ["the", "re", "a", "s", "t", "i", "of", "to", "and", "and", "in", "is", "it", "you", "that", "he", "was", "for", "on", "are", "with", "as", "I", "his", "they", "be", "at", "one", "have", "this", "from", "or", "had", "by", "hot", "but", "some", "what", "there", "we", "can", "out", "other", "were", "all", "your", "shit", "when", "up", "use", "word", "how", "said", "an", "each", "she", "which", "do", "their", "time", "if", "will", "way", "about", "many", "fuck", "then", "them", "would", "write", "like", "so", "these", "her", "long", "make", "thing", "see", "him", "two", "has", "look", "more", "day", "could", "go", "come", "did", "my", "sound", "no", "most", "number", "who", "over", "know", "water", "than", "call", "first", "people", "may", "down", "side", "been", "now", "find"]



In [1]:
  def post_words(df, no_pop=False, stopwords=None):
      """
      Split every post title in df into words.

      Parameters:
          df: DataFrame with a string "title" column.
          no_pop: when True, drop stop words from the result.
          stopwords: iterable of words to drop when no_pop is True; defaults
              to the module-level pop_english_words list.

      Returns:
          A list of the words (runs of \\w characters) in title order, with
          their original capitalisation.

      Stop-word matching is case-insensitive, so "The" and "AND" are removed
      along with "the" and "and".
      """
      # Join once instead of summing strings: linear time, and an empty frame
      # yields "" (-> []) rather than the integer 0 that Series.sum() returns.
      words = re.findall(r'\w+', " ".join(df.title))
      if no_pop:
          if stopwords is None:
              # pop_english_words is a list of the most popular (and boring)
              # English words. E.g., "and", "to", "the", etc.
              stopwords = pop_english_words
          boring = set(word.lower() for word in stopwords)
          words = [word for word in words if word.lower() not in boring]
      return words

  def words(df=usa, no_pop=False):
      """
      Count how often each word occurs across all post titles in df.

      Parameters:
          df: DataFrame with a "title" column (defaults to the usa frame as it
              exists when this function is defined).
          no_pop: forwarded to post_words(); when True, stop words are left out.

      Returns:
          A Series named "counts", indexed by lower-cased word, sorted from
          most to least frequent. An empty frame yields an empty Series.
      """
      # word counts across all posts
      word_counts = Counter(word.lower() for word in post_words(df, no_pop))

      # Build the Series straight from the Counter: works on Python 2 and 3
      # and does not fail when there are no words at all.
      corpus = pd.Series(word_counts, name="counts", dtype="int64")

      return corpus.sort_values(ascending=False)



In [1]:
  def find_strs(substr, df=usa):
      """
      Select the post titles in df that contain substr, ignoring case.

      Parameters:
          substr: pattern to look for; it is handed to the regex engine, so
              regex metacharacters keep their special meaning.
          df: DataFrame with a "title" column (defaults to the usa frame as it
              exists when this function is defined).

      Returns:
          A Series of the matching titles, keeping their original index and
          named "*<substr>*". Titles that are missing never match.
      """
      matches = df.title.str.contains(substr, flags=re.IGNORECASE, na=False)

      # rename() without inplace: the documented return value of an in-place
      # rename is None, so the result must not be taken from it.
      return df.title[matches].rename("*" + substr + "*")

  def categ_strs(findings):
      """
      Flag which capitalisations of the search term occur in each title.

      findings is a Series of titles whose name is the search term wrapped in
      one extra character on each side (e.g. "*trump*"). The result is a
      DataFrame on the same index with the columns "proper", "uppercase" and
      "lower": 1 where that spelling is found in the title by a case-sensitive
      regex search, NaN where it is not.
      """
      term = findings.name[1:-1]

      # Each spelling is built lazily, once per title, by its builder.
      builders = (
          ("proper", lambda: term[0].upper() + term[1:].lower()),
          ("uppercase", term.upper),
          ("lower", term.lower),
      )

      def column(label, build):
          hits = findings.apply(
              lambda title: 1 if re.search(build(), title) else np.nan)
          return hits.rename(label)

      return pd.concat([column(label, build) for label, build in builders],
                       axis=1)

  def eval_strs(string, df=usa):
      """
      Find the titles in df matching string and append the capitalisation
      flags from categ_strs() to them.

      Returns a DataFrame with the flag columns followed by the matched titles.
      """
      matches = find_strs(string, df)
      breakdown = categ_strs(matches)
      return breakdown.join(matches)





In [1]:
  # Word counts for posts from states where Trump's two-party share was below
  # 45% ("libs") and above 55% ("conservs"), stop words removed.
  lib_words = words(
      df=post_politics.loc[post_politics["trumpism"] < .45], no_pop=True
  ).rename("libs")
  conserv_words = words(
      df=post_politics.loc[post_politics["trumpism"] > .55], no_pop=True
  ).rename("conservs")



In [1]:
  # Ratio of "libs" to "conservs" counts, for words seen at least ten times on
  # both sides (words missing from either side become NaN and are dropped).
  rat = lambda df: df.libs/df.conservs
  ratio = pd.DataFrame().join([lib_words[lib_words >= 10], conserv_words[conserv_words >= 10]],
                                      how="outer").apply(rat, axis=1).dropna()
  ratio = ratio.rename("dem/rep ratio")
  # NOTE(review): posts_corpus is not defined in any visible cell; presumably
  # the overall word counts returned by words() -- confirm.
  lib_con_ratio = pd.DataFrame(posts_corpus).join(ratio.sort_values(ascending=False), how="inner")
  # DataFrame.sort() no longer exists in pandas; sort_values() is the
  # replacement and is what the rest of this notebook already uses.
  lib_con_ratio = lib_con_ratio.sort_values("dem/rep ratio", ascending=False)
  lib_con_ratio[:10]
  #lib_con_ratio = posts_corpus.join(lib_con_ratio.sort_values(ascending=False), on="words")



In [1]:
#+END_SRC

#+RESULTS:



In [1]:
# (Stray keystroke removed: the bare name `l` raised NameError on Run All.)



In [1]:
# Titles matching "trump" with their capitalisation flags and state.
trumps = eval_strs("trump").join(usa["state"], how="inner")
# Per-state counts of each flag, joined with the state-level figures.
trumps_by_state = (
    trumps.groupby("state").count()
    .join(states)
    .drop(["clinton", "trump"], axis=1)
)
# Share of matching titles written in upper / proper case, and share of all
# of a state's posts that match.
up_over_trumps = trumps_by_state["uppercase"].div(
    trumps_by_state["*trump*"]).rename("uppercase usage")
prop_over_trumps = trumps_by_state["proper"].div(
    trumps_by_state["*trump*"]).rename("propercase usage")
trumps_over_pat = trumps_by_state["*trump*"].div(
    trumps_by_state["patronage"]).rename("trumps usage")
trumps_by_state = trumps_by_state.join(
    [prop_over_trumps, up_over_trumps, trumps_over_pat], how="outer")



In [1]:
# Keep the vote share and the three usage ratios, order by "trumps usage" and
# skip the first (lowest) row.
trumps_vs_trumpism = (
    trumps_by_state
    .filter(items=["trumpism", "propercase usage", "uppercase usage", "trumps usage"])
    .sort_values("trumps usage", ascending=True)
    .iloc[1:]
)

# One histogram per column.
trumps_vs_trumpism.hist(bins=50)
#plt.hist([prop_over_cap.trumpism, prop_over_cap[""]], bins=30)



In [1]:
# All posts with their state's voting figures, plus a "*trump*" column holding
# the title again for posts that match "trump".
trump_posts = usa.join(voting, on="state").join(find_strs("trump"), how="outer")

# Posts from states where Trump's two-party share was below 40%.
# (The original referred to an undefined name `t` here.)
anti_trump_posts = trump_posts[trump_posts.trumpism < .4]

print("Selecting states that are especially anti-trump:\n{0}".format(anti_trump_posts.title.sample(10)))

# Sorted distinct states; avoids summing (concatenating) every column per state
# just to read the group labels.
print("\nPolitically liberal states composing the above sampling:\n{0}".format(
    sorted(anti_trump_posts.state.dropna().unique())))



In [1]:
from os import path
from PIL import Image

from wordcloud import WordCloud, STOPWORDS

# NOTE(review): absolute, machine-specific path -- consider a configurable
# project directory so the notebook runs elsewhere.
d = path.dirname("/home/dodge/workspace/craig-politics/")

# Silhouette image used as the word-cloud mask.
trump_mask = np.array(Image.open(path.join(d, "Trump_silhouette.png")))

stopwords = set(STOPWORDS)

# The mask is trump_mask, loaded above; `alice_mask` was a leftover name from
# the wordcloud example and is not defined anywhere in this notebook.
wc = WordCloud(background_color="white", max_words=2000, mask=trump_mask,
               stopwords=stopwords)


# generate word cloud
# NOTE(review): posts_sum is not defined in any visible cell; presumably the
# concatenated post titles -- confirm and define it before this cell.
wc.generate(posts_sum)

# save to file
wc.to_file(path.join(d, "Trump_test.png"))

# show the cloud, then the mask it was drawn into
plt.imshow(wc)
plt.axis("off")
plt.figure()
plt.imshow(trump_mask, cmap=plt.cm.gray)
plt.axis("off")
plt.show()



In [1]:
def check_ascii(post):
    """
    Report whether a title can be encoded as ASCII.

    Returns True when post.encode('ascii') succeeds and False when it raises
    a UnicodeError.
    """
    try:
        post.encode('ascii')
    except UnicodeError:
        return False
    else:
        return True

# Truth vector: True where the title is pure ASCII.
ascii_titles_tv = usa["title"].map(check_ascii)
ascii_posts = usa.loc[ascii_titles_tv]
nonascii_posts = usa.loc[~ascii_titles_tv]

distinct_states = nonascii_posts["state"].unique()
summary = ("{0:,} of {1:,} total posts were non-ascii ({2:.2f}%), confined to {3} "
           "states.")
print(summary.format(len(nonascii_posts),
                     len(usa),
                     len(nonascii_posts)/float(len(usa)) * 100,
                     len(distinct_states)))



In [1]:
# Distinct non-ASCII titles per state, most first.
nonascii_states_count = (
    nonascii_posts.groupby("state")["title"]
    .nunique()
    .sort_values(ascending=False)
)
print("\nTop ten most popular unicode states:")
print(nonascii_states_count[:10])

pennsylvania = nonascii_posts[nonascii_posts["state"] == "Pennsylvania"]
print(pennsylvania["title"].tolist()[0])

print("\nA single Trump memester seems to be responsible for the chaos "
      "in Pennsylvania.\n" "I suspect that these crazy unicode posts "
      "are mostly done by a very small\nset of people, though there is "
      "no way to tell.")
# Note: the slice below shows the first five rows, not a random draw.
print("\nRandom sample of 5 non-ascii Pennsylvania posts")
print(pennsylvania["title"][:5])

# Per-region counts (not displayed, since this is not the cell's last expression).
pennsylvania.groupby("region").count()

# Fraction of Pennsylvania's non-ASCII titles that are distinct.
post_uniqueness = pennsylvania["title"].nunique() / float(len(pennsylvania["title"]))



In [1]:
# Number of distinct Craigslist regions with posts in Colorado.
print("\n\n{0} regions in Colorado".format(
    usa.loc[usa["state"] == "Colorado", "region"].nunique()))



In [1]:
# Aggregate the titles of every state with sum, then keep the Kansas entry.
# NOTE(review): presumably this concatenates the title strings with no
# separator between posts, and does so for all states only to use one -- confirm.
posts = usa.groupby("state")["title"].agg(sum)["Kansas"]



In [1]:
  from textblob import TextBlob

  def semants(text):
      """
      Mean sentiment polarity over the sentences of text.

      The text is split into sentences by TextBlob and the polarity of each
      sentence is averaged. Returns 0.0 when TextBlob finds no sentences
      (e.g. an empty string) instead of dividing by zero.
      """
      sentences = TextBlob(text).sentences
      if not sentences:
          return 0.0

      total = sum(sentence.sentiment.polarity for sentence in sentences)
      return float(total) / len(sentences)



In [1]:
# Correlation matrix of the per-state means (after joining `semantics` onto the
# posts) together with the voting figures.
# NOTE(review): `semantics` is not defined in any visible cell; presumably a
# per-post Series named "semants" built with semants() -- confirm.
# NOTE(review): sorting right before .corr() does not affect the correlations.
total_semants = usa.join(semantics, how="outer").groupby("state").mean().join(voting).sort_values("semants").corr()



In [1]:
  # States ordered by ascending post count; x is their positional axis.
  pat = state_usage.sort_values("patronage", ascending=True)
  x = np.arange(len(pat))

  # Minimal chrome: hide the top/right spines and keep ticks on the bottom/left.
  ax = plt.subplot(111)
  ax.spines["top"].set_visible(False)
  ax.spines["right"].set_visible(False)

  ax.get_xaxis().tick_bottom()
  ax.get_yaxis().tick_left()

  # Only the tick font size is set; tick positions are left to matplotlib.
  # (The fixed 5000-30000 y-ticks, the ChessGames.com caption and the
  # chess-elo savefig comment came from an unrelated example and did not
  # describe this data.)
  plt.xticks(fontsize=14)
  plt.yticks(fontsize=14)

  # Labels match the quantity actually plotted below.
  plt.xlabel("Usage", fontsize=16)
  plt.ylabel("States", fontsize=16)

  # Fine-grained histogram of the rescaled usage values.
  plt.hist(states.usage,
           color="#3F5D7D", bins=100)

