In [679]:
%matplotlib inline
import matplotlib as mtl
import numpy as np
import scipy
from scipy import stats
import pandas as pd

import pprint as pp
import pickle
import re

pd.options.display.max_colwidth = 1000

In [678]:
# read us data collected by craigcrawler
usa_raw = pd.read_csv("data/us.csv", index_col=0)
post_count_total_raw = len(usa_raw)
post_count_by_state_raw = usa_raw.groupby("state").count()["title"].sort_values(ascending=False)
post_count_by_region_raw = usa_raw.groupby("region").count()["title"].sort_values(ascending=False)

print ("\n{0:,} total posts exctracted from {3:,} regions over {4} "+ 
       "state. The most popular\nstate was {1}, and the most " + 
       "popular region was, surprisingly, {2}.").format(post_count_total_raw,
                                                        post_count_by_state_raw.index[0],
                                                        post_count_by_region_raw.index[0],
                                                        len(post_count_by_region_raw),
                                                        len(post_count_by_state_raw))



38,692 total posts exctracted from 416 regions over 52 state. The most popular
state was California, and the most popular region was, surprisingly, denver, CO.


In [675]:
# some preprocessing to check data corrupted files
assert len(usa_raw["state"].unique()) == 52
len(usa_raw["region"].unique()) == 416

len(usa_raw["subregion"].unique()) + len(usa_raw["region"].unique())

82

In [0]:
#
# How responsible is trump for my corrupted data?
#

In [667]:
#
# US census data for 2010 from the census bureau
#

# Census data is not labelled exactly as my data is. Some states are named a little differently,
# and regions are almost never named similarly. These have to be resolved.

census = pd.read_csv("data/census/DEC_10_DP_DPDP1_with_ann.csv")[1:]
# keys for the census data. Only really care about two of them (there are hundreds):
TOT_NUM_ID = "HD01_S001" # total number key
TOT_PER_ID = "HD02_S001" # total percent key

# Keys for geography stuff. Table is an index table.
# These keys are used as index for census table.
census_keys = pd.read_csv("data/census/DEC_10_DP_G001_with_ann.csv")[1:]
GEO_KEY = "GEO.display-label"
GEO_ID = "GEO.id"
# keys used to reference states in census
census_states_keys = dict(zip(list(census_keys[GEO_KEY]), list(census_keys[GEO_ID][:52])))

print "Sample of census keys, if curious:"
zip(list(census_states_keys), list(census_states_keys.values()))[:5]

census[:5]


Sample of census keys, if curious:


        GEO.id GEO.id2 GEO.display-label        HD01_S001 HD02_S001 HD01_S002  \
1  0400000US01      01           Alabama  4779736(r38235)     100.0    304957   
2  0400000US02      02            Alaska   710231(r38823)     100.0     53996   
3  0400000US04      04           Arizona          6392017     100.0    455715   
4  0400000US05      05          Arkansas  2915918(r39193)     100.0    197689   
5  0400000US06      06        California         37253956     100.0   2531333   

  HD02_S002 HD01_S003 HD02_S003 HD01_S004    ...    HD01_S182 HD02_S182  \
1       6.4    308229       6.4    319655    ...      3311304    ( X )    
2       7.6     50887       7.2     50816    ...       448438    ( X )    
3       7.1    453680       7.1    448664    ...      4134117    ( X )    
4       6.8    196877       6.8    197559    ...      1929218    ( X )    
5       6.8   2505839       6.7   2590930    ...     20742929    ( X )    

  HD01_S183 HD02_S183 HD01_S184 HD02_S184 HD01_S185 HD02_S185 

In [641]:
#
# Standardizing census and cl data names This may have to be limited
# to states names. Regions will be, at the very least, a huge pain.
# Most likely will be unresolvable.
#
misnamed_states = []
for name in census_states_keys:
    if post_count_by_state_raw[name] < 0: misnamed_states.append(state)

# Standarize top city names

KeyError: 'Puerto Rico'

In [588]:
#
# ascii vs. unicode
#

def check_ascii(post):
    """
    Determines whether a title is properly encoded.
    """
    title = post["title"]
    try:
        title.encode('ascii')
        return True
    except UnicodeError:
        return False

ascii_titles_tv = usa_raw.apply(check_ascii, axis=1)
nonascii_posts = usa_raw[~ascii_titles_tv]

distinct_states = nonascii_posts["state"].unique()
print ("\n{0:,} of {1:,} total posts were non-ascii ({2:.2f}%), confined to {3} "
       + "states.").format(len(nonascii_posts),
                       total_posts_raw,
                       len(nonascii_posts)/float(total_posts_raw) * 100,
                       len(distinct_states))



199 of 38,981 total posts were non-ascii (0.51%), confined to 23 states.


In [589]:
#
# ascii vs. unicode
#

nonascii_states_count = nonascii_posts.groupby(
    "state").title.nunique().sort_values(ascending=False)
print "\nTop ten most popular unicode states:"
print nonascii_states_count[:10]

pennsylvania = nonascii_posts[nonascii_posts["state"] == "Pennsylvania"]
print pennsylvania["title"].tolist()[0]

print("\nA single Trump memester seems to be responsible for the chaos " +
      "in Pennsylvania.\n" + "I suspect that these crazy unicode posts " +
      "are mostly done by a very small\nset of people, though there is " +
      "no way to tell.")
print "\nRandom sample of 5 non-ascii Pennsylvania posts"
print pennsylvania["title"][:5]



Top ten most popular unicode states:
state
Pennsylvania    18
Maryland         8
New York         7
California       6
Arizona          5
Florida          5
Washington       4
Texas            4
Colorado         3
Connecticut      3
Name: title, dtype: int64
ðŸ™ŠðŸ™‰The ZOMBIES are comingðŸ™ŠðŸ™‰

A single Trump memester seems to be responsible for the chaos in Pennsylvania.
I suspect that these crazy unicode posts are mostly done by a very small
set of people, though there is no way to tell.

Random sample of 5 non-ascii Pennsylvania posts
18505                     ðŸ™ŠðŸ™‰The ZOMBIES are comingðŸ™ŠðŸ™‰
18514    ðŸ‘‘HAPPY NEW YEARSðŸ‘‘ America ðŸ‘‘ DONALD J.TRUMPðŸ‘‘
18515    ðŸŽ€HAPPY NEW YEARðŸŽ€ AMERICA ðŸ‘‘ DONALD J. TRUMPðŸ‘‘
18530              ðŸ’¥DONALD J. TRUMPðŸ’¥[Need a Tissue Anyone]
18540                                     ðŸ—½Keep on CryingðŸ—½
Name: title, dtype: object


In [536]:
state_patronage = usa.groupby('state').count()["title"].sort_values(ascending=False)
region_patronage = usa.groupby('region').count()["title"].sort_values(ascending=False)

#
# INCLUDE NORMALIZED FIGURES (posts per capita)
#

print "\nTop ten most popular states"
print state_patronage[:10]

# Denver is about 10 times as populous as nyc, for example
print "\nTop ten most popular regions"
print region_patronage[:10]

print "\n\n{0} regions in Colorado".format(usa[usa['state'] == "Colorado"]["region"].nunique())



Top ten most popular states
state
California      3801
Florida         3585
Texas           3155
New York        2358
Colorado        2066
Pennsylvania    1881
Washington      1400
Ohio            1396
Arizona         1381
Michigan        1359
Name: title, dtype: int64

Top ten most popular regions
region
denver, CO               1271
new york city            1023
seattle-tacoma            883
pittsburgh, PA            801
phoenix, AZ               790
south florida             750
los angeles               728
minneapolis / st paul     700
dallas / fort worth       681
SF bay area               679
Name: title, dtype: int64


7 regions in Colorado


In [0]:
def occurance(s, word):
    """
    Return number of occurances of word in s. Include properties.
    """
    # completely lowercase
    # first letter capitalized
    # partially capitalized
    # completely capitalized

occurance("this IS a senTence")

def capitalization(s, word):
    """
    Return qualities of capitization of word in s
    """
    