In [275]:
%matplotlib inline
import matplotlib as mtl
import numpy as np
import scipy
from scipy import stats
import pandas

import pprint as pp
import pickle

pd.options.display.max_colwidth = 1000

In [513]:
# read us data collected by craigcrawler
usa_raw = pd.read_csv("data/us.csv", index_col=0)
total_posts_raw = len(usa_raw)
state_post_count_raw = usa_raw.groupby("state").count()["title"].sort_values(ascending=False)
region_post_count_raw = usa_raw.groupby("region").count()["title"].sort_values(ascending=False)

print "\n{0:,} total posts exctracted.".format(total_posts_raw)


38,981 total posts exctracted.


In [514]:
# some preprocessing to remove corrupted files
def check_titles(post):
    """
    Determines whether a title is properly encoded.
    """
    title = post["title"]
    try:
        title.encode('ascii')
        return True
    except UnicodeError:
        return False

uncorrupted_titles_tv = usa_raw.apply(check_titles, axis=1)

usa = usa_raw[uncorrupted_titles_tv]
corrupt_posts = usa_raw[~uncorrupted_titles_tv]

distinct_states = corrupt_posts["state"].unique()

print ("\n{0:,} of {1:,} total posts were corrupted during extraction, "
       + "for a {2:.2f}% corruption rate,\nwith corruptions confined to {3} " +
       "states.").format(len(corrupt_posts),
                       total_posts_raw,
                       len(corrupt_posts)/float(total_posts_raw) * 100,
                       len(distinct_states))




199 of 38,981 total posts were corrupted during extraction, for a 0.51% corruption rate,
with corruptions confined to 23 states.


In [523]:
# some analysis of data corruption
corrupted_states_count = corrupt_posts.groupby("state").title.nunique().sort_values(ascending=False)
print "\nTop ten most corrupted states:"
print corrupted_states_count[:10]

pennsylvania = corrupt_posts[corrupt_posts["state"] == "Pennsylvania"]

print("\nA single Trump memester seems to be responsible for the chaos in Pennsylvania.\n" +
      "I suspect that these crazy corrupted unicode posts are mostly done by a very small\n" + 
      "set of people.")
print "\nRandom sample of 5 corrupt Pennsylvania posts"
print pennsylvania["title"][:5]



Top ten most corrupted states:
state
Pennsylvania    18
Maryland         8
New York         7
California       6
Arizona          5
Florida          5
Washington       4
Texas            4
Colorado         3
Connecticut      3
Name: title, dtype: int64

A single Trump memester seems to be responsible for the chaos in Pennsylvania.
I suspect that these crazy corrupted unicode posts are mostly done by a very small
set of people.

Random sample of 5 corrupt Pennsylvania posts
18505                     ðŸ™ŠðŸ™‰The ZOMBIES are comingðŸ™ŠðŸ™‰
18514    ðŸ‘‘HAPPY NEW YEARSðŸ‘‘ America ðŸ‘‘ DONALD J.TRUMPðŸ‘‘
18515    ðŸŽ€HAPPY NEW YEARðŸŽ€ AMERICA ðŸ‘‘ DONALD J. TRUMPðŸ‘‘
18530              ðŸ’¥DONALD J. TRUMPðŸ’¥[Need a Tissue Anyone]
18540                                     ðŸ—½Keep on CryingðŸ—½
Name: title, dtype: object


In [0]:
# Search for connection errors. A connection error would result in an entire region being lost.
# This can be done by 

In [0]:
#
# How responsible is trump for my corrupted data?
#

In [524]:
#
# US census data for 2010 from the census bureau
#
census = pd.read_csv("data/census/DEC_10_DP_DPDP1_with_ann.csv")[1:]

# keys 
census_keys = pd.read_csv("data/census/DEC_10_DP_G001_with_ann.csv")[1:]
GEO_KEY = "GEO.display-label"
GEO_ID = "GEO.id"
TOT_NUM_ID = "HD01_S001"
TOT_PER_ID = "HD02_S001"
census_states_keys = list(census_keys[GEO_ID][:52])

census[census[GEO_KEY] == census_states_keys[0]][TOT_NUM_ID]


Series([], Name: HD01_S001, dtype: object)

In [536]:
state_patronage = usa.groupby('state').count()["title"].sort_values(ascending=False)
region_patronage = usa.groupby('region').count()["title"].sort_values(ascending=False)

print "\nTop ten most popular states"
print state_patronage[:10]

# Denver is about 10 times as populous as nyc, for example
print "\nTop ten most popular regions"
print region_patronage[:10]

print "\n\n{0} regions in Colorado".format(usa[usa['state'] == "Colorado"]["region"].nunique())



Top ten most popular states
state
California      3801
Florida         3585
Texas           3155
New York        2358
Colorado        2066
Pennsylvania    1881
Washington      1400
Ohio            1396
Arizona         1381
Michigan        1359
Name: title, dtype: int64

Top ten most popular regions
region
denver, CO               1271
new york city            1023
seattle-tacoma            883
pittsburgh, PA            801
phoenix, AZ               790
south florida             750
los angeles               728
minneapolis / st paul     700
dallas / fort worth       681
SF bay area               679
Name: title, dtype: int64


7 regions in Colorado
