## Initializing Python

In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

In [2]:
# IMPORTING KEY PACKAGES
import csv # for reading in CSVs and turning them into dictionaries
import re # for regular expressions
import os # for navigating file trees
import nltk # for natural language processing tools
import pandas # for working with dataframes
import numpy as np # for working with numbers

In [3]:
# FOR CLEANING, TOKENIZING, AND STEMMING THE TEXT
import string # source of the common punctuation characters used below

from nltk import word_tokenize, sent_tokenize # widely used word- and sentence-level tokenizers
from nltk.corpus import stopwords # for one method of eliminating stop words, to clean the text
from nltk.stem.porter import PorterStemmer # an approximate method of stemming words (it just cuts off the ends)
# pearsonr is imported from the public scipy.stats namespace; the
# scipy.stats.stats submodule path is deprecated in recent SciPy releases.
from scipy.stats import pearsonr

stopenglish = list(stopwords.words("english")) # the English stop words, as a list
punctuations = list(string.punctuation) # the string of common punctuation symbols, split into a list of single characters

In [4]:
# FOR VISUALIZATIONS
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Visualization parameters
% pylab inline 
% matplotlib inline
matplotlib.style.use('ggplot')

Populating the interactive namespace from numpy and matplotlib


## Reading in preliminary data

In [5]:
# Load the newly merged data into a pandas dataframe and, in the same step,
# discard any schools with no webtext that might have snuck in (none currently)
df = (
    pandas.read_csv("merged.csv", encoding="Latin-1")
    .dropna(subset=["WEBTEXT"])
)

## Using PVI Scores
Here we'll use Partisan Voting Index (PVI) scores instead of voting records to assess political preference.
PVI scores are based on congressional district, but the data we have does not say which congressional district a school belongs to. So, we will use zip codes to map schools to their congressional districts.

In [6]:
def strip_leading_zero(s):
    """Return `s` with a single leading "0" removed.

    The string is returned unchanged when it is empty, when it is only one
    character long (so "0" stays "0"), or when it does not start with "0".
    Only one zero is removed: "007" becomes "07".
    """
    # startswith() is safe on the empty string, whereas indexing s[0] would
    # raise IndexError (e.g. for an empty field in a delimited line).
    if len(s) > 1 and s.startswith("0"):
        return s[1:]
    return s

# Get mapping from zip code to congressional district
# Each line of the file is comma-separated. Field 1 is used as the zip code key;
# fields 0 and 2 are joined as "field0,field2" to form the district key that is
# later looked up in pvi_dict (presumably state and district number - confirm
# against the file). A single leading zero is stripped from every field.
# If a zip code appears on several lines, the last line wins.
zip_dict = dict()
with open("zipcode_to_cd.txt") as f:
    # Iterate over the file object so that a blank line in the middle of the file
    # is skipped rather than mistaken for end-of-file.
    for line in f:
        line = line.replace("\n", "")
        if not line.strip():
            continue
        fields = [strip_leading_zero(el) for el in line.split(",")]
        zip_dict[fields[1]] = fields[0] + "," + fields[2]

# Get mapping from congressional district to PVI score
# Note that negative scores indicate a Democratic score and positive scores indicate a Republican score
# Each line of the file is whitespace-separated. Fields 0 and 1 are joined as
# "field0,field1" to form the district key (the same format as the values stored
# in zip_dict), and field 2 is stored as the score, kept as a string.
# A single leading zero is stripped from every field.
pvi_dict = dict()
with open("pvi_by_county.txt") as f:
    # Iterate over the file object so that a blank line in the middle of the file
    # is skipped rather than mistaken for end-of-file.
    for line in f:
        fields = [strip_leading_zero(el) for el in line.split()]
        if not fields:
            continue # blank or whitespace-only line
        pvi_dict[fields[0] + "," + fields[1]] = fields[2]

In [7]:
# Get Berkeley's PVI score
# Sanity check of the two lookups chained together: zip code '94709' -> district key
# (via zip_dict) -> PVI score (via pvi_dict). The score is displayed as a string
# because pvi_dict stores the values as they were read from the file.
pvi_dict[zip_dict['94709']]

'-37'

In [8]:
# Map schools to their PVI scores and store in a "PVI" column
# Schools whose zip code is missing from zip_dict get a PVI of 0 and are counted,
# so the size of that fallback group shows up in the printed summary.
# NOTE(review): 0 may also be a legitimate PVI value, in which case these schools
# cannot be told apart from genuinely neutral districts downstream - confirm.
pvis = []
not_found = 0
# Iterate over the LZIP column only: iterrows() builds a full Series for every row
# (including the web text) just to read one field, and does not preserve dtypes.
for lzip in df["LZIP"]:
    zip_key = str(lzip) # zip_dict is keyed by strings
    if zip_key not in zip_dict:
        pvis.append(0) # For now defaulting to 0 when zip code not found
        not_found += 1
    else:
        # A district key missing from pvi_dict raises KeyError here rather than
        # being silently defaulted.
        pvis.append(int(pvi_dict[zip_dict[zip_key]]))
print(str(not_found) + " zip codes not found")
df["PVI"] = pvis
df[["SCHNAM", "PVI"]].head(10)

7 zip codes not found


Unnamed: 0,SCHNAM,PVI
0,RICHLAND TWO CHARTER HIGH,-21
1,POLK STATE COLLEGE COLLEGIATE HIGH SCHOOL,-8
2,RIVER CITY SCHOLARS CHARTER ACADEMY,4
3,DETROIT ENTERPRISE ACADEMY,-29
4,LIGHTHOUSE COMMUNITY SCH INC,8
5,WESTLAKE CHARTER MIDDLE,-18
6,VAN GOGH CHARTER,-14
7,SUMMIT ACADEMY TRANSITION HIGH SCHOOL DAYTON,3
8,WESTCHESTER ACADEMY FOR INTERNATIONAL STUDIES,13
9,CITY ACADEMY,-11
