In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from collections import Counter
import string
import csv

In [2]:
# Name of the parsed charter-school data file to load.
file_name = "charters_parsed_03-08.csv"

In [3]:
# Location of the parsed data file: working-directory prefix + repository data folder + file name.
dir_prefix = '/home/jovyan/work/'
bigfile = ''.join([dir_prefix, 'Charter-school-identities/data/', file_name])

# Columns of interest in the parsed data set.
# Not included yet: 'LON1516', 'LAT1516'
keepcols = [
    'MEMBER', 'FTE', 'YEAR_OPENED', 'YEAR_CLOSED', 'TOTETH', 'PCTETH',
    'LEA_NAME', 'STATENAME', 'TOTFRL',
    'ALL_RLA00PCTPROF_1415', 'ALL_MTH00PCTPROF_1415',
    'LOCALE', 'SCH_NAME', 'ADDRESS14', 'TITLEI',
    'ESS_COUNT', 'PROG_COUNT', 'RIT_COUNT', 'ESS_STRENGTH', 'PROG_STRENGTH',
    'WEBTEXT', 'KEYWORDS_TEXT', 'IDEOLOGY_TEXT',
]
# Note that WEBTEXT is empty, but KEYWORDS_TEXT and IDEOLOGY_TEXT have text data! Feel free to play with these.

In [4]:
# Only the columns listed in used_columns are read from the file.
# NCESSCH is meant to serve as the row identifier when saving to a DataFrame and csv.
used_columns = ['IDEOLOGY_TEXT', 'NCESSCH', 'SCH_NAME']

# Open the tab-separated file as a chunked reader instead of loading it all at once.
bigdata_iter = pd.read_csv(
    bigfile,
    sep="\t",
    low_memory=False,
    encoding="utf-8",
    na_values={"TITLEI": ["M", "N"]},
    iterator=True,
    chunksize=25,
    usecols=used_columns,
)

In [5]:
# Pull a single row from the chunked reader and keep only its IDEOLOGY_TEXT column.
# NOTE(review): get_chunk(1) advances bigdata_iter, so every later read from the
# same iterator (e.g. the process_data call below) starts AFTER this row.
# Confirm that leaving this first row out of the processed output is intended.
idlist = bigdata_iter.get_chunk(1)["IDEOLOGY_TEXT"]

In [6]:
# Show the raw text of the first (and only) entry that was pulled above.
idlist.tolist()[0]

"['Alaska Public Schools Database\\n', 'Lower Kuskokwim School District\\n', 'School Calendar for 2017-2018\\n', 'School ID\\n', 'School Website\\n', 'School Email\\n', 'School Details\\n', '                    Teacher Certification: \\n', 'Teaching & Learning Support Program Contacts\\n', 'Teaching & Learning Support Program Contacts\\n', 'Teacher Certification\\n']"

In [7]:
# Input: String
# Output: A list of lower-cased words.
# Description: Every punctuation character is replaced by a space, the text is
# lower-cased, and every character that is neither an ASCII letter nor
# whitespace (digits, symbols, ...) is dropped before splitting on whitespace.
# Will be used for ALL TEXT before putting into the set.
# Anything that is not a string (e.g. None or a NaN from a missing cell)
# yields an empty list.
# RESULT SHOULD BE PASSED INTO COUNT_IDEOLOGY
# NOTE(review): backslash is punctuation, so a literal two-character "\n"
# sequence in the raw text leaves a stand-alone 'n' token behind - confirm
# whether the input should be unescaped before it reaches this function.

# Maps each punctuation character to a single space.
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
# Characters kept by clean_text: all ASCII letters (including 'z') and the space.
whitelist = set(string.ascii_letters + ' ')

def clean_text(txt):
    """Return the lower-cased alphabetic words found in ``txt`` as a list."""
    # Missing or non-text cells produce no words instead of raising.
    if not isinstance(txt, str):
        return []

    lowered = txt.translate(translator).lower()
    # Keep letters, and keep every kind of whitespace (newline, tab,
    # non-breaking space, ...) so that it still separates words in split().
    kept = ''.join(ch for ch in lowered if ch in whitelist or ch.isspace())
    return kept.split()

In [8]:
# Input: List of cleaned-up words
# Output: Counter mapping each word to the number of times it occurs
# Description: Tallies how often every word appears in the given list.

def count_ideology(lst):
    """Return a Counter holding the frequency of each word in ``lst``."""
    return Counter(lst)

In [9]:
# Input: CSV file, Counter, Spec_val (Dictionary)
# Output: None
# Description: Appends one new row to the CSV file and adds a column for every
# counted term that is not a column yet. Rows that existed before get 0 in
# every newly added column.
# REQUIRES: CSV FILE TO ALREADY EXIST. THE SPEC_VAL COLUMNS MUST EXIST ALREADY
# csv_filename: the file name of the file we are modifying
# counter: a Counter (essentially a dictionary) of the terms we want to add
# spec_val: a dictionary with values not in counter that we want to add.
#           Its keys MUST EXIST IN THE CSV COLUMNS ALREADY.
#           For the purpose of adding school names and school ids,
#           i.e. {'SCH_NAME': 'MONTE VISTA HS'}

def write_new_columns(csv_filename, counter, spec_val):
    """Append one row of word counts (plus special values) to ``csv_filename``.

    The whole file is read, extended and written back. For each column the new
    row takes the count from ``counter`` if the column is a counted term, else
    the value from ``spec_val`` if present there, else 0.
    """
    df = pd.read_csv(csv_filename)

    # Terms that are not columns yet are added in one step (in the counter's
    # own order, so the column layout is reproducible); existing rows get 0.
    existing = set(df.columns)
    new_columns = [term for term in counter if term not in existing]
    if new_columns:
        df = df.reindex(columns=list(df.columns) + new_columns, fill_value=0)

    # Build the new row in column order. A counted term wins over a special
    # value of the same name; columns known to neither are filled with 0.
    header = list(df.columns)
    row_val = []
    for col_header in header:
        if col_header in counter:
            row_val.append(counter[col_header])
        elif col_header in spec_val:
            row_val.append(spec_val[col_header])
        else:
            row_val.append(0)
    new_row = pd.DataFrame([row_val], columns=header)

    # DataFrame.append no longer exists in pandas 2.x, so use pd.concat.
    # A header-only file has no rows to keep; use the new row directly rather
    # than concatenating with an empty frame.
    if df.empty:
        df = new_row
    else:
        df = pd.concat([df, new_row], ignore_index=True)
    df.to_csv(csv_filename, index=False)

# Example: write_new_columns("names.csv", Counter({'a': 1}), {'first_name': 10101})

In [11]:
# Input: Data-frame iterator, output file name, text field, special fields, row limit
# Output: None
# Description: Counts the words of each row delivered by the iterator passed in.
# Creates the CSV file and then extends it one row at a time.
# df_iter: the iterator for the data frame you want to get information from.
# file_name: a new file is created (an existing one is overwritten) where we
#            store all the data -> Might want to modify so we could continue
#            modifying the file
# text_field: the field in the dataframe that we actually want to do the word
#             counts on, string i.e. 'IDEOLOGY_TEXT'
# spec_fields: the special fields are the columns included that are not words
#              we are counting. For example, we would want to include the
#              school id and the school names.
#              list of strings i.e. ['NCESSCH', 'SCH_NAME']
# c: just an integer on how many rows you want to analyze. This is meant for testing.
# THIS WILL NOT WORK FOR BIG DATA SETS, SINCE FOR EVERY ROW IT HAS TO OPEN AND CLOSE THE CSV FILE

def process_data(df_iter, file_name, text_field, spec_fields, c):
    """Write word counts for up to ``c`` rows of ``df_iter`` to ``file_name``.

    Each row is pulled with ``df_iter.get_chunk(1)``, its ``text_field`` is
    cleaned and counted, and the counts together with the ``spec_fields``
    values are appended to the CSV. Stops early, without raising, when the
    iterator runs out of rows. Prints the running row count as progress.
    """
    # Start from a fresh file that holds only the special-field header.
    # UTF-8 matches the default encoding pandas uses to read and rewrite it.
    with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
        csv.DictWriter(csvfile, fieldnames=spec_fields).writeheader()

    counter = 0
    while counter < c:
        try:
            chunk = df_iter.get_chunk(1)
        except StopIteration:
            # Fewer than c rows were left in the iterator: keep what was written.
            print('No more rows to read after {} row(s).'.format(counter))
            break

        # Identifying values (e.g. school id and name) of this single-row chunk.
        spc_fld = {val: chunk[val].values[0] for val in spec_fields}

        words = clean_text(chunk[text_field].values[0])
        ideo = count_ideology(words)
        write_new_columns(file_name, ideo, spc_fld)

        counter += 1
        print(counter)
    
# Count IDEOLOGY_TEXT words for the next 20 rows of the reader and store them,
# keyed by school id and school name, in data.csv.
# NOTE(review): bigdata_iter has already been advanced by the earlier
# get_chunk(1) preview, so processing starts at the row after it.
process_data(
    df_iter=bigdata_iter,
    file_name='data.csv',
    text_field='IDEOLOGY_TEXT',
    spec_fields=['NCESSCH', 'SCH_NAME'],
    c=20,
)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
