In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from collections import Counter
import string
import csv


In [2]:
# Name of the input data file; it is appended to the data directory path to form `bigfile`.
file_name = 'charters_parsed_03-08.csv'

In [3]:
# Working-directory prefix and the full path of the input data file.
dir_prefix = '/home/jovyan/work/'
bigfile = f'{dir_prefix}Charter-school-identities/data/{file_name}'

# Columns of interest in the data file.
# Not included yet: 'LON1516', 'LAT1516'
keepcols = [
    'MEMBER',
    'FTE',
    'YEAR_OPENED',
    'YEAR_CLOSED',
    'TOTETH',
    'PCTETH',
    'LEA_NAME',
    'STATENAME',
    'TOTFRL',
    'ALL_RLA00PCTPROF_1415',
    'ALL_MTH00PCTPROF_1415',
    'LOCALE',
    'SCH_NAME',
    'ADDRESS14',
    'TITLEI',
    'ESS_COUNT',
    'PROG_COUNT',
    'RIT_COUNT',
    'ESS_STRENGTH',
    'PROG_STRENGTH',
    'WEBTEXT',
    'KEYWORDS_TEXT',
    'IDEOLOGY_TEXT',
]
# Note that WEBTEXT is empty, but KEYWORDS_TEXT and IDEOLOGY_TEXT have text data! Feel free to play with these.

In [4]:
# Only the columns listed here are loaded from the file.
# NCESSCH is meant to serve as the row identifier when saving to a DataFrame / CSV.
used_columns = ['IDEOLOGY_TEXT', 'NCESSCH', 'SCH_NAME']

# Open the tab-separated file as an iterator instead of loading it all at once;
# rows are pulled from `bigdata_iter` on demand.
bigdata_iter = pd.read_csv(
    bigfile,
    sep="\t",
    low_memory=False,
    encoding="utf-8",
    na_values={"TITLEI": ["M", "N"]},
    iterator=True,
    chunksize=25,
    usecols=used_columns,
)

In [5]:
# Translation table that turns every ASCII punctuation character into a space,
# so punctuation separates words instead of gluing them together.
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Characters that survive cleaning: the ASCII letters and the space.
# Built from string.ascii_letters so that no letter can be left out by a typo
# (the hand-typed lowercase alphabet used to stop at 'y', which deleted every 'z').
whitelist = set(string.ascii_letters + ' ')


def clean_text_counter(txt):
    """Return a Counter of the lower-cased words found in ``txt``.

    Must be applied to ALL text before its words are used as column names.

    Processing steps, in order:
      1. every punctuation character is replaced by a space;
      2. the text is lower-cased;
      3. every character that is not an ASCII letter or a space is deleted
         (not replaced), so digits, newlines, tabs and non-ASCII characters
         vanish and the words around them are joined;
      4. the result is split on whitespace and the words are counted.

    Input:  txt -- the raw text (str). Any non-string value (e.g. the float
            NaN that pandas uses for a missing cell, or None) is treated as
            "no text".
    Output: collections.Counter mapping each word to its number of
            occurrences; empty when there is no text.
    """
    # Missing cells arrive as non-strings; report them as having no words
    # rather than relying on a catch-all exception handler.
    if not isinstance(txt, str):
        return Counter()

    cleaned = txt.translate(translator).lower()
    cleaned = ''.join(ch for ch in cleaned if ch in whitelist)
    return Counter(cleaned.split())

In [7]:
# Quick check of clean_text_counter. The doubled backslashes put literal "\x..." sequences
# in the string, so the backslashes are treated as punctuation and the digits are dropped.
sample_string = "\\x056Friendly School\\x12123\\x This is our website"
clean_text_counter(sample_string)

Counter({'is': 1,
         'our': 1,
         'school': 1,
         'this': 1,
         'website': 1,
         'x': 2,
         'xfriendly': 1})

In [6]:
def write_new_columns(df, counter, spec_val):
    """Return a copy of ``df`` with one extra row built from ``counter`` and ``spec_val``.

    Every key of ``counter`` that is not yet a column of ``df`` becomes a new
    column (appended after the existing ones, in the counter's own order) and
    is filled with 0 for all rows that were already present.

    Input:
      df       -- DataFrame accumulated so far (may have zero rows).
      counter  -- Counter (or any mapping) of word -> count for the new row.
      spec_val -- dict of values that are not word counts, keyed by column
                  name, e.g. {'SCH_NAME': 'MONTE VISTA HS'}. The keys MUST
                  ALREADY EXIST as columns of ``df``; unknown keys are ignored.
    Output:
      A new DataFrame; ``df`` itself is not modified.

    For each column the new row holds, in order of precedence: the count from
    ``counter``, else the value from ``spec_val``, else 0.
    """
    existing_columns = list(df.columns)
    known = set(existing_columns)

    # Iterate the counter (not a set) so the order of new columns is
    # reproducible from run to run.
    new_columns = [word for word in counter if word not in known]
    header = existing_columns + new_columns

    row_values = []
    for col in header:
        if col in counter:
            row_values.append(counter[col])
        elif col in spec_val:
            row_values.append(spec_val[col])
        else:
            row_values.append(0)
    new_row = pd.DataFrame([row_values], columns=header)

    # Nothing to merge with yet: the new row is the whole table. This also
    # avoids concatenating an empty frame, whose dtypes carry no information.
    if len(df) == 0:
        return new_row

    # Add all new columns in one step (zero-filled) instead of inserting them
    # one by one into the caller's frame.
    widened = df.reindex(columns=header, fill_value=0)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([widened, new_row], ignore_index=True)

#write_new_columns(df, Counter({'a':1}), {'first_name': 10101})

In [8]:
def process_data(df_iter, file_name, text_field, spec_fields, c):
    """Count the words of ``text_field`` for up to ``c`` rows and save them as a CSV.

    Input:
      df_iter     -- pandas reader created with ``iterator=True``; rows are
                     pulled from it one at a time with ``get_chunk(1)``.
      file_name   -- output path WITHOUT extension; ``file_name + '.csv'`` is
                     created or overwritten.
      text_field  -- name of the column whose text is word-counted,
                     e.g. 'IDEOLOGY_TEXT'.
      spec_fields -- list of columns copied through unchanged (they are not
                     word counts), e.g. ['NCESSCH', 'SCH_NAME'].
      c           -- maximum number of rows to analyse; meant for testing on
                     a small sample.
    Output: None. The result is written to disk: one line per analysed row,
            the ``spec_fields`` columns first, then one column per distinct
            word (0 where a row does not contain the word). The DataFrame
            index is written as the first, unnamed column.

    The whole table is held in memory as a dense rows x words matrix, so this
    is NOT suitable for very large data sets.
    """
    spec_fields = list(spec_fields)

    parsed_rows = []   # one (spec values, word Counter) pair per analysed row
    vocabulary = {}    # dict used as an insertion-ordered set of all words seen
    counter = 0
    while counter < c:
        try:
            chunk = df_iter.get_chunk(1)
        except StopIteration:
            # The source has fewer than ``c`` rows: keep what was read so far.
            break

        spc_fld = {field: chunk[field].values[0] for field in spec_fields}
        ideo = clean_text_counter(chunk[text_field].values[0])

        parsed_rows.append((spc_fld, ideo))
        vocabulary.update(dict.fromkeys(ideo))
        counter += 1
        print(counter)

    # Build the table once, after the full vocabulary is known, instead of
    # re-allocating the DataFrame for every row.
    spec_set = set(spec_fields)
    header = spec_fields + [word for word in vocabulary if word not in spec_set]
    table = [
        [counts[col] if col in counts else spec.get(col, 0) for col in header]
        for spec, counts in parsed_rows
    ]
    df = pd.DataFrame(table, columns=header)

    df.to_csv(file_name + '.csv')
    
# Word-count IDEOLOGY_TEXT for the next 100 rows read from bigdata_iter and write the result
# to 'data.csv', keeping NCESSCH and SCH_NAME as identifying columns.
process_data(bigdata_iter, 'data', 'IDEOLOGY_TEXT', ['NCESSCH', 'SCH_NAME'], 100)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
