# String Cleaning Script

Author(s): Krutika Ingale <br>
Project Manager: Jaren Haber, PhD Candidate <br>
Contact: jhaber@berkeley.edu

Institution: University of California, Berkeley <br>
Program: Undergraduate Research Apprentice Program (URAP) <br>

Date created: <br>
Last modified: 11/20/18

Description: Notebook to clean strings that are in sentence format or in any format.

In [1]:
#necessary imports 
import csv
import pandas as pd
import os
import re
import string
import collections


import numpy as np
from itertools import groupby
from sklearn.feature_extraction import text
import nltk
from nltk.corpus import stopwords
import datetime


folder_prefix = '/home/jovyan/work/'


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
#read in data frame with strings for testing
cleaned_df = pd.read_pickle(folder_prefix + 'nowdata/charters_final_2015.pkl')

In [79]:
#string for testing
messy_string = cleaned_df['CMO_WEBTEXT'][39][0][3]

In [2]:
# Create list of punctuation
import string # for one method of eliminating punctuation
punctuations = list(string.punctuation) # assign list of common punctuation symbols
punctuations+=['*','•','©','–','–','``','’','“','”','...','»',"''",'..._...','--','×','|_','_','§','…','⎫'] # Add a few more punctuations also common in web text
punctuations.remove('-') ; punctuations.remove("'")
punctuations

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 '+',
 ',',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '*',
 '•',
 '©',
 '–',
 '–',
 '``',
 '’',
 '“',
 '”',
 '...',
 '»',
 "''",
 '..._...',
 '--',
 '×',
 '|_',
 '_',
 '§',
 '…',
 '⎫']

In [3]:
stop_word_list = list(set(stopwords.words('english')))

#stop_word_list consists of dates and some repetitive words
for i in range(1,13):
    stop_word_list.append(datetime.date(2008, i, 1).strftime('%B'))
for i in range(1,13):
    stop_word_list.append((datetime.date(2008, i, 1).strftime('%B')).lower())
for i in range(1, 2100):
    stop_word_list.append(str(i))
stop_word_list.append('00') 
stop_word_list.append('el')
stop_word_list.append('en')
stop_word_list.append('la')
stop_word_list.append('los')
stop_word_list.append('para')
stop_word_list.append('las')
stop_word_list.append('san')
stop_word_list.append('mr')
stop_word_list.append('mrs')
stop_word_list.append('sa')
# stop_word_list.append('angeles')
# stop_word_list.append('diego')
# stop_word_list.append('california')
# stop_word_list.append('york')
stop_word_list.append('fax')
stop_word_list.append('email')
stop_word_list.append('phone')
# stop_word_list.append('harlem')
# stop_word_list.append('bronx')
# stop_word_list.append('wi')
stop_word_list.append('am')
stop_word_list.append('pm')
stop_word_list.append('org')
stop_word_list.extend(['Menu', 'Contact Us', 'Facebook', 'Calendar', 'Lunch', 'Breakfast', 'FAQs', 'FAQ'])
# stop_word_list.append('city')
# stop_word_list.append('austin')
# stop_word_list.append('antonio')
# stop_word_list.append('texas')
#adding states abbreviations for stopwords
# states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", 
#           "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", 
#           "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
#           "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", 
#           "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV"]
# for state in states:
#           stop_word_list.append(state)
# for state in [state.lower() for state in states]:
#           stop_word_list.append(state)

#we have text.English stopwords and nltk's english stop words
stop_words_extra = text.ENGLISH_STOP_WORDS.union(stop_word_list)

In [None]:
#loading in some words that should be removed from a string  #no rem_words 
filename = folder_prefix + 'Charter-school-identities/data/wem_vocab_train250_nostem_unlapped_300d.txt'
with open(filename) as f:
    rem_words = f.read().splitlines() 


In [11]:
rem_words = rem_words[:8511]

In [12]:
#rem_words now includes a list of words to remove and a list of stop words
rem_words.append(stop_words_extra)

####  Clean_string:
clean_string() takes in a string in any format, and a list of unicode character, & a list of words to remove and:
- removes all numbers, emails, some urls, unicode characters, hex characters, and punctuation

In [101]:
unicode_list  = []
for i in range(1000,3000):
    unicode_list.append(chr(i))
    
def clean_string(messy_string, rem_words, unicode_list, givepunct): 
    s = re.sub(r"\\x.*|\\u.*|\\b.*|-|\u2605$", "", messy_string.replace(u"\xa0", u" ").replace(u"\\t", u" ").strip(" "))
    s = re.sub(r'[^\x00-\x7f]',r'', s) #got rid of hex character like \xa0\ adn a\x80
    
    s = re.sub(r'^https?:\/\/.*[\r\n]*', '', s, flags=re.MULTILINE) #gets rid of urls in forms https://
    #it's hard to get rid of urls when they aren't separated by space
    s = re.sub(r'|'.join(map(re.escape, unicode_list)), '', s) #should remove unicode characters
    li_text = []
    for word in re.split('\n|\t|\*|\r|\s', s): #split the string into a list of substrings split on the regex characters
        #gets rid of the rem_words, numbers, and emails
        if ((word not in (rem_words)) and (not word.isdigit()) and ("@" not in word)) :
            li_text.append(re.sub(r'['+givepunct+']', r'', word))
    s = ' '.join(li_text) #join strings together
    s = ''.join(i for i in s if not i.isdigit()) #2nd check to remove numbers
    s = re.sub(' +', ' ',s) #remove duplicate whitespace
    s = s.strip() #remove whitespace at beginning and end of string
    return s

####  clean_sentence:

clean_sentence() takes in a sentence with whitespaces in between words, and a list of unicode character and:
- removes all numbers, emails, urls, unicode character, hex characters, and punctuation

In [75]:
def clean_sentence(messy_string, unicode_list): #get rid of all numbers , use rem_words, use stop_words list
    """Removes numbers, emails, URLs, unicode characters, hex characters, and punctuation from a sentence 
    separated by whitespaces. Returns a tokenized, cleaned list of words from the sentence.
    
    Args: 
        Sentence, i.e. string that possibly includes spaces and punctuation
    Returns: 
        Cleaned & tokenized sentence, i.e. a list of cleaned, lower-case, one-word strings"""
    
    
    #replace \\x, \\u, \\b, or - followed by any character or anything that ends with \u2605
    #then replace \\x, \\t and then get rid of whitespace
    s = re.sub(r"\\x.*|\\u.*|\\b.*|-|\u2605$", "", messy_string.replace(u"\xa0", u" ").replace(u"\\t", u" ").strip(" "))
    
    #get rid of hex character like \xa0\ adn a\x80
    s = re.sub(r'[^\x00-\x7f]',r'', s) #replace anything that starts with a hex character 
    
    #code that basically removes all elements that appear in the unicode_list (looks like r'u1000|u10001|')
    s = re.sub(r'|'.join(map(re.escape, unicode_list)), '', s) #removes unicode
    li_text = []
    
    for word in re.split('\s', s): #splits by space 
        if ((not word.isdigit()) and ("@" not in word)) : #gets rid of the numbers and emails
        
            #gets rid of urls 
            if ((not word.startswith(('http', 'https', 'www'))) and (not word.endswith(('.com', '.net', '.gov', '.org')))):
                if ((not word.startswith('//')) and not word.endswith(('.jpg', '.pdf', 'png', 'jpeg', 'php'))): 
                                        
                    #gets rid of punctuation, and the words am and pm
                    li_text.append(re.sub(r'['+string.punctuation+']|am|pm', r'', word))
    s = ' '.join(li_text)#joins all words together
    s = s.strip() #remove beginning and ending white space in string
    return s

### Example tests below:

In [110]:
punctuations = list(string.punctuation)
punctuations.remove('-')
punctuations.remove("'") 
#addpuncts = ['*','•','©','–','–','`','’','“','”','»','-','×','|','_','§','…','⎫']
#punctuations += addpuncts # Add to punctuations list

punctstr = "".join([char for char in list(set(punctuations))])
punctstr

';|(."$=]^[<>#)~?{%,&`*\\}_+:/!@'

In [118]:
astring = "fantasmagorium."
astring.translate(None, "fm.")

TypeError: translate() takes exactly one argument (2 given)

In [112]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [154]:
st = 'This sentence //asiancelebration.org could---contai__n words, \xa0https://.org www//morphius.com 17 letters, also the date is 07-18-1996 we love //blackstonevalleyprep.org/wp-content/uploads/2016/12/video-thumbnails-copy.jpg children ' 

clean_sentence(st, unicode_list) #is the lower_case ok? #\n - sentence


'This sentence couldcontain words  letters also the date is we love children'

In [111]:
clean_string(messy_string, stop_word_list, unicode_list, punctstr) #is the lower_case ok? #\n - sentence
#get rid of punctuation in a word, underscores, equal sign, dashes, asterik


'Center Academic Success: K Higher Standards. Higher Expectations. Home About CAS School & Menus School Wellness Policy Parents Right KnowTeacher Qulifications Proposed Budget FY Budget Adoption Hearing Notice Sierra Vista Campus SV Elementary School SV Middle School SV High School A+ Anywhere Login SV Administration Douglas Campus Douglas Elementary School Douglas Middle School Douglas High School Douglas Administration Board Directors Forms & Resources Parent/Student Handbook Elementary/Middle School Parent/Student Handbook High School Transcript / Diploma Request New Student Applications Returning Student Application PowerSchool Login Tax Credit Before & After Care Clubs High School Club Schedule CyberPatriots Solar GoKart Team Employment Apply Now Contact Us Higher Standards. Higher Expectations. Get Ready! School Starts th! Mark calendars first day school Wednesday, th! School Calendars available website here. We holding annual Open House parents students th ::pm. Families drop ti

In [75]:
string 

'\t\t\t\t\tCenter for Academic Success:  K-12\t\t\t\t\nHigher Standards.  Higher Expectations.\nMenu\nHome\nAbout CAS\nFAQs\nSchool Calendar 2018-19\nBreakfast & Lunch Menus\nSchool Wellness Policy\nParentâ\x80\x99s Right to Know-Teacher Qulifications\nProposed Budget FY 2019\nBudget Adoption Hearing Notice\nSierra Vista Campus\nSV Elementary School\nSV Middle School\nSV High School\nA+ Anywhere Login\nSV Administration\nDouglas Campus\nDouglas Elementary School\nDouglas Middle School\nDouglas High School\nDouglas Administration\nBoard of Directors\nForms & Resources\nParent/Student Handbook â\x80\x93 Elementary/Middle School\nParent/Student Handbook â\x80\x93 High School\nTranscript / Diploma Request\nNew Student Applications\nReturning Student Application\nPowerSchool Login\nTax Credit 2017\nBefore & After Care Clubs\nHigh School Club Schedule\nCyberPatriots\nSolar Go-Kart Team\nEmployment\nApply Now\nContact Us\n\t\t\t\tHigher Standards.  Higher Expectations.\t\t\t\nGet Ready! Schoo

In [142]:
clean_string(cleaned_df['CMO_WEBTEXT'][123][0][3], rem_words, unicode_list ) #is the lower_case ok?

'Email Phone Gilbert Cpus Val Vista Ray Email Phone Power Cpus Power Chandler Heights Email Phone BFHS Germann Rittenhouse Email Phone Enroll Today Follow these easy steps to start the process to be part of the Benjin Franklin fily Complete this PreEnrollment Application Wait for your chosen cpuses to contact you Explore our website if you havent already Please note that this application is for the PreEnrollment process only Completion of this form does not guarantee enrollment Quick Links Home School News Calendar Links for Parents Fily Link JHHS Parent Handbook K Parent Handbook What is a Charter School Our Stories Enrollment Information Schedule a Tour Charger Voice Athletics Photo Galleries Employment Contact Us News Announcements Subscribe Summer Break Read More View All News Facebook Benjin Franklin Charter School K Events Jul First Day of School View All Events Next events arrow Previous events arrow Calendar Go to previous month July Go to next month Sun Mon Tue Wed Thu Fri Sat

In [139]:
cleaned_df['CMO_WEBTEXT'][123][0][3]

'Email: \nccenrollment@bfcsaz.com\n\rPhone: (480) 987-0722\nGilbert Campus\n\rVal Vista & Ray\nEmail: \ngcenrollment@bfcsaz.com\n\rPhone: (480) 632-0722\nPower Campus\n\rPower & Chandler Heights\nEmail: \npcenrollment@bfcsaz.com\n\rPhone: (480) 677-8400\nBFHS\n\rGermann & Rittenhouse\nEmail: \nhsenrollment@bfcsaz.com\n\rPhone: (480) 558-1197\n        Enroll Today!    \nFollow these easy steps to start the process to be part of the Benjamin Franklin family:\nComplete this\xa0Pre-Enrollment Application\nWait for your chosen campus(es) to contact you.\nExplore our website if you havenâ\x80\x99t already!\nPlease note that this application is for the Pre-Enrollment process only. Completion of this form does not guarantee enrollment.\nQuick Links\r\t\t\t\t\t\nHome\nSchool News\nCalendar\nLinks for Parents  \nFamily Link\nJH/HS Parent Handbook\nK-6 Parent Handbook\nWhat is a Charter School?\nOur Stories\nEnrollment Information\nSchedule a Tour\nCharger Voice\nAthletics\nPhoto Galleries\nEmplo

In [143]:
clean_string(cleaned_df['CMO_WEBTEXT'][905][0][3], rem_words, unicode_list) #is the lower_case ok?

'icopa Survey Leading Edge Academy Online Survey Enroll Now Click here to Enroll Leading Edge Academy is a network of K Tuitionfree Charter Schools with cpuses throughout the valley Were proud to offer advanced academics and character develoent in a technologyenriched environment Please have a look around to learn about the various cpuses and progrs that are a part of the Leading Edge Academy Network A glimpse of what youll find Tuition Free K Small Class Sizes Advanced Academics Free Full Day Kindergarten Gilbert Mesa Queen Creek and Maricopa Cpuses Online Academy In accordance with Federal Law Leading Edge Academy does not discriminate on the basis of race religion color national origin sex or disability Leading Edge Academy Home Staff Enrollment Contact Employment'

In [144]:
clean_string(cleaned_df['CMO_WEBTEXT'][900][0][3], rem_words, unicode_list) #is the lower_case ok?

's Great Hearts Gala Greater Good Fund Ways to Give Great Hearts Great Hearts is a nonprofit network of public charter schools dedicated to improving education nationwide through classical preparatory K academies A Great Hearts education prepares students to be more than just proficient test takers but rather to become greathearted leaders capable of success throughout their higher education and professional careers With a curriculum built upon a classical liberal arts tradition and a revolutionary approach to school itself Great Hearts cultivates the hearts and minds of students in the pursuit of Truth Goodness and Beauty Your browser does not support ifres Play video Become a Part of Our Community Great Hearts operates charter schools throughout Arizona and Texas with more schools opening soon Explore our academies or apply for enrollment Your Nearest Academy Search Schools Tour an Academy Sign Up Enroll at Great Hearts Apply Now Upcoming Events September Annual Great Hearts Gala Gre

In [145]:
cleaned_df['CMO_WEBTEXT'][900][0][3]

's\nGreat Hearts Gala\nGreater Good Fund\nWays to Give\nGreat Hearts\nGreat Hearts is a non-profit network of public charter schools dedicated to improving education nationwide through classical preparatory K-12 academies. A Great Hearts education prepares students to be more than just proficient test takers, but rather, to become great-hearted leaders capable of success throughout their higher education and professional careers. With a curriculum built upon a classical liberal arts tradition and a revolutionary approach to school itself, Great Hearts cultivates the hearts and minds of students in the pursuit of Truth, Goodness and Beauty.\nYour browser does not support iframes.\nPlay video\nBecome a Part of Our Community\n      Great Hearts operates 28 charter schools throughout Arizona and Texas, with more schools opening soon. Explore our academies or apply for enrollment.    \nYour Nearest Academy\nSearch Schools\nTour an Academy\nSign Up\nEnroll at Great Hearts\nApply Now\nUpcomin