### General Set Up

In [1]:
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import requests


In [13]:
## Creating XML Tree Object ##

# NOTE(review): TARGET is not referenced by any cell visible in this notebook.
TARGET = 'servicestatus.json'
SOURCE = 'http://web.mta.info/status/serviceStatus.txt'

# Fetch the service-status feed. The timeout stops the cell from hanging
# indefinitely when the server does not answer, and raise_for_status() stops
# an HTTP error page from being handed to the XML parser below.
response = requests.get(SOURCE, timeout=30)
response.raise_for_status()
xml_string = response.text

# ET.fromstring converts a string of XML content to an XML tree object.
root = ET.fromstring(xml_string)


In [None]:
# Save XML for later use.
# The context manager closes the file even if the write raises.
with open('ServStatuses/ServStat .txt', 'w') as save_xml:
    save_xml.write(xml_string)

In [None]:
# Open saved XML as current service status.
# The context manager closes the file handle once its contents are read.
with open('ServStatuses/ServStat SC+Delays.txt', 'r') as xml_saved:
    xml_saved_str = xml_saved.read()
root_saved = ET.fromstring(xml_saved_str)


### Function to take the XML root and return the full list of MTA lines present in the XML.
(The list should always be the same.)

In [3]:
def getFullLineList(xmlroot):
    """Return (index, line name) pairs for every line entry in the status XML.

    xmlroot is indexed as xmlroot[2][i][0]: the third child of the root holds
    the line entries and the first child of each entry holds the line name.

    Returns a list of (position, name) tuples in document order; the list is
    empty when xmlroot[2] has no children.
    """
    # Read from the xmlroot argument rather than the notebook-level `root`,
    # so the function also works for other parsed feeds (e.g. a saved copy).
    return [(position, line[0].text) for position, line in enumerate(xmlroot[2])]

# Build the line list from `root` and show it as the cell output.
FullLineList = getFullLineList(root)
FullLineList

[(0, '123'),
 (1, '456'),
 (2, '7'),
 (3, 'ACE'),
 (4, 'BDFM'),
 (5, 'G'),
 (6, 'JZ'),
 (7, 'L'),
 (8, 'NQR'),
 (9, 'S'),
 (10, 'SIR')]

### Function to take the service status XML string and return the date & time of the service status.
 

In [4]:

def get_ServStatus_timestamp(xml_string):
    """Return the first child of the first <timestamp> tag in the status XML.

    xml_string must be the XML from the MTA Service Status site in string
    format. It is parsed with BeautifulSoup's 'lxml' parser.
    """
    soup = BeautifulSoup(xml_string, 'lxml')
    first_timestamp = soup.find_all('timestamp')[0]
    return first_timestamp.contents[0]


# Quick check: either xml_string (fetched feed) or xml_saved_str (saved copy)
# can be passed here.
get_ServStatus_timestamp(xml_string)

'12/9/2017 3:02:28 PM'

### Function to take the service status XML object and return lines with service changes as a list of tuples

In [14]:
  
def Return_SC_Lines(xmlroot):
    """List every line whose status is anything other than 'GOOD SERVICE'.

    xmlroot must be an xml object of the MTA Service Status page built with
    the xml ElementTree package. Entries are read from xmlroot[2]; child 0 of
    an entry is the line name and child 1 is its service status.

    Returns a list of (line name, service status) tuples.
    """
    line_entries = xmlroot[2]
    return [
        (line_entries[idx][0].text, line_entries[idx][1].text)
        for idx in range(len(line_entries))
        if line_entries[idx][1].text != 'GOOD SERVICE'
    ]
    
# Collect the lines of `root` that are not in good service and show them as
# the cell output.
SC_lines = Return_SC_Lines(root)
SC_lines

[('123', 'PLANNED WORK'),
 ('ACE', 'PLANNED WORK'),
 ('BDFM', 'PLANNED WORK'),
 ('JZ', 'PLANNED WORK'),
 ('NQR', 'DELAYS'),
 ('S', 'PLANNED WORK')]

### Function to take line name and service status XML and return beautiful soup of that line's service status 

In [6]:
def MakeLine_ServiceStatusSoup(line_name,xmlroot):
    """Parse one line's status text into a BeautifulSoup object.

    line_name -- line name as it appears in the XML, e.g. 'BDFM'.
    xmlroot   -- parsed service-status XML (ElementTree root). Entries are
                 read from xmlroot[2]; child 0 of an entry is the line name
                 and child 2 holds the status text.

    Returns BeautifulSoup (lxml parser) of that line's status text. An entry
    whose status text is empty yields an empty soup.
    Raises ValueError if no entry in xmlroot[2] is named line_name.
    """
    # Look the line up in the xmlroot that was passed in rather than in the
    # notebook-level FullLineList, so the index always matches this document.
    # As before, the last matching entry wins.
    line_entry = None
    for entry in xmlroot[2]:
        if entry[0].text == line_name:
            line_entry = entry
    if line_entry is None:
        raise ValueError('line %r not found in service status XML' % (line_name,))

    # .text is None for an empty element; BeautifulSoup needs a string.
    status_str = line_entry[2].text or ''
    return BeautifulSoup(status_str, 'lxml')


# bdfmsoup = MakeLine_ServiceStatusSoup('BDFM',root)
    
    
    

### Functions to take the service status XML and a line with planned work and return a description of the planned work

In [12]:
def plannedWork_Simple(line_servicestatus_soup):
    """Collect planned-work text that follows 'TitlePlannedWork' spans.

    For every <span class="TitlePlannedWork"> in the soup, the text of each
    <p> found by find_all_next() from that span is concatenated (with no
    separator) into one string.

    Returns a single-element list holding that string; the string is empty
    when no such span exists.
    """
    title_spans = line_servicestatus_soup.find_all(['span'], {'class': ['TitlePlannedWork']})
    fragments = [
        paragraph.text
        for title_span in title_spans
        for paragraph in title_span.find_all_next(['p'])
    ]
    return [''.join(fragments)]


# this works for printing summary line, and not when planned work is a span

def plannedWork_Detail(line_servicestatus_soup):
    """Return the text of every <a class="plannedWorkDetailLink"> in the soup.

    Returns a list with one string per matching link, in the order find_all()
    yields them; the list is empty when there are no such links.
    """
    detail_links = line_servicestatus_soup.find_all(['a'], {'class': ['plannedWorkDetailLink']})
    return [link.text for link in detail_links]
        


def PlannedWorkText(line_name,xmlroot):
    """Return the planned-work description for one line.

    line_name -- line name as it appears in the XML, e.g. 'ACE'.
    xmlroot   -- parsed service-status XML (ElementTree root). Entries are
                 read from xmlroot[2]; child 0 of an entry is the line name
                 and child 1 is its service status.

    Returns [line_name, service status, list of planned-work text]. The text
    list comes from plannedWork_Detail() when the line's status contains
    'plannedWorkDetailLink' anchors, otherwise from plannedWork_Simple().
    Raises ValueError if no entry in xmlroot[2] is named line_name.
    """
    # Read the status from the xmlroot that was passed in instead of going
    # through an index taken from the notebook-level FullLineList. As before,
    # the last matching entry wins.
    line_entry = None
    for entry in xmlroot[2]:
        if entry[0].text == line_name:
            line_entry = entry
    if line_entry is None:
        raise ValueError('line %r not found in service status XML' % (line_name,))

    line_status_soup = MakeLine_ServiceStatusSoup(line_name,xmlroot)

    # Detail links, when present, carry one summary per planned-work item;
    # otherwise fall back to the plain paragraphs.
    if line_status_soup.find_all('a', {'class':'plannedWorkDetailLink'}):
        pw_text_list = plannedWork_Detail(line_status_soup)
    else:
        pw_text_list = plannedWork_Simple(line_status_soup)

    return [line_name, line_entry[1].text, pw_text_list]
    
    
# Print each planned-work item for the ACE lines. PlannedWorkText is called
# once and its text list iterated directly, instead of re-running it (and
# re-parsing the status HTML) once for len() and again for every item.
for planned_work_item in PlannedWorkText('ACE',root)[2]:
    print(planned_work_item)
 

TRACK MAINTENANCE[E] World Trade Center-bound trains skip 23 St and Spring St
COMMUNICATION IMPROVEMENTS[C] No trains between 145 St and 168 St[A] trains provide alternate service
TRACK MAINTENANCE[C] Manhattan-bound trains run express from Broadway Junction to Hoyt-Schermerhorn
TRACK MAINTENANCE[C] Euclid Av-bound trains skip 50 St, 23 St and Spring St
TRACK MAINTENANCE[C] Uptown trains run express from Canal St to 145 St
COMMUNICATION IMPROVEMENTS[A] Trains make local stops in both directions at 155 St and 163 St
COMMUNICATION IMPROVEMENTS[A] No trains between 168 St and 207 St
TRACK REPLACEMENT[A] No trains between Broad Channel and Mott Av


### Function to take the service status XML and a line name and return the text of delays on that line, if applicable

In [16]:
def delays_text(line_name,xmlroot):
    """Return the delay description for one line, or '' if there is none.

    line_name -- line name as it appears in the XML, e.g. 'NQR'.
    xmlroot   -- parsed service-status XML, passed through to
                 MakeLine_ServiceStatusSoup().

    For each <span class="TitleDelay"> in the line's status soup:
      * if the span contains <p> tags, the text of each is appended to the
        result followed by a space;
      * otherwise the result is replaced by text nodes 3 and 4 of the whole
        status soup, joined and stripped.
    """
    # The previous version also looked line_name up in the notebook-level
    # FullLineList but never used the result; that dead lookup is removed.
    line_status_soup = MakeLine_ServiceStatusSoup(line_name,xmlroot)
    del_text = ''

    for delay_span in line_status_soup.find_all('span', {'class': 'TitleDelay'}):
        # Check whether the delay description is contained in <p> tags.
        delay_paragraphs = delay_span.find_all('p')

        if delay_paragraphs:
            for paragraph in delay_paragraphs:
                del_text += paragraph.text + ' '
        else:
            # NOTE(review): the [3:5] slice presumably matches where the delay
            # text sat in the feed markup when this was written -- confirm.
            del_text = ''.join(line_status_soup.find_all(text=True)[3:5]).strip()

    return del_text
            
            

# Show the delay text for the 'NQR' lines in `root`.
print(delays_text('NQR',root))


[N] and [R] train service has resumed following an earlier incident involving a train with mechanical problems at 8 St -NYU.


### Function to take the service status XML and a line name and return the text of the "service change" on that line, if applicable

In [None]:
def servChng_text(line_name,xmlroot):
    """Return the service-change description for one line, or '' if none.

    line_name -- line name as it appears in the XML, e.g. 'ACE'.
    xmlroot   -- parsed service-status XML, passed through to
                 MakeLine_ServiceStatusSoup().

    For each <span class="TitleServiceChange"> in the line's status soup:
      * if the span contains <p> tags, the text of each is appended to the
        result followed by a space;
      * otherwise the result is replaced by all text nodes of the whole
        status soup, joined and stripped.
    """
    line_status_soup = MakeLine_ServiceStatusSoup(line_name,xmlroot)

    # Named change_text so the local does not shadow the function's own name.
    change_text = ''

    for change_span in line_status_soup.find_all('span', {'class': 'TitleServiceChange'}):
        # Check whether the service-change description is contained in <p> tags.
        change_paragraphs = change_span.find_all('p')

        if change_paragraphs:
            # Iterate this span's own paragraphs. The previous version looped
            # over `delay_deets`, a name that is not defined in this function,
            # and raised NameError as soon as this branch ran.
            for paragraph in change_paragraphs:
                change_text += paragraph.text + ' '
        else:
            change_text = ''.join(line_status_soup.find_all(text=True)).strip()

    return change_text
            
            

# Show the service-change text for the 'ACE' lines in `root`.
print(servChng_text('ACE',root))


In [None]:
# Debug: dump the raw status text of entry 3 of root[2] (index 3 is 'ACE' in
# the line list output recorded earlier in this notebook).
print(root[2][3][2].text)

In [None]:
# Debug: dump the full fetched XML text for inspection.
print(xml_string)