In [31]:
def url_06(text,part):
    """
    URLs should not begin with a number

    Looks at the article slug (the last non-empty path segment of the URL)
    and fails the rule when the slug's first hyphen-separated word consists
    only of digits.

    Inputs needed: url

    Args:
        text: mapping holding the article fields.
        part: key in ``text`` under which the URL string is stored.

    Returns:
        dict with 'ruleCode', 'ruleResult' ('PASS' or 'FAIL') and 'resultDesc'.
    """
    req = text[part]
    res = { 'ruleCode': 'Url-06', 'ruleResult': 'PASS', 'resultDesc': '' }

    # Strip trailing slashes first so the slug is found whether or not the
    # URL ends with "/" (and so a slash-less string does not raise IndexError).
    slug = req.rstrip("/").split("/")[-1]
    first_word = slug.split("-")[0]
    if first_word.isdigit():
        res['ruleResult'] = 'FAIL'
        res['resultDesc'] = 'URLs should not begin with a number'

    return res

# Demo: the slug's first word is the number "50", so Url-06 reports FAIL.
url_06({'url': 'rappler.com/philippines/50-there-are-only-5-words-here/'},'url')

{'ruleCode': 'Url-06',
 'ruleResult': 'FAIL',
 'resultDesc': 'URLs should not begin with a number'}

In [29]:
"5a".isdigit()

False

In [26]:
def url_04(text,part):
    """
    URLs should not have punctuations

    Looks at the article slug (the last non-empty path segment of the URL)
    and fails the rule when any hyphen-separated word in it is not purely
    alphanumeric. An empty word (e.g. from a doubled hyphen) also fails.

    Inputs needed: url

    Args:
        text: mapping holding the article fields.
        part: key in ``text`` under which the URL string is stored.

    Returns:
        dict with 'ruleCode', 'ruleResult' ('PASS' or 'FAIL') and 'resultDesc'.
    """
    req = text[part]
    res = { 'ruleCode': 'Url-04', 'ruleResult': 'PASS', 'resultDesc': '' }

    # Strip trailing slashes first so the slug is found whether or not the
    # URL ends with "/" (and so a slash-less string does not raise IndexError).
    slug = req.rstrip("/").split("/")[-1]
    if not all(word.isalnum() for word in slug.split("-")):
        res['ruleResult'] = 'FAIL'
        res['resultDesc'] = 'URLs should not have punctuations'

    return res

# Demo: every hyphen-separated word of this slug is alphanumeric, so Url-04 reports PASS.
url_04({'url': 'rappler.com/philippines/there-are-only-5-words-here/'},'url')

{'ruleCode': 'Url-04', 'ruleResult': 'PASS', 'resultDesc': ''}

In [24]:
"5rt".isalnum()

True

In [14]:
def url_01(text,part):
    """
    URLs should have 11 words max

    Counts the hyphen-separated words in the article slug (the last
    non-empty path segment of the URL) and fails the rule when there are
    more than 11.

    Inputs needed: url

    Args:
        text: mapping holding the article fields.
        part: key in ``text`` under which the URL string is stored.

    Returns:
        dict with 'ruleCode', 'ruleResult' ('PASS' or 'FAIL') and 'resultDesc'.
    """
    req = text[part]
    res = { 'ruleCode': 'Url-01', 'ruleResult': 'PASS', 'resultDesc': '' }

    # Strip trailing slashes first so the slug is found whether or not the
    # URL ends with "/" (and so a slash-less string does not raise IndexError).
    slug = req.rstrip("/").split("/")[-1]
    if len(slug.split("-")) > 11:
        res['ruleResult'] = 'FAIL'
        res['resultDesc'] = 'URLs should have 11 words max'

    return res

# Demo: the slug has 6 hyphen-separated words, within the 11-word limit, so Url-01 reports PASS.
url_01({'url': 'rappler.com/philippines/there-are-only-5-words-here/'},'url')

{'ruleCode': 'Url-01', 'ruleResult': 'PASS', 'resultDesc': ''}

In [10]:
def tagline_01(text,part):
    """
    Extract the tagline that closes an article body.

    Rule this supports: taglines follow a specified format
    ( "Name/Rappler.com" OR "with reports from Name/Rappler.com")

    Inputs needed: body

    Returns the text that follows the last em dash, minus its first
    character, or an empty string when the body contains no em dash.
    """
    body = text[part]
    # NOTE: the rule record is filled in but not returned; the function
    # currently hands back the extracted tagline string.
    outcome = { 'ruleCode': 'Tagline-01', 'ruleResult': 'PASS', 'resultDesc': '' }

    if "—" not in body:
        # no em dash means no tagline to extract
        outcome['resultDesc'] = 'There was no tagline detected'
        return ""

    _, _, tail = body.rpartition("—")
    # drop the single character right after the dash
    return tail[1:]

# Demo: returns the text after the last em dash of the body.
tagline_01({'body': "Metro Manila, Philippines — Lorem ipsum — John Doe/Rappler.com"},'body')    

'John Doe/Rappler.com'

In [7]:
def dateline_03(text,part):
    """
    Metro Manila dateline should use MANILA, PHILIPPINES

    Treats the text before the first em dash as the dateline and fails the
    rule when that dateline mentions "metro manila" (case-insensitive).
    A body with no em dash has no dateline and passes.

    Inputs needed: body

    Args:
        text: mapping holding the article fields.
        part: key in ``text`` under which the body string is stored.

    Returns:
        dict with 'ruleCode', 'ruleResult' ('PASS' or 'FAIL') and 'resultDesc'.
    """
    req = text[part]
    # Default to PASS so a dateline that does not mention Metro Manila
    # yields an explicit result instead of an empty one.
    res = { 'ruleCode': 'Dateline-03', 'ruleResult': 'PASS', 'resultDesc': '' }

    if "—" in req:
        # strip() rather than dropping one character: the dateline may or
        # may not be separated from the dash by whitespace
        dateline = req.split("—")[0].strip()
        if "metro manila" in dateline.lower():
            res['ruleResult'] = 'FAIL'
            res['resultDesc'] = 'Metro Manila dateline should use MANILA, PHILIPPINES'

    return res

# Demo: the dateline before the em dash contains "Metro Manila", so Dateline-03 reports FAIL.
dateline_03({'body': "Metro Manila, Philippines — Lorem ipsum"},'body')    

{'ruleCode': 'Dateline-03',
 'ruleResult': 'FAIL',
 'resultDesc': 'Metro Manila dateline should use MANILA, PHILIPPINES'}

In [8]:
from nltk.tag import pos_tag
def head_10(text,part):
    """
    Head-10

    Collects the word that follows every space-separated word containing a
    colon or a hyphen, runs those words through ``pos_tag``, and fails the
    rule when any of them is tagged 'NNP'.

    Inputs needed: headline

    Args:
        text: mapping holding the article fields.
        part: key in ``text`` under which the headline string is stored.

    Returns:
        dict with 'ruleCode', 'ruleResult' ('PASS' or 'FAIL') and 'resultDesc'.
    """
    req = text[part]
    res = { 'ruleCode': 'Head-10', 'ruleResult': 'PASS', 'resultDesc': '' }

    wordlist = req.split(" ")
    colons = [i for i,n in enumerate(wordlist) if ":" in n]
    hyphens = [i for i,n in enumerate(wordlist) if "-" in n]

    # A colon/hyphen in the last word has no following word; skip it
    # instead of indexing past the end of the list.
    followers = [wordlist[j+1] for j in colons+hyphens if j+1 < len(wordlist)]
    if not followers:
        # nothing to inspect, so there is no need to invoke the tagger
        return res

    # pos_tag yields (word, tag) pairs; only the tag is inspected
    if any(tag == 'NNP' for _, tag in pos_tag(followers)):
        res['ruleResult'] = 'FAIL'
        res['resultDesc'] = 'Use an en-dash and not a hyphen or double quote when indicating a source'

    return res

# Demo: the only word containing a colon is "PUBLISH):", so "Roderick" is the word handed to pos_tag.
head_10({'headline': "TEST ONLY Marcos Jr. wins presidency (DO NOT PUBLISH): Roderick"},'headline')

NN


{'ruleCode': 'Head-10', 'ruleResult': 'PASS', 'resultDesc': ''}

In [22]:
def head_11(text,part):
    """
    Head-11

    Fails the rule when the headline contains a spelled-out number word
    (one to nineteen, or a tens word) or a standalone "0". Words are
    compared case-insensitively after splitting the headline on spaces.

    Inputs needed: headline

    Args:
        text: mapping holding the article fields.
        part: key in ``text`` under which the headline string is stored.

    Returns:
        dict with 'ruleCode', 'ruleResult' ('PASS' or 'FAIL') and 'resultDesc'.
        When both problems occur, 'resultDesc' carries both messages.
    """
    req = text[part]
    res = { 'ruleCode': 'Head-11', 'ruleResult': 'PASS', 'resultDesc': '' }

    wordlist = req.lower().split(" ")
    # 'fourty' is kept so the common misspelling is still caught alongside 'forty'
    numbers = {'one','two','three','four','five','six','seven','eight','nine','ten',
               'eleven','twelve','thirteen','fourteen','fifteen','sixteen','seventeen','eighteen','nineteen',
               'twenty','thirty','forty','fourty','fifty','sixty','seventy','eighty','ninety'}

    # Collect each message once so neither finding overwrites the other
    # and repeated occurrences do not duplicate text.
    messages = []
    if any(word in numbers for word in wordlist):
        messages.append('Do not spell out numbers in the title. Use numerical digits instead.')
    if '0' in wordlist:
        messages.append('Spell out "zero" instead of using "0"')

    if messages:
        res['ruleResult'] = 'FAIL'
        res['resultDesc'] = ' '.join(messages)

    return res

# Demo: "zero" is not in the spelled-out number list and no bare "0" appears, so Head-11 reports PASS.
head_11({'headline': "TEST ONLY Marcos Jr. wins presidency (DO NOT PUBLISH): zero Roderick"},'headline')

{'ruleCode': 'Head-11', 'ruleResult': 'PASS', 'resultDesc': ''}

In [29]:
def head_12(text,part):
    """
    Head-12

    Fails the rule when the first non-empty word after a space-separated
    word containing a colon starts with a lowercase letter. Words that
    start with a digit or symbol are not flagged.

    Inputs needed: headline

    Args:
        text: mapping holding the article fields.
        part: key in ``text`` under which the headline string is stored.

    Returns:
        dict with 'ruleCode', 'ruleResult' ('PASS' or 'FAIL') and 'resultDesc'.
    """
    req = text[part]
    res = { 'ruleCode': 'Head-12', 'ruleResult': 'PASS', 'resultDesc': '' }

    wordlist = req.split(" ")
    for idx, word in enumerate(wordlist):
        if ":" not in word:
            continue
        # Skip the empty strings produced by repeated spaces; the default
        # "" covers a colon in the last word, which has no follower.
        follower = next((w for w in wordlist[idx+1:] if w), "")
        first = follower[:1]
        if first.isalpha() and first.islower():
            res['ruleResult'] = 'FAIL'
            res['resultDesc'] = 'Capitalize the next word after every colon'
            break

    return res

# Demo: the word after the colon, "roderick", starts with a lowercase letter, so Head-12 reports FAIL.
head_12({'headline': "TEST ONLY Marcos Jr. wins presidency (DO NOT PUBLISH): roderick"},'headline')

{'ruleCode': 'Head-12',
 'ruleResult': 'FAIL',
 'resultDesc': 'Capitalize the next word after every colon'}