# RegEx assignment
## Parse and validate German phone numbers and email addresses

In [1]:
# Check if running correct version

import sys
# Guard the notebook against unsupported interpreters: anything other than
# Python 3, or a Python 3 older than 3.4, aborts with ValueError; 3.4 and 3.5
# only trigger a recommendation message.
if sys.version_info[0] != 3:
    raise ValueError("You must use Python 3.")
elif sys.version_info[1] < 4:
    raise ValueError("You must use at least Python 3.4")
elif sys.version_info[1] < 6:
    print("Recommended Python Version is 3.6")

## 1.1 Phone Numbers

The German country code is always _+49_ or _0049_. If a country code is given, the digit that follows it cannot be a _0_. If the number after the country code starts with a _1_, it is a mobile number. In that case, the first three digits (including the _1_) are the area code, and the remaining digits are the subscriber number.
If it is not a mobile number, just use the remaining digits as they are (i.e. no area code detection for non-mobile numbers).

In [2]:
# Raw phone-number inputs for exercise 1.1. They mix the "+49" and "0049"
# country-code prefixes with plain leading-zero numbers, and use spaces,
# hyphens and parentheses as separators.
test_phone_numbers = [
    "+49174321324",
    "0164883423",
    "0049(0)16483311724",
    "+49-8332-3010",
    "00498513994",
    "0164 5554454",
    "0851 509",
    "0851 509-0",
    "(0851) 3394"
]

In [3]:
# Expected normalized output for each entry of test_phone_numbers, matched by
# position. Numbers starting with 1 are written "+49 <3 digits> <rest>"; all
# others are written "+49 <digits>".
gold_phone_numbers = [
    "+49 174 321324",
    "+49 164 883423",
    "+49 164 83311724",
    "+49 83323010",
    "+49 8513994",
    "+49 164 5554454",
    "+49 851509",
    "+49 8515090",
    "+49 8513394"
]

In [4]:
def validate_phone(parsed_numbers):
    """Print a CORRECT/WRONG report for each parsed phone number.

    parsed_numbers is walked in step with the module-level lists
    test_phone_numbers and gold_phone_numbers; iteration stops at the
    shortest of the three. Nothing is returned.
    """
    wrong_report = "WRONG   → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    correct_report = "CORRECT → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    triples = zip(parsed_numbers, test_phone_numbers, gold_phone_numbers)
    for parsed, test, gold in triples:
        report = wrong_report if parsed != gold else correct_report
        print(report.format(test, gold, parsed))

### add your code in the next cell

You should produce a list _parsed_numbers_ containing the parsed phone numbers.
Calling _validate_phone(parsed_numbers)_ should then print only CORRECT.

In [5]:
import re
import nltk

def disp(x,test_num):
    """Format a number whose country code and leading zero were already removed.

    x        -- prefix string supplied by the caller; only used in the fallback.
    test_num -- the remaining characters of the phone number.

    Returns:
      "+49 <first three chars> <rest>" if test_num starts with 1 (mobile),
      "+49 <test_num>"                 if it starts with a digit 2-9 (landline),
      x + "(0)" + test_num             otherwise.
    """
    leading = test_num[:1]
    if leading == '1':
        # Mobile number: the first three characters are the area code.
        return '+49 ' + test_num[:3] + ' ' + test_num[3:]
    if leading != '' and leading in '23456789':
        # Landline number: no area code detection.
        return '+49 ' + test_num
    return x + '(0)' + test_num

parsed_numbers = []
# Remove the separators typed between digit groups. Parentheses are kept for
# now because the patterns below use them to recognise forms such as "(0)".
test_phone_numbers_trimmed = [test_num.replace(' ', '').replace('-','') for test_num in test_phone_numbers]

for test_num in test_phone_numbers_trimmed:
    if re.search(r'^(\(*\+\)*\(*[4][9]\)*([1-9]|(\([0]\))|\()[0-9]{,11})',test_num): #RegEx to search Phone Numbers with country code +49 or +49(0) or (+49)
        x = '+49'
        # Strip the country code plus an optional "0"/"(0)", then any leftover parentheses.
        test_num = re.sub(r"^\(*\+\)*\(*[4][9]\)*\(*[0]*\)*", "", test_num).replace('(', '').replace(')', '')
        test_num = disp(x,test_num)
    elif re.search(r'^(\(*[0]\)*\(*[1-9][0-9]{,15})',test_num): #RegEx to search Phone Numbers with country code 0 or (0)
        # No country code in the input. This branch previously passed whatever
        # `x` an earlier iteration had left behind, and raised NameError when a
        # national-format number was the first one processed. Bind it here.
        x = '+49'
        test_num = re.sub(r"^\(*[0]\)*","", test_num).replace('(', '').replace(')', '')
        test_num = disp(x,test_num)
    elif re.search(r'^(\(*[0]\)*\(*[0]\)*\(*[4][9]\)*([1-9]|(\([0]\))|\()[0-9]{,15})', test_num): #RegEx to search Mobile Numbers starting with 0049 or 0049(0)
        x = '0049'
        test_num = re.sub(r"^\(*[0]\)*\(*[0]\)*\(*[4][9]\)*\(*[0]*\)*", "",test_num).replace('(', '').replace(')', '')
        test_num = disp(x,test_num)
    # Inputs matching none of the patterns are appended as they are (separators already removed).
    parsed_numbers.append(test_num)

validate_phone(parsed_numbers)

CORRECT → 
	 Input : +49174321324
	 Gold  : +49 174 321324
	 Parsed: +49 174 321324
CORRECT → 
	 Input : 0164883423
	 Gold  : +49 164 883423
	 Parsed: +49 164 883423
CORRECT → 
	 Input : 0049(0)16483311724
	 Gold  : +49 164 83311724
	 Parsed: +49 164 83311724
CORRECT → 
	 Input : +49-8332-3010
	 Gold  : +49 83323010
	 Parsed: +49 83323010
CORRECT → 
	 Input : 00498513994
	 Gold  : +49 8513994
	 Parsed: +49 8513994
CORRECT → 
	 Input : 0164 5554454
	 Gold  : +49 164 5554454
	 Parsed: +49 164 5554454
CORRECT → 
	 Input : 0851 509
	 Gold  : +49 851509
	 Parsed: +49 851509
CORRECT → 
	 Input : 0851 509-0
	 Gold  : +49 8515090
	 Parsed: +49 8515090
CORRECT → 
	 Input : (0851) 3394
	 Gold  : +49 8513394
	 Parsed: +49 8513394


## 1.2 Email Addresses

In [6]:
# Raw e-mail inputs for exercise 1.2: plain addresses plus obfuscated variants
# that spell the separators as "dot"/"at", with or without parentheses and
# surrounding spaces.
test_emails = [
    "peter.mueller@uni-passau.de",
    "peter dot mueller at uni-passau dot de",
    "peter.mueller(at)uni-passau.de",
    "peter.mueller (at) uni-passau.de",
    "nefullword@gw.uni-passau.de",
    "peter (dot) mueller (at) uni-passau (dot) de",
    "other.host@gmail.com",
    "special-chars (at) live.com"
]

In [7]:
# Expected normalized address for each entry of test_emails, matched by
# position: every "dot"/"at" spelling is replaced by "." / "@".
gold_emails = [
    "peter.mueller@uni-passau.de",
    "peter.mueller@uni-passau.de",
    "peter.mueller@uni-passau.de",
    "peter.mueller@uni-passau.de",
    "nefullword@gw.uni-passau.de",
    "peter.mueller@uni-passau.de",
    "other.host@gmail.com",
    "special-chars@live.com"
]

In [8]:
def validate_email(parsed_emails):
    """Print a CORRECT/WRONG report for each parsed e-mail address.

    parsed_emails is walked in step with the module-level lists test_emails
    and gold_emails; iteration stops at the shortest of the three. Nothing is
    returned.
    """
    wrong_report = "WRONG   → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    correct_report = "CORRECT → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    triples = zip(parsed_emails, test_emails, gold_emails)
    for parsed, test, gold in triples:
        report = wrong_report if parsed != gold else correct_report
        print(report.format(test, gold, parsed))

### add your code in the next cell

You should produce a list _parsed_emails_ containing the parsed email addresses.
Calling _validate_email(parsed_emails)_ should then print only CORRECT.

In [9]:
import re
import nltk

parsed_emails=[]

# Shape of an accepted address: local part (optionally split by one "dot"
# separator), an "at" separator, host, a "dot" separator, and a TLD. Each
# separator may be the literal character or the word, with or without
# parentheses and surrounding whitespace.
# The TLD class is [a-zA-Z]; the previous "A-z" range also admitted the
# characters [ \ ] ^ _ ` that sit between "Z" and "a" in ASCII.
reg_exp = r'^[^\.][a-zA-Z0-9-!#$%&*+-/=?^_`{|}~\']+(\.|\s+[d][o][t]\s+|\([d][o][t]\)|\s+\([d][o][t]\)\s+)?[a-zA-Z0-9-!#$%&*+-/=?^_`{|}~\']+(\@|\s+[a][t]\s+|\([a][t]\)|\s+\([a][t]\)\s+)[a-zA-Z0-9-]+[^\@]+(\.|\s+[d][o][t]\s+|\([d][o][t]\)|\s+\([d][o][t]\)\s+)[a-zA-Z]+[^\.]$'

for test in test_emails:
    if re.search(reg_exp,test):
        # Rewrite the "at" spellings first, then the "dot" spellings. The
        # patterns are raw strings so that "\s" and "\(" reach the regex
        # engine without relying on invalid string escape sequences.
        test = re.sub(r'\s+dot\s+|\(dot\)|\s+\(dot\)\s+', '.',(re.sub(r'(\s+at\s+|\(at\)|\s+\(at\)\s+)', '@',test)))
    # Inputs that do not match reg_exp are appended unchanged.
    parsed_emails.append(test)

# NOTE(review): this re-declares validate_email with the same behavior as the
# definition in the earlier cell; the later definition is the one in effect
# for the call below.
def validate_email(parsed_emails):
    """Print a CORRECT/WRONG report for each parsed e-mail address.

    parsed_emails is walked in step with the module-level lists test_emails
    and gold_emails; iteration stops at the shortest of the three. Nothing is
    returned.
    """
    wrong_report = "WRONG   → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    correct_report = "CORRECT → \n\t Input : {}\n\t Gold  : {}\n\t Parsed: {}"
    triples = zip(parsed_emails, test_emails, gold_emails)
    for parsed, test, gold in triples:
        report = wrong_report if parsed != gold else correct_report
        print(report.format(test, gold, parsed))

validate_email(parsed_emails)

CORRECT → 
	 Input : peter.mueller@uni-passau.de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : peter dot mueller at uni-passau dot de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : peter.mueller(at)uni-passau.de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : peter.mueller (at) uni-passau.de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : nefullword@gw.uni-passau.de
	 Gold  : nefullword@gw.uni-passau.de
	 Parsed: nefullword@gw.uni-passau.de
CORRECT → 
	 Input : peter (dot) mueller (at) uni-passau (dot) de
	 Gold  : peter.mueller@uni-passau.de
	 Parsed: peter.mueller@uni-passau.de
CORRECT → 
	 Input : other.host@gmail.com
	 Gold  : other.host@gmail.com
	 Parsed: other.host@gmail.com
CORRECT → 
	 Input : special-chars (at) live.com
	 Gold  : special-chars@live.com
	 Parsed: special-chars@live