### Standardizing Data

In [67]:
import pandas as pd

### Scenario 1: Inconsistent phone number formats

In [68]:
# Raw roster for Scenario 1. Each record is (student_id, full_name, phone_number);
# the phone numbers are written in many different formats.
_phone_records = [
    (1, 'Harry Potter', '(123) 456-7890'),
    (2, 'Hermione Granger', '555-123-4567'),
    (3, 'Ron Weasley', '9876543210'),
    (4, 'Draco Malfoy', '123.456.7890'),
    (5, 'Luna Lovegood', '(123)-4567890'),
    (6, 'Neville Longbottom', '987-654-3210'),
    (7, 'Ginny Weasley', '123 456 7890'),
    (8, 'Fred Weasley', '1234567890'),
    (9, 'George Weasley', '+7890123456'),
    (10, 'Cho Chang', '012-345-6789'),
]

# Pivot the row-wise records into the column-oriented dict handed to pd.DataFrame.
_ids, _names, _phones = (list(column) for column in zip(*_phone_records))
data = {
    'student_id': _ids,
    'full_name': _names,
    'phone_number': _phones,
}

Bad_Numbers = pd.DataFrame(data)
Bad_Numbers

Unnamed: 0,student_id,full_name,phone_number
0,1,Harry Potter,(123) 456-7890
1,2,Hermione Granger,555-123-4567
2,3,Ron Weasley,9876543210
3,4,Draco Malfoy,123.456.7890
4,5,Luna Lovegood,(123)-4567890
5,6,Neville Longbottom,987-654-3210
6,7,Ginny Weasley,123 456 7890
7,8,Fred Weasley,1234567890
8,9,George Weasley,+7890123456
9,10,Cho Chang,012-345-6789


In [69]:
# Remove every non-digit character (\D) so each phone number is reduced to bare digits.
digits_only = Bad_Numbers['phone_number'].str.replace(r'\D', '', regex=True)
Bad_Numbers['phone_number'] = digits_only

In [70]:
# Display the frame to inspect the phone numbers after the non-digit characters were removed.
Bad_Numbers

Unnamed: 0,student_id,full_name,phone_number
0,1,Harry Potter,1234567890
1,2,Hermione Granger,5551234567
2,3,Ron Weasley,9876543210
3,4,Draco Malfoy,1234567890
4,5,Luna Lovegood,1234567890
5,6,Neville Longbottom,9876543210
6,7,Ginny Weasley,1234567890
7,8,Fred Weasley,1234567890
8,9,George Weasley,7890123456
9,10,Cho Chang,123456789


In [71]:
# Consistency check: count the rows whose phone number is not exactly
# 10 characters long. A result of 0 means every row has the expected length.
wrong_length = Bad_Numbers['phone_number'].str.len() != 10
len(Bad_Numbers[wrong_length])

0

In [72]:
# Change the format into dashes: XXX-XXX-XXXX.
#
# Only a bare 10-digit string is rewritten, which makes this cell safe to
# re-run: a value that already reads 123-456-7890 no longer matches the
# pattern and is left alone. (Slicing with .str[:3] / .str[3:6] / .str[6:]
# would insert a second set of dashes on a second run.) Values that are not
# exactly 10 digits are left unchanged instead of being sliced blindly.
Bad_Numbers['phone_number'] = Bad_Numbers['phone_number'].str.replace(
    r'^(\d{3})(\d{3})(\d{4})$', r'\1-\2-\3', regex=True
)

In [73]:
# Display the frame to inspect the phone numbers after the dashes were inserted.
Bad_Numbers

Unnamed: 0,student_id,full_name,phone_number
0,1,Harry Potter,123-456-7890
1,2,Hermione Granger,555-123-4567
2,3,Ron Weasley,987-654-3210
3,4,Draco Malfoy,123-456-7890
4,5,Luna Lovegood,123-456-7890
5,6,Neville Longbottom,987-654-3210
6,7,Ginny Weasley,123-456-7890
7,8,Fred Weasley,123-456-7890
8,9,George Weasley,789-012-3456
9,10,Cho Chang,012-345-6789


### Scenario 2: Inconsistent yes/no answers

In [74]:
# Survey answers for Scenario 2. Each record is (full name, answer); the same
# yes/no answer is spelled in several different ways.
_survey_records = [
    ('Harry Potter', 'Y'),
    ('Hermione Granger', 'Yes'),
    ('Ron Weasley', 'Yes'),
    ('Draco Malfoy', 'No'),
    ('Luna Lovegood', 'yes'),
    ('Gregory Goyle', 'Nope!'),
]

# Pivot the row-wise records into the column-oriented dict handed to pd.DataFrame.
_full_names, _answers = (list(column) for column in zip(*_survey_records))
data = {
    'Full Name': _full_names,
    'Voldemort_Bad': _answers,
}

Voldy = pd.DataFrame(data)
Voldy

Unnamed: 0,Full Name,Voldemort_Bad
0,Harry Potter,Y
1,Hermione Granger,Yes
2,Ron Weasley,Yes
3,Draco Malfoy,No
4,Luna Lovegood,yes
5,Gregory Goyle,Nope!


In [75]:
# Count the rows under each distinct spelling of the answer.
answers_grouped = Voldy.groupby('Voldemort_Bad')
answers_grouped.count()

Unnamed: 0_level_0,Full Name
Voldemort_Bad,Unnamed: 1_level_1
No,1
Nope!,1
Y,1
Yes,2
yes,1


In [76]:
# Collapse every answer that starts with Y/y to 'Yes' and every answer that
# starts with N/n to 'No'; any other value is left exactly as it is.
#
# The vectorised .str accessor is used instead of apply(lambda x: x.startswith(...))
# so that a missing (NaN) answer stays missing rather than raising
# AttributeError (a float has no .startswith).
first_letter = Voldy['Voldemort_Bad'].str[:1].str.lower()
Voldy['Voldemort_Bad'] = (
    Voldy['Voldemort_Bad']
    .mask(first_letter == 'y', 'Yes')
    .mask(first_letter == 'n', 'No')
)

### Scenario 3: Misspelled names for the same student

In [77]:
# Enrollment rows for Scenario 3. Each record is
# (first name, last name, age, social security number); the names are spelled
# inconsistently while the social security numbers repeat.
_student_records = [
    ('Harry', 'Potter', 18, '123-45-6789'),
    ('Hermione', 'Granger', 17, '987-65-4321'),
    ('Ron', 'Weasley', 18, '555-55-5555'),
    ('Hermione', 'Grangar', 17, '987-65-4321'),
    ('Ronald', 'Weasly', 18, '555-55-5555'),
    ('Harry', 'Poter', 18, '123-45-6789'),
    ('Hermion', 'Graner', 17, '987-65-4321'),
    ('Ron', 'Weaseley', 18, '555-55-5555'),
]

# Pivot the row-wise records into the column-oriented dict handed to pd.DataFrame.
_first_names, _last_names, _ages, _ssns = (
    list(column) for column in zip(*_student_records)
)
data = {
    'First Name': _first_names,
    'Last Name': _last_names,
    'Age': _ages,
    'Social Security Number': _ssns,
}

Hogwarts_Students = pd.DataFrame(data)
Hogwarts_Students

Unnamed: 0,First Name,Last Name,Age,Social Security Number
0,Harry,Potter,18,123-45-6789
1,Hermione,Granger,17,987-65-4321
2,Ron,Weasley,18,555-55-5555
3,Hermione,Grangar,17,987-65-4321
4,Ronald,Weasly,18,555-55-5555
5,Harry,Poter,18,123-45-6789
6,Hermion,Graner,17,987-65-4321
7,Ron,Weaseley,18,555-55-5555


In [78]:
# Give every row a group number based on its social security number, so rows
# sharing a number share an ID. ngroup() numbering starts at 0.
ssn_groups = Hogwarts_Students.groupby('Social Security Number')
Hogwarts_Students['Same_Student_ID2'] = ssn_groups.ngroup()

In [79]:
# Display the frame to inspect the Same_Student_ID2 column derived from the social security numbers.
Hogwarts_Students

Unnamed: 0,First Name,Last Name,Age,Social Security Number,Same_Student_ID2
0,Harry,Potter,18,123-45-6789,0
1,Hermione,Granger,17,987-65-4321,2
2,Ron,Weasley,18,555-55-5555,1
3,Hermione,Grangar,17,987-65-4321,2
4,Ronald,Weasly,18,555-55-5555,1
5,Harry,Poter,18,123-45-6789,0
6,Hermion,Graner,17,987-65-4321,2
7,Ron,Weaseley,18,555-55-5555,1


In [80]:
# Build a matching key from the first three characters of the first name
# followed by the first three characters of the last name (e.g. 'HarPot').
first_prefix = Hogwarts_Students['First Name'].str[:3]
last_prefix = Hogwarts_Students['Last Name'].str[:3]
Hogwarts_Students['Name_Substring'] = first_prefix + last_prefix

In [81]:
# Display the frame to inspect the new Name_Substring key.
Hogwarts_Students

Unnamed: 0,First Name,Last Name,Age,Social Security Number,Same_Student_ID2,Name_Substring
0,Harry,Potter,18,123-45-6789,0,HarPot
1,Hermione,Granger,17,987-65-4321,2,HerGra
2,Ron,Weasley,18,555-55-5555,1,RonWea
3,Hermione,Grangar,17,987-65-4321,2,HerGra
4,Ronald,Weasly,18,555-55-5555,1,RonWea
5,Harry,Poter,18,123-45-6789,0,HarPot
6,Hermion,Graner,17,987-65-4321,2,HerGra
7,Ron,Weaseley,18,555-55-5555,1,RonWea


In [82]:
# Number the rows by their Name_Substring key; the + 1 shifts ngroup()'s
# zero-based numbering so the IDs start at 1.
name_groups = Hogwarts_Students.groupby('Name_Substring')
Hogwarts_Students['Same_Student_ID'] = name_groups.ngroup() + 1

In [83]:
# Display the frame to compare Same_Student_ID (from the name key) with Same_Student_ID2 (from the SSN).
Hogwarts_Students

Unnamed: 0,First Name,Last Name,Age,Social Security Number,Same_Student_ID2,Name_Substring,Same_Student_ID
0,Harry,Potter,18,123-45-6789,0,HarPot,1
1,Hermione,Granger,17,987-65-4321,2,HerGra,2
2,Ron,Weasley,18,555-55-5555,1,RonWea,3
3,Hermione,Grangar,17,987-65-4321,2,HerGra,2
4,Ronald,Weasly,18,555-55-5555,1,RonWea,3
5,Harry,Poter,18,123-45-6789,0,HarPot,1
6,Hermion,Graner,17,987-65-4321,2,HerGra,2
7,Ron,Weaseley,18,555-55-5555,1,RonWea,3


### Scenario 4: Stray punctuation in names

In [86]:
# Enrollment rows for Scenario 4. Each record is (first name, last name, age);
# some names carry stray punctuation such as '?', '_', '!', '...' or quotes.
_noisy_records = [
    ('Harry', '"Potter"', 18),
    ('Hermione?', 'Granger', 17),
    ('Ron', 'Weasley', 18),
    ('Hermione', 'Granger', 17),
    ('Ron_', 'Weasley...', 18),
    ('Harry!', 'Potter', 18),
    ('Hermione', 'Granger', 17),
    ('Ron', 'Weasley', 18),
]

# Pivot the row-wise records into the column-oriented dict handed to pd.DataFrame.
_first_names, _last_names, _ages = (list(column) for column in zip(*_noisy_records))
data = {
    'First Name': _first_names,
    'Last Name': _last_names,
    'Age': _ages,
}

Hogwarts_Students2 = pd.DataFrame(data)
Hogwarts_Students2

Unnamed: 0,First Name,Last Name,Age
0,Harry,"""Potter""",18
1,Hermione?,Granger,17
2,Ron,Weasley,18
3,Hermione,Granger,17
4,Ron_,Weasley...,18
5,Harry!,Potter,18
6,Hermione,Granger,17
7,Ron,Weasley,18


In [91]:
# Cleaning data using Regex
#
# Strip every character that is not a letter from both name columns.
# [\W\d_] matches non-word characters, digits and the underscore, so only
# letters survive. Unlike the ASCII-only class [^a-zA-Z], this keeps accented
# and other non-ASCII letters (e.g. the é in "José"); on plain ASCII names the
# result is the same. Note that hyphens, apostrophes and spaces are still removed.
for name_column in ('First Name', 'Last Name'):
    Hogwarts_Students2[name_column] = Hogwarts_Students2[name_column].str.replace(
        r'[\W\d_]', '', regex=True
    )

In [92]:
# Display the frame to inspect the name columns after the regex clean-up.
Hogwarts_Students2

Unnamed: 0,First Name,Last Name,Age
0,Harry,Potter,18
1,Hermione,Granger,17
2,Ron,Weasley,18
3,Hermione,Granger,17
4,Ron,Weasley,18
5,Harry,Potter,18
6,Hermione,Granger,17
7,Ron,Weasley,18
