## Data Cleaning



In [1]:
import pandas as pd
import numpy as np

# Load the whitespace-delimited balance data. sep=r'\s+' is the documented
# replacement for the deprecated delim_whitespace=True flag.
df = pd.read_table('balance.txt', sep=r'\s+')


### Dropping Unnecessary Columns

In [2]:
# Remove the two columns that are not needed here (df is modified in place).
df.drop(columns=['Limit', 'Age'], inplace=True)

In [3]:
# Preview the first rows of the frame after the column drop.
df.head()

Unnamed: 0,Balance,Income,Rating,Cards,Education,Gender,Student,Married,Ethnicity
0,12.240798,14.891,283,2,11,Male,No,Yes,Caucasian
1,23.283334,106.025,483,3,15,Female,Yes,Yes,Asian
2,22.530409,104.593,514,4,11,Male,No,No,Asian
3,27.652811,148.924,681,3,11,Female,No,No,Asian
4,16.893978,55.882,357,2,16,Male,No,Yes,Caucasian


### Replace values

Sometimes you would like to replace a value in your data set with another value. For example, suppose the data has a categorical column such as 'Ethnicity' and we want to rename one category, say 'African American', to 'African'.

In [4]:
# replace() hands back a new frame and the result is not assigned, so df keeps
# its original labels; only the first ten rows of the relabelled copy are shown.
df.replace(to_replace='African American', value='African').head(10)

Unnamed: 0,Balance,Income,Rating,Cards,Education,Gender,Student,Married,Ethnicity
0,12.240798,14.891,283,2,11,Male,No,Yes,Caucasian
1,23.283334,106.025,483,3,15,Female,Yes,Yes,Asian
2,22.530409,104.593,514,4,11,Male,No,No,Asian
3,27.652811,148.924,681,3,11,Female,No,No,Asian
4,16.893978,55.882,357,2,16,Male,No,Yes,Caucasian
5,22.486178,80.18,569,4,10,Male,No,No,Caucasian
6,10.574516,20.996,259,2,12,Female,No,No,African
7,14.576204,71.408,512,2,9,Male,No,No,Asian
8,7.93809,15.125,266,5,13,Female,No,No,Caucasian
9,17.756965,71.061,491,3,19,Female,Yes,Yes,African


### Grouping Data

In [5]:
# Partition the rows by their 'Ethnicity' value.
grouped = df.groupby('Ethnicity')
# Pull out the 'Asian' group and preview its first rows.
grouped.get_group('Asian').head()

Unnamed: 0,Balance,Income,Rating,Cards,Education,Gender,Student,Married,Ethnicity
1,23.283334,106.025,483,3,15,Female,Yes,Yes,Asian
2,22.530409,104.593,514,4,11,Male,No,No,Asian
3,27.652811,148.924,681,3,11,Female,No,No,Asian
7,14.576204,71.408,512,2,9,Male,No,No,Asian
12,19.2188,80.616,394,1,7,Female,No,Yes,Asian


### Dealing with inconsistent data entry

In [6]:

# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

# set seed for reproducibility
np.random.seed(0)



In [7]:
# Load the attacks data set.
# NOTE(review): the 'unicode_escape' codec presumably also interprets backslash
# escape sequences found in the data - confirm the file's real encoding
# (chardet is imported above for that purpose).
suicide_attacks = pd.read_csv("Pakistan.csv", encoding = 'unicode_escape')
suicide_attacks.head()

Unnamed: 0,S#,Date,Islamic Date,Blast Day Type,Holiday Type,Time,City,Latitude,Longitude,Province,...,Targeted Sect if any,Killed Min,Killed Max,Injured Min,Injured Max,No. of Suicide Blasts,Explosive Weight (max),Hospital Names,Temperature(C),Temperature(F)
0,1,November 19-1995,25 Jumaada al-THaany 1416 A.H,Holiday,Weekend,,Islamabad,33.718,73.0718,Capital,...,,14.0,15.0,,60,2.0,,,15.835,60.503
1,2,November 6-2000,10 SHa`baan 1421 A.H,Working Day,,,Karachi,24.9918,66.9911,Sindh,...,,,3.0,,3,1.0,,,23.77,74.786
2,3,May 8-2002,25 safar 1423 A.H,Working Day,,7:45 AM,Karachi,24.9918,66.9911,Sindh,...,Christian,13.0,15.0,20.0,40,1.0,2.5 Kg,1.Jinnah Postgraduate Medical Center 2. Civil ...,31.46,88.628
3,4,June 14-2002,3 Raby` al-THaany 1423 A.H,Working Day,,11:10:00 AM,Karachi,24.9918,66.9911,Sindh,...,Christian,,12.0,,51,1.0,,,31.43,88.574
4,5,July 4-2003,4 Jumaada al-awal 1424 A.H,Working Day,,,Quetta,30.2095,67.0182,Baluchistan,...,Shiite,44.0,47.0,,65,1.0,,1.CMH Quetta \n2.Civil Hospital 3. Boland Medi...,33.12,91.616


#### Text pre-processing

In [8]:
# collect the distinct values of the 'City' column
cities = suicide_attacks['City'].unique()
# sort them alphabetically (in place) and then take a closer look
cities.sort()
cities

array(['ATTOCK', 'Attock ', 'Bajaur Agency', 'Bannu', 'Bhakkar ', 'Buner',
       'Chakwal ', 'Chaman', 'Charsadda', 'Charsadda ', 'D. I Khan',
       'D.G Khan', 'D.G Khan ', 'D.I Khan', 'D.I Khan ', 'Dara Adam Khel',
       'Dara Adam khel', 'Fateh Jang', 'Ghallanai, Mohmand Agency ',
       'Gujrat', 'Hangu', 'Haripur', 'Hayatabad', 'Islamabad',
       'Islamabad ', 'Jacobabad', 'KURRAM AGENCY', 'Karachi', 'Karachi ',
       'Karak', 'Khanewal', 'Khuzdar', 'Khyber Agency', 'Khyber Agency ',
       'Kohat', 'Kohat ', 'Kuram Agency ', 'Lahore', 'Lahore ',
       'Lakki Marwat', 'Lakki marwat', 'Lasbela', 'Lower Dir', 'MULTAN',
       'Malakand ', 'Mansehra', 'Mardan', 'Mohmand Agency',
       'Mohmand Agency ', 'Mohmand agency', 'Mosal Kor, Mohmand Agency',
       'Multan', 'Muzaffarabad', 'North Waziristan', 'North waziristan',
       'Nowshehra', 'Orakzai Agency', 'Peshawar', 'Peshawar ', 'Pishin',
       'Poonch', 'Quetta', 'Quetta ', 'Rawalpindi', 'Sargodha',
       'Sehwan town',

Fixing some inconsistency issues

In [9]:
# normalise the case, then trim leading and trailing whitespace
suicide_attacks['City'] = suicide_attacks['City'].str.lower().str.strip()

# list the distinct city names again, sorted alphabetically, to review the result
cities = suicide_attacks['City'].unique()
cities.sort()
cities

array(['attock', 'bajaur agency', 'bannu', 'bhakkar', 'buner', 'chakwal',
       'chaman', 'charsadda', 'd. i khan', 'd.g khan', 'd.i khan',
       'dara adam khel', 'fateh jang', 'ghallanai, mohmand agency',
       'gujrat', 'hangu', 'haripur', 'hayatabad', 'islamabad',
       'jacobabad', 'karachi', 'karak', 'khanewal', 'khuzdar',
       'khyber agency', 'kohat', 'kuram agency', 'kurram agency',
       'lahore', 'lakki marwat', 'lasbela', 'lower dir', 'malakand',
       'mansehra', 'mardan', 'mohmand agency',
       'mosal kor, mohmand agency', 'multan', 'muzaffarabad',
       'north waziristan', 'nowshehra', 'orakzai agency', 'peshawar',
       'pishin', 'poonch', 'quetta', 'rawalpindi', 'sargodha',
       'sehwan town', 'shabqadar-charsadda', 'shangla', 'shikarpur',
       'sialkot', 'south waziristan', 'sudhanoti', 'sukkur', 'swabi',
       'swat', 'taftan', 'tangi, charsadda district', 'tank', 'taunsa',
       'tirah valley', 'totalai', 'upper dir', 'wagah', 'zhob'],
      dtype=

Fuzzy matching to find text strings that are similar to each other

In [10]:
# score every city name against "d.i khan" and keep the 10 best candidates
matches = fuzzywuzzy.process.extract(
    "d.i khan",
    cities,
    limit=10,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio,
)

# inspect the (name, score) pairs
matches

[('d. i khan', 100),
 ('d.i khan', 100),
 ('d.g khan', 88),
 ('khanewal', 50),
 ('sudhanoti', 47),
 ('hangu', 46),
 ('kohat', 46),
 ('dara adam khel', 45),
 ('chaman', 43),
 ('mardan', 43)]

In [11]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 90, limit = 10):
    """Overwrite near-duplicate spellings in ``df[column]`` with ``string_to_match``.

    The distinct, non-missing values of the column are fuzzy-scored against
    ``string_to_match`` using ``fuzzywuzzy.fuzz.token_sort_ratio``. Every row
    whose value scores at least ``min_ratio`` is set to ``string_to_match``.
    ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; mutated in place.
    column : str
        Name of the column holding the text values.
    string_to_match : str
        Canonical spelling that replaces every close match.
    min_ratio : int, default 90
        Minimum score (inclusive) for a value to count as a match.
    limit : int, default 10
        Forwarded to ``fuzzywuzzy.process.extract``; caps how many of the
        best-scoring candidates are considered.

    Returns
    -------
    None
        Prints "All done!" when finished.
    """
    # distinct candidate strings; missing values are excluded so they are never scored
    candidates = df[column].dropna().unique()

    # best-scoring candidates as (value, score) pairs
    scored = fuzzywuzzy.process.extract(string_to_match, candidates,
                                        limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # keep only the values whose score is >= min_ratio
    close_matches = [pair[0] for pair in scored if pair[1] >= min_ratio]

    # boolean mask of the rows holding one of the close matches
    rows_with_matches = df[column].isin(close_matches)

    # overwrite those rows with the canonical spelling
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")

In [12]:
# Collapse the spelling variants of "d.i khan" into one canonical value.
# The default min_ratio of 90 applies, so "d.g khan" (scored 88 above) is left untouched.
replace_matches_in_column(df=suicide_attacks, column='City', string_to_match="d.i khan")

All done!


Check the unique values in our City column again to make sure we've cleaned up 'd.i khan' correctly.



In [13]:
# get all the unique values in the 'City' column after the fuzzy replacement
cities = suicide_attacks['City'].unique()

# sort them alphabetically (in place) and then take a closer look
cities.sort()
cities


array(['attock', 'bajaur agency', 'bannu', 'bhakkar', 'buner', 'chakwal',
       'chaman', 'charsadda', 'd.g khan', 'd.i khan', 'dara adam khel',
       'fateh jang', 'ghallanai, mohmand agency', 'gujrat', 'hangu',
       'haripur', 'hayatabad', 'islamabad', 'jacobabad', 'karachi',
       'karak', 'khanewal', 'khuzdar', 'khyber agency', 'kohat',
       'kuram agency', 'kurram agency', 'lahore', 'lakki marwat',
       'lasbela', 'lower dir', 'malakand', 'mansehra', 'mardan',
       'mohmand agency', 'mosal kor, mohmand agency', 'multan',
       'muzaffarabad', 'north waziristan', 'nowshehra', 'orakzai agency',
       'peshawar', 'pishin', 'poonch', 'quetta', 'rawalpindi', 'sargodha',
       'sehwan town', 'shabqadar-charsadda', 'shangla', 'shikarpur',
       'sialkot', 'south waziristan', 'sudhanoti', 'sukkur', 'swabi',
       'swat', 'taftan', 'tangi, charsadda district', 'tank', 'taunsa',
       'tirah valley', 'totalai', 'upper dir', 'wagah', 'zhob'],
      dtype=object)

### Working with Date and Time



In [14]:
# modules we'll use
#import seaborn as sns
from datetime import date
# print the first few rows of the date column to see the raw text format
# before any parsing is applied
print(suicide_attacks['Date'].head())


0    November 19-1995
1     November 6-2000
2          May 8-2002
3        June 14-2002
4         July 4-2003
Name: Date, dtype: object


In [15]:
# check the data type of our date column; dtype('O') is the generic object
# dtype, i.e. the values have not been parsed as datetimes yet
suicide_attacks['Date'].dtype

dtype('O')

### Convert our date columns to datetime



In [16]:
# create a new column, date_parsed, with the parsed dates
# format '%B %d-%Y': full month name, space, day of month, hyphen, four-digit
# year - matching raw values such as "November 19-1995"


suicide_attacks['date_parsed'] = pd.to_datetime(suicide_attacks['Date'], format='%B %d-%Y')


# preview the parsed column
suicide_attacks['date_parsed'].head()


0   1995-11-19
1   2000-11-06
2   2002-05-08
3   2002-06-14
4   2003-07-04
Name: date_parsed, dtype: datetime64[ns]

In [17]:
# Display the frame (the rendered output is truncated) to confirm the new
# date_parsed column sits alongside the original Date column.
suicide_attacks

Unnamed: 0,S#,Date,Islamic Date,Blast Day Type,Holiday Type,Time,City,Latitude,Longitude,Province,...,Killed Min,Killed Max,Injured Min,Injured Max,No. of Suicide Blasts,Explosive Weight (max),Hospital Names,Temperature(C),Temperature(F),date_parsed
0,1,November 19-1995,25 Jumaada al-THaany 1416 A.H,Holiday,Weekend,,islamabad,33.718000,73.0718,Capital,...,14.0,15.0,,60,2.0,,,15.835,60.503,1995-11-19
1,2,November 6-2000,10 SHa`baan 1421 A.H,Working Day,,,karachi,24.991800,66.9911,Sindh,...,,3.0,,3,1.0,,,23.770,74.786,2000-11-06
2,3,May 8-2002,25 safar 1423 A.H,Working Day,,7:45 AM,karachi,24.991800,66.9911,Sindh,...,13.0,15.0,20.0,40,1.0,2.5 Kg,1.Jinnah Postgraduate Medical Center 2. Civil ...,31.460,88.628,2002-05-08
3,4,June 14-2002,3 Raby` al-THaany 1423 A.H,Working Day,,11:10:00 AM,karachi,24.991800,66.9911,Sindh,...,,12.0,,51,1.0,,,31.430,88.574,2002-06-14
4,5,July 4-2003,4 Jumaada al-awal 1424 A.H,Working Day,,,quetta,30.209500,67.0182,Baluchistan,...,44.0,47.0,,65,1.0,,1.CMH Quetta \n2.Civil Hospital 3. Boland Medi...,33.120,91.616,2003-07-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,492,October 05-2017,13 MuHarram 1439 A.H,Working Day,,,quetta,28.571051,67.496895,Balochistan,...,21.0,22.0,25.0,,1.0,,"Larkana Hospital, Gandawah District Headquarte...",27.800,81.000,2017-10-05
492,493,October 19-2017,27 MuHarram 1439 A.H,Working Day,,8:00 AM,quetta,,,Balochistan,...,7.0,7.0,22.0,22,1.0,,,26.000,79.000,2017-10-19
493,494,November 9-2017,19 Safar 1439 A.H,Working Day,,7:00 AM,quetta,30.221057,67.002524,Balochistan,...,2.0,2.0,8.0,8,1.0,15KG,"Combined Military Hospital (CMH)-(Quetta), Civ...",25.000,77.000,2017-11-09
494,495,November 24-2017,5 Rabi Al-Awwal 1439 A.H,Working Day,,7:00 AM,peshawar,33.970623,71.43862,KPK,...,1.0,3.0,6.0,8,1.0,20KG,Hayatabad Medical Complex(Peshawar),22.000,72.000,2017-11-24
