# Part 8

# Duplicates in data frames

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Small demo frame: rows 0/1 and rows 3/4 are exact duplicates of each other.
letters = ['A', 'A', 'B', 'B', 'B']  # what ['A'] * 2 + ['B'] * 3 expands to
numbers = [2, 2, 2, 3, 3]
dframe = DataFrame({'col1': letters, 'col2': numbers})
dframe

Unnamed: 0,col1,col2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [3]:
# Boolean mask, True for every row that repeats an earlier row (same idea as duplicated() in R)
dframe.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [4]:
# how many rows are repeats of an earlier row
dframe.duplicated(keep='first').sum()

2

In [9]:
# drop repeated rows, keeping the first occurrence of each (the default behaviour)
dframe.drop_duplicates(keep='first')

Unnamed: 0,col1,col2
0,A,2
2,B,2
3,B,3


In [5]:
# keep the last occurrence of each duplicated row instead of the first;
# the surviving values are the same here, only the retained index labels differ
dframe.loc[~dframe.duplicated(keep='last')]

Unnamed: 0,col1,col2
1,A,2
2,B,2
4,B,3


Note that `keep='last'` replaces `take_last=True` (the parameter used in earlier versions of pandas).

In [6]:
# treat rows as duplicates when 'col1' alone matches; keep the first row of each group
dframe.drop_duplicates(subset=['col1'], keep='first')

Unnamed: 0,col1,col2
0,A,2
2,B,2


In [12]:
# same 'col1'-only comparison, but keep the last row of each group of duplicates
dframe.drop_duplicates(subset='col1', keep='last')

Unnamed: 0,col1,col2
1,A,2
4,B,3


In [7]:
# Three-column demo frame: key1/key2 repeat across rows while key3 varies.
dframe1 = DataFrame({
    'key1': ['A', 'A', 'B', 'B', 'B'],
    'key2': [2, 2, 2, 3, 3],
    'key3': [1, 2, 3, 3, 5],
})
dframe1

Unnamed: 0,key1,key2,key3
0,A,2,1
1,A,2,2
2,B,2,3
3,B,3,3
4,B,3,5


In [8]:
# de-duplicate on the (key1, key2) pair, then renumber the rows;
# the old row labels are kept as an 'index' column
dframe1.drop_duplicates(subset=['key1', 'key2']).reset_index(drop=False)

Unnamed: 0,index,key1,key2,key3
0,0,A,2,1
1,2,B,2,3
2,3,B,3,3


# Mapping

In [12]:
# One (city, population) record per row; 'New City' is a made-up entry.
capital_rows = [
    ('Beijing', 20693000),
    ('New Delhi', 16787949),
    ('Tokyo', 13189000),
    ('Manila', 12877253),
    ('Moscow', 11541000),
    ('New City', 1000),
]
capitals = DataFrame(capital_rows, columns=['city', 'population'])
capitals

Unnamed: 0,city,population
0,Beijing,20693000
1,New Delhi,16787949
2,Tokyo,13189000
3,Manila,12877253
4,Moscow,11541000
5,New City,1000


In [13]:
# Lookup table from capital city to country. It deliberately contains cities
# that are absent from 'capitals' and has no entry for 'New City'.
country_map = dict([
    ('Beijing', 'China'),
    ('New Delhi', 'India'),
    ('Cairo', 'Egypt'),
    ('Tokyo', 'Japan'),
    ('Jakarta', 'Indonesia'),
    ('Moscow', 'Russia'),
    ('Seoul', 'South Korea'),
    ('Manila', 'Philippines'),
])

In [14]:
# Create a new column 'country' by looking each 'city' up in the dict 'country_map'.
# Cities with no entry in the dict (here 'New City') get NaN.
capitals['country'] = capitals['city'].map(country_map)
capitals

Unnamed: 0,city,population,country
0,Beijing,20693000,China
1,New Delhi,16787949,India
2,Tokyo,13189000,Japan
3,Manila,12877253,Philippines
4,Moscow,11541000,Russia
5,New City,1000,


# Outliers

In [5]:
import numpy as np
import pandas as pd
from numpy.random import randn # optional - otherwise type np.random.randn()

In [16]:
# seed the global NumPy generator so the draws below are repeatable
np.random.seed(123)
np.random.randn(3)  # three standard-normal samples

array([-1.0856306 ,  0.99734545,  0.2829785 ])

In [18]:
# a 7 x 3 array of standard-normal samples
np.random.randn(7, 3)

array([[-0.01183049,  2.39236527,  0.41291216],
       [ 0.97873601,  2.23814334, -1.29408532],
       [-1.03878821,  1.74371223, -0.79806274],
       [ 0.02968323,  1.06931597,  0.89070639],
       [ 1.75488618,  1.49564414,  1.06939267],
       [-0.77270871,  0.79486267,  0.31427199],
       [-1.32626546,  1.41729905,  0.80723653]])

In [6]:
# Reproducible frame of standard-normal noise: 1000 rows, 4 columns labelled 0..3.
np.random.seed(12345)
noise = np.random.randn(1000, 4)
dframe = pd.DataFrame(noise)
dframe.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [20]:
# summary statistics per column; these quartiles are the ones describe() reports by default
dframe.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [21]:
# ask for custom percentiles (the median is always included in the report)
tail_percentiles = [0.01, 0.05, 0.87, 0.95, 0.99]
dframe.describe(percentiles=tail_percentiles)

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
1%,-2.253032,-2.311638,-2.241267,-2.264985
5%,-1.654448,-1.596145,-1.613868,-1.595315
50%,-0.116401,0.101143,0.002073,-0.013611
87%,1.06467,1.214509,1.170963,1.107758
95%,1.634802,1.620198,1.676643,1.653155
99%,2.300858,2.259621,2.338175,2.242728


In [22]:
# evenly spaced values 0.1, 0.2, ..., 0.9 (the stop value 1.0 is excluded)
np.arange(start=0.1, stop=1.0, step=0.1)

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [23]:
# generate the deciles instead of typing percentiles = [0.1, 0.2, 0.3, ...]
deciles = np.arange(0.1, 1.0, 0.1)
dframe.describe(percentiles=deciles)

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
10%,-1.294593,-1.244422,-1.255581,-1.252993
20%,-0.922123,-0.749115,-0.816874,-0.828466
30.0%,-0.610044,-0.459042,-0.516017,-0.521751
40%,-0.353629,-0.177329,-0.25126,-0.276209
50%,-0.116401,0.101143,0.002073,-0.013611
60%,0.178808,0.362091,0.285412,0.248915


In [7]:
# keep a handle on the first column (label 0) as 'col' and preview its first rows
col = dframe[0]
col.head(n=5)

0   -0.204708
1    1.965781
2    0.769023
3    0.274992
4   -2.001637
Name: 0, dtype: float64

In [8]:
# entries of the column that exceed 3
col.loc[col.gt(3)]

900    3.366626
Name: 0, dtype: float64

In [9]:
# entries whose magnitude exceeds 3, in either direction
col.loc[col.abs().gt(3)]

523   -3.428254
900    3.366626
Name: 0, dtype: float64

In [10]:
# Return all rows in which any column has an absolute value > 3.
# axis is passed by keyword because pandas 2.0 made the arguments of
# DataFrame.any keyword-only; axis=1 reduces across the columns of each row.
dframe[(dframe.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [11]:
# cap the upper tail: every value above 3 becomes exactly 3 (modifies dframe in place)
dframe[dframe.gt(3)] = 3

In [12]:
# cap the lower tail: every value below -3 becomes exactly -3 (modifies dframe in place)
dframe[dframe.lt(-3)] = -3

## Another Approach :

### Detecting outliers via the IQR technique and replacing the lower and higher outliers with the end points

In [13]:
# Per-column outlier fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# is treated as an outlier.
Q1, Q3 = dframe.quantile(0.25), dframe.quantile(0.75)  # lower / upper quartile of each column
IQR = Q3 - Q1                                          # interquartile range of each column
whisker = 1.5 * IQR
lower_band = Q1 - whisker  # values below this are low outliers
upper_band = Q3 + whisker  # values above this are high outliers

In [14]:
# show the lower fences, then the upper fences, one Series after the other
print(lower_band, upper_band, sep='\n')

0   -2.861773
1   -2.650026
2   -2.624774
3   -2.591853
dtype: float64
0    2.703249
1    2.838466
2    2.663491
3    2.602037
dtype: float64


In [15]:
# Replace outliers with the end points of the IQR band, column by column:
# values below lower_band are raised to it, values above upper_band are lowered to it,
# and everything inside the band is left untouched.
# axis=1 aligns the per-column thresholds (Series indexed by column label) with
# dframe's columns, so every column is processed, not just the first three.
dframe = dframe.clip(lower=lower_band, upper=upper_band, axis=1)

In [16]:
# Sanity check: no value in column 0 should remain below its lower fence, so this
# should display an empty frame. Compare against the computed bound rather than a
# rounded literal copied from printed output, which can differ in the last digits.
dframe.loc[dframe[0] < lower_band.loc[0], [0]]

Unnamed: 0,0


## End of part 8