In [156]:
import numpy as np 
import pandas as pd 

## Removing Duplicates

In [157]:
# Small demo frame: the final row repeats the (k1, k2) pair of the row before it,
# so there is exactly one fully duplicated row to detect.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [158]:
# the dataFrame method duplicated returns a Boolean Series indicating whether each row is a duplicate (its column values are exactly equal to those in an earlier row) or not
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [159]:
# drop_duplicates returns a new DataFrame that keeps only the rows for which duplicated() is False
# (the rows flagged True as duplicates are the ones filtered out)
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [160]:
# both methods (duplicated and drop_duplicates) consider all of the columns by default

# alternatively, a subset of columns can be specified for detecting duplicates;
# add an extra column v1 first so that every row differs in at least one column
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [161]:
# to filter duplicates based on the "k1" column
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [162]:
# duplicated and drop_duplicates by default keep the first observed value combination
# passing keep="last" will keep the last one instead

data.drop_duplicates(["k1", "k2"], keep="last")

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## [ Transforming Data Using a Function or Mapping ]
For many datasets, you may wish to perform some transformation based on the values in an array, Series, or column in a DataFrame.

In [163]:
# Example data: one row per serving, with the kind of meat and its weight in ounces.
data = pd.DataFrame(
    dict(
        food=[
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    )
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [164]:
# suppose you want to add a column indicating the type of animal that each food came from.
# first, write down a mapping from each distinct meat type to its animal

meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon"
}

# the map method on a Series accepts a function or a dictionary-like object containing a mapping
# and uses it to transform each value; here it looks up every food in meat_to_animal

data["animal"] = data["food"].map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [165]:
# could also have passed a function that does all the work
def get_animal(x):
    """Return the animal that the food named ``x`` comes from, looked up in ``meat_to_animal``."""
    animal = meat_to_animal[x]
    return animal
data["food"].map(get_animal)


# using map is a convenient way to perform element-wise transformations and other data cleaning-related operations

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## [ Replacing Values ]
- filling in missing data with the `fillna` method is a special case of more general value replacement.
- as we have already seen, `map` can be used to modify a subset of values in an object.
- but `replace` provides a simpler and more flexible way to do so.

In [166]:
# Float Series containing the values -999 and -1000 that the replace examples operate on.
data = pd.Series([1, -999, 2, -999, -1000, 3], dtype="float64")
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [167]:
# the -999 values might be sentinel values for missing data
# to replace these with NA values that pandas understands, we can use replace, which produces a new Series

data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [168]:
# to replace multiple values at once, pass a list of the values to replace, followed by the substitute value
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [169]:
# to use a different replacement for each value, pass a list of substitutes
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [170]:
# the argument passed can also be a dictionary
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## [ Renaming Axis Indexes ]
- like values in a Series, axis labels can be transformed by a function or mapping of some form to produce new, differently labeled objects.
- we can also modify the axes in place without creating a new data structure

In [171]:
# 3 x 4 frame holding the integers 0..11, with place names as row labels
# and number words as column labels.
data = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [172]:
# like the Series, the axis indexes have a map method

def transform(x):
    """Return the first four characters of the label ``x``, upper-cased."""
    head = x[:4]
    return head.upper()
data.index.map(transform) # original dataframe is not changed

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [173]:
# we can assign to the index attribute, modifying the DataFrame in place
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [174]:
# if you want to create a transformed version of a dataset without modifying the original, a useful method is `rename`
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [175]:
# `rename` can be used in conjunction with a dictionary-like object, providing new values for a subset of the axis labels
data.rename(index={"OHIO" : "INDIANA"}, columns={"three" : 3})

# rename saves you from the chore of copying the DataFrame manually and assigning new values to its index and columns attributes

Unnamed: 0,one,two,3,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## [ Discretization and Binning ]

**Discretization** and **binning** are techniques used to **convert continuous values into discrete categories** or bins (i.e., ranges or groups).

They are especially useful when:
- You want to **simplify** numeric data
- You want to **group continuous values** into intervals (like age groups, income brackets, etc.)
- You want to **analyze frequency** within ranges

####  Two Main Types:

| Method | Description |
|--------|-------------|
| `pd.cut()` | Bins continuous data into **equal ranges** or **custom ranges** |
| `pd.qcut()` | Bins data into **equal-sized quantiles** (e.g., 25%, 50%) |

#### Use Cases:
- Creating age groups
- Income classification
- Normalizing continuous data into groups
- Feature engineering for machine learning


pandas.cut()

In [176]:
# Ages of a group of people in a study, which we want to sort into discrete age buckets.
age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Bucket edges for the groups 18-25, 26-35, 36-60 and 61-100; pandas.cut assigns each age to a bucket.
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(x=age, bins=bins)
age_categories

# pandas returns a special Categorical object; its repr describes the bins computed by pandas.cut.
# Each bin is identified by a special (unique to pandas) interval value holding the lower and upper limit of the bin.

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [177]:
age_categories.codes # index of the intervals (categories)

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [178]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [179]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [180]:
# pd.value_counts(age_categories)
pd.Series(age_categories).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [181]:
# to change which side is closed
pd.cut(age, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [182]:
# we can override the default interval based bin labeling by passing a list or array to the labels option
group_names = ["youth", "youngAdult", "middleAged", "senior"]
pd.cut(age, bins, labels=group_names)

['youth', 'youth', 'youth', 'youngAdult', 'youth', ..., 'youngAdult', 'senior', 'middleAged', 'middleAged', 'youngAdult']
Length: 12
Categories (4, object): ['youth' < 'youngAdult' < 'middleAged' < 'senior']

In [183]:
# passing an integer to cut() auto-creates that many equal-width bins
# it calculates the range from your data

# seed NumPy's global random generator so that the random draws (and therefore the bins)
# are the same every time the notebook is run from a fresh kernel
np.random.seed(12345)
data = np.random.uniform(size=20)
print(data)
pd.cut(data, 4, precision=2)

# 20 random numbers are drawn between 0 and 1
# NOW

# 4 bins will be created.
# HOW?
# range = max_value - min_value
# 4 bins -> each bin width = range / 4

# precision just controls how tidy the bin edges look

# this is helpful when you want to group continuous data into intervals quickly without manually setting the bin edges

[0.34986466 0.99624829 0.4675396  0.01780349 0.82059702 0.96119254
 0.936564   0.44182709 0.69766545 0.91505564 0.60507726 0.06415149
 0.51663325 0.96936042 0.25156827 0.4124847  0.96764158 0.32974571
 0.51815293 0.01482479]


[(0.26, 0.51], (0.75, 1.0], (0.26, 0.51], (0.014, 0.26], (0.75, 1.0], ..., (0.26, 0.51], (0.75, 1.0], (0.26, 0.51], (0.51, 0.75], (0.014, 0.26]]
Length: 20
Categories (4, interval[float64, right]): [(0.014, 0.26] < (0.26, 0.51] < (0.51, 0.75] < (0.75, 1.0]]

pandas.qcut()

In [184]:
# pd.qcut() bins the data based on the sample quantiles, we will obtain roughly equally sized bins
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(-0.65, -0.057], (-0.65, -0.057], (0.65, 3.14], (-3.21, -0.65], (-3.21, -0.65], ..., (-0.057, 0.65], (0.65, 3.14], (-0.057, 0.65], (-3.21, -0.65], (-0.65, -0.057]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.21, -0.65] < (-0.65, -0.057] < (-0.057, 0.65] < (0.65, 3.14]]

In [185]:
# checking the size of each bin
pd.Series(quartiles).value_counts()



# these discretization functions are especially useful for quantile and group analysis

(-3.21, -0.65]     250
(-0.65, -0.057]    250
(-0.057, 0.65]     250
(0.65, 3.14]       250
Name: count, dtype: int64

## [ Detecting and Filtering Outliers ]
filtering or transforming outliers is largely a matter of applying array operations. 

In [186]:
# dataFrame with some normally distributed data
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
print(data)
data.describe()

            0         1         2         3
0   -0.366680 -0.079901 -0.579098 -0.833635
1    0.751502  0.433385 -0.897914 -0.269655
2    0.587647  1.538470 -0.158245 -0.477226
3    0.758797  0.726703 -0.655152  2.054585
4    1.626471  0.472248 -1.121916  2.263416
..        ...       ...       ...       ...
995 -0.358556 -0.324744 -1.575339  0.753349
996  1.930921  1.693801 -0.587436  1.005333
997  0.922524 -1.397946  0.788041  0.209193
998 -0.543092 -2.263215  1.222638 -1.054894
999 -0.549260 -0.571303  0.108299 -0.840642

[1000 rows x 4 columns]


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.025779,-0.002596,-0.025344,0.026487
std,0.984716,1.031646,0.989751,0.959183
min,-3.202405,-2.972267,-2.882396,-3.409465
25%,-0.691551,-0.685292,-0.75062,-0.572921
50%,-0.039555,0.052189,-0.053131,0.00811
75%,0.640297,0.700712,0.705996,0.677923
max,3.487545,3.314429,3.368643,2.799059


In [187]:
# suppose you wanted to find values in one of the columns exceeding 3 in absolute value
col = data[2]
col[col.abs() > 3]

900    3.368643
Name: 2, dtype: float64

In [188]:
# to select all rows having at least one value greater than 3 or less than -3, use the `any` method on a Boolean DataFrame
data[(data.abs() > 3).any(axis="columns")]  # True for a row if any of its columns exceeds the threshold

Unnamed: 0,0,1,2,3
251,0.130381,-0.439496,0.360067,-3.409465
347,3.487545,0.147017,1.778852,-1.72086
646,-3.202405,-2.075126,0.816922,-0.266297
752,0.824879,3.162424,-0.521853,-1.159568
826,-0.161952,3.314429,-0.022731,0.937177
900,-0.384625,0.263058,3.368643,1.310124
903,0.754945,3.06532,-1.257313,0.761659
935,0.186696,3.115554,-0.018762,-0.932625


In [193]:
# how to cap values outside the interval -3 to 3:
# every entry whose absolute value exceeds 3 is overwritten in place with 3 carrying the entry's original sign
data[data.abs() > 3] = np.sign(data) * 3
data.describe()

# i.e. this clips (caps) the values in the DataFrame to the range [-3, 3]
# NOTE(review): DataFrame.clip(-3, 3) looks like the built-in equivalent — confirm before switching

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.026064,-0.003254,-0.025713,0.026897
std,0.98246,1.029658,0.988553,0.957801
min,-3.0,-2.972267,-2.882396,-3.0
25%,-0.691551,-0.685292,-0.75062,-0.572921
50%,-0.039555,0.052189,-0.053131,0.00811
75%,0.640297,0.700712,0.705996,0.677923
max,3.0,3.0,3.0,2.799059


In [190]:
# np.sign() returns 1, -1 or 0 depending on whether each value in data is positive, negative or zero
np.sign(data)

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,-1.0,-1.0
1,1.0,1.0,-1.0,-1.0
2,1.0,1.0,-1.0,-1.0
3,1.0,1.0,-1.0,1.0
4,1.0,1.0,-1.0,1.0
...,...,...,...,...
995,-1.0,-1.0,-1.0,1.0
996,1.0,1.0,-1.0,1.0
997,1.0,-1.0,1.0,1.0
998,-1.0,-1.0,1.0,-1.0


## [ Permutation and Random Sampling ]
permutation means randomly rearranging the order of elements (e.g., rows of a DataFrame)

In [196]:
# Calling permutation with the length of the axis to permute yields an array of integers
# that describes the new ordering.

# np.random.permutation(n):
    # takes the integers 0 .. n-1
    # shuffles (permutes) them randomly
    # returns the shuffled integers as an array of indices

# That array can then be used to shuffle the rows of a DataFrame or the elements of a NumPy array.

df = pd.DataFrame(np.arange(35).reshape(5, 7))
print(df)

sampler = np.random.permutation(len(df))
print(sampler)

    0   1   2   3   4   5   6
0   0   1   2   3   4   5   6
1   7   8   9  10  11  12  13
2  14  15  16  17  18  19  20
3  21  22  23  24  25  26  27
4  28  29  30  31  32  33  34
[0 2 1 4 3]


In [197]:
# now use that ordering with either:
    # .iloc[] for positional indexing
    # .take(), which likewise selects rows by integer position

df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27


In [198]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27


In [200]:
# by invoking take with axis="columns", we could also select a permutation of the columns
column_sampler = np.random.permutation(7)
print(column_sampler)

df.take(column_sampler, axis="columns")

[6 1 2 0 4 5 3]


Unnamed: 0,6,1,2,0,4,5,3
0,6,1,2,0,4,5,3
1,13,8,9,7,11,12,10
2,20,15,16,14,18,19,17
3,27,22,23,21,25,26,24
4,34,29,30,28,32,33,31


In [201]:
# to select a random subset without replacement (the same row can't appear twice), use the sample method on Series and DataFrame
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27


In [204]:
# to generate a sample with replacement (to allow repeat choices), pass replace=True to sample
df.sample(n=13, replace=True)

Unnamed: 0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
3,21,22,23,24,25,26,27
2,14,15,16,17,18,19,20
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
4,28,29,30,31,32,33,34
0,0,1,2,3,4,5,6


## [ Computing Indicator/Dummy Variables ]
- Dummy variables are binary (0 or 1) variables created to represent categorical data numerically
- They're useful when you want to use categorical data in machine learning or statistical models, which typically only accept numbers
- **Why Use Them?** - Because models like linear regression, decision trees, etc., can’t directly handle strings or categories like "red", "blue", "green", so we convert these into separate binary columns


In [206]:
# A column with k distinct values can be expanded into a matrix or DataFrame of k columns
# made up entirely of 1s and 0s; pandas provides pd.get_dummies for this.

# Example frame: "key" has three distinct categories (a, b, c).
df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [210]:
pd.get_dummies(df['key'], dtype=int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [212]:
# to add a prefix to the columns in the indicator DataFrame which can be merged with other data, use prefix argument
dummies = pd.get_dummies(df["key"], prefix='key', dtype=int)
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [213]:
# merging
df_with_dummies = df.join(dummies)
df_with_dummies

Unnamed: 0,key,data1,key_a,key_b,key_c
0,b,0,0,1,0
1,b,1,0,1,0
2,a,2,1,0,0
3,c,3,0,0,1
4,a,4,1,0,0
5,b,5,0,1,0


In [217]:
# if a row in DataFrame belongs to multiple categories, we have to use a different approach to create the dummy variables

