# Data Transformation in Pandas

Hi Guys, Welcome to [Be.Analyst](https://youtube.com/@Be.Analyst) 😀
</br>
In this notebook, I'm going to show data transformation in Pandas.
</br>
Happy Learning 🐱‍🏍 

In [1]:
import pandas as pd

In [2]:
data=pd.DataFrame({"a":["one","two"]*3,
                   "b":[1,1,2,3,2,3]})
data

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,2
5,two,3


In [3]:
data.duplicated()

0    False
1    False
2    False
3    False
4     True
5     True
dtype: bool

In [4]:
data.drop_duplicates()

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3


In [5]:
data["c"]=range(6)
data

Unnamed: 0,a,b,c
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,2,4
5,two,3,5


In [6]:
data.duplicated(["a","b"],keep="last")

0    False
1    False
2     True
3     True
4    False
5    False
dtype: bool

In [7]:
df=pd.DataFrame({"names":["Tim","tom","Sam",
                          "kate","Kim"],
                "scores":[60,50,70,80,40]})
df

Unnamed: 0,names,scores
0,Tim,60
1,tom,50
2,Sam,70
3,kate,80
4,Kim,40


In [8]:
classes={"Tim":"A","Tom":"A","Sam":"B",
         "Kate":"B","Kim":"B"}

In [9]:
n=df["names"].str.capitalize()

In [10]:
df["branches"]=n.map(classes)

In [11]:
df

Unnamed: 0,names,scores,branches
0,Tim,60,A
1,tom,50,A
2,Sam,70,B
3,kate,80,B
4,Kim,40,B


In [12]:
s=pd.Series([80,70,90,60])
s

0    80
1    70
2    90
3    60
dtype: int64

In [13]:
import numpy as np

In [14]:
s.replace(70,np.nan)

0    80.0
1     NaN
2    90.0
3    60.0
dtype: float64

In [15]:
s.replace([70,60],[np.nan,0])

0    80.0
1     NaN
2    90.0
3     0.0
dtype: float64

In [16]:
s.replace({90:100,60:0})

0     80
1     70
2    100
3      0
dtype: int64

In [17]:
df=pd.DataFrame(
    np.arange(12).reshape(3,4),
    index=[0,1,2],
    columns=["tim","tom","kim","sam"])
df

Unnamed: 0,tim,tom,kim,sam
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [18]:
s=pd.Series(["one","two","three"])
df.index=df.index.map(s)

In [19]:
df

Unnamed: 0,tim,tom,kim,sam
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11


In [20]:
df.rename(index=str.title,columns=str.upper)

Unnamed: 0,TIM,TOM,KIM,SAM
One,0,1,2,3
Two,4,5,6,7
Three,8,9,10,11


In [21]:
df.rename(index={"one":"ten"},
          columns={"sam":"kate"},
          inplace=True)
df

Unnamed: 0,tim,tom,kim,kate
ten,0,1,2,3
two,4,5,6,7
three,8,9,10,11


In [22]:
sc=[30,80,40,90,60,45,95,75,55,100,65,85]

In [23]:
x=[20,40,60,80,100]

In [24]:
y=pd.cut(sc,x)
y

[(20, 40], (60, 80], (20, 40], (80, 100], (40, 60], ..., (60, 80], (40, 60], (80, 100], (60, 80], (80, 100]]
Length: 12
Categories (4, interval[int64, right]): [(20, 40] < (40, 60] < (60, 80] < (80, 100]]

In [25]:
y.codes

array([0, 2, 0, 3, 1, 1, 3, 2, 1, 3, 2, 3], dtype=int8)

In [26]:
y.categories

IntervalIndex([(20, 40], (40, 60], (60, 80], (80, 100]], dtype='interval[int64, right]')

In [27]:
pd.value_counts(y)

(80, 100]    4
(40, 60]     3
(60, 80]     3
(20, 40]     2
dtype: int64

In [28]:
y=pd.cut(sc,x,right=False)
y

[[20, 40), [80, 100), [40, 60), [80, 100), [60, 80), ..., [60.0, 80.0), [40.0, 60.0), NaN, [60.0, 80.0), [80.0, 100.0)]
Length: 12
Categories (4, interval[int64, left]): [[20, 40) < [40, 60) < [60, 80) < [80, 100)]

In [29]:
nm=["low", "medium", "high", "very high"]
pd.cut(sc,x,labels=nm)

['low', 'high', 'low', 'very high', 'medium', ..., 'high', 'medium', 'very high', 'high', 'very high']
Length: 12
Categories (4, object): ['low' < 'medium' < 'high' < 'very high']

In [30]:
pd.cut(sc,10)

[(29.93, 37.0], (79.0, 86.0], (37.0, 44.0], (86.0, 93.0], (58.0, 65.0], ..., (72.0, 79.0], (51.0, 58.0], (93.0, 100.0], (58.0, 65.0], (79.0, 86.0]]
Length: 12
Categories (10, interval[float64, right]): [(29.93, 37.0] < (37.0, 44.0] < (44.0, 51.0] < (51.0, 58.0] ... (72.0, 79.0] < (79.0, 86.0] < (86.0, 93.0] < (93.0, 100.0]]

In [31]:
data=np.random.randn(100)
c=pd.qcut(data,4)
c

[(-2.256, -0.511], (-0.511, 0.0523], (-2.256, -0.511], (-2.256, -0.511], (-2.256, -0.511], ..., (-0.511, 0.0523], (-2.256, -0.511], (-2.256, -0.511], (-2.256, -0.511], (0.0523, 0.555]]
Length: 100
Categories (4, interval[float64, right]): [(-2.256, -0.511] < (-0.511, 0.0523] < (0.0523, 0.555] < (0.555, 1.891]]

In [32]:
pd.value_counts(c)

(-2.256, -0.511]    25
(-0.511, 0.0523]    25
(0.0523, 0.555]     25
(0.555, 1.891]      25
dtype: int64

In [33]:
data=pd.DataFrame(np.random.randn(1000,4))
data.head()

Unnamed: 0,0,1,2,3
0,-0.473587,1.241556,-0.06918,0.407533
1,-0.631996,-0.190157,-1.325054,-0.628682
2,0.025751,0.667162,0.144915,-0.688506
3,0.108349,-1.301851,-0.204948,0.028579
4,1.593949,2.102462,-0.079088,-0.475979


In [34]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.010197,0.002034,0.032343,0.025281
std,0.995947,1.056542,1.005935,1.018587
min,-2.602841,-3.51012,-3.44646,-3.217023
25%,-0.666385,-0.702096,-0.647161,-0.647307
50%,0.03653,-0.017787,0.039776,-0.007274
75%,0.685446,0.697016,0.712301,0.681124
max,3.328585,4.094486,3.37241,4.00331


In [35]:
col=data[1]

In [36]:
col[np.abs(col)>3]

80    -3.237908
241    3.022772
380   -3.139942
471    3.551602
556    4.094486
676   -3.510120
796    3.095655
825   -3.044410
Name: 1, dtype: float64

In [37]:
data[(np.abs(data)>3).any(1)]

  data[(np.abs(data)>3).any(1)]


Unnamed: 0,0,1,2,3
5,3.187412,0.760422,1.71549,0.762113
80,-0.609578,-3.237908,0.817687,-0.754109
146,-0.184726,0.139011,3.22869,-0.808419
174,0.771522,0.149249,-3.361996,-0.024638
241,1.563375,3.022772,0.772969,-1.402764
298,-1.385678,-0.413411,-3.019684,0.268225
302,1.256357,1.585772,0.246167,3.091597
380,-0.035684,-3.139942,-1.211871,0.626609
471,0.519207,3.551602,0.814334,0.420783
555,-0.185041,-0.18543,-0.176591,3.093172


In [38]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,-1.0,-1.0,-1.0,-1.0
2,1.0,1.0,1.0,-1.0
3,1.0,-1.0,-1.0,1.0
4,1.0,1.0,-1.0,-1.0


In [39]:
data=pd.DataFrame(
    np.arange(12).reshape(4,3))
data

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [40]:
rw=np.random.permutation(4)
rw

array([0, 3, 1, 2])

In [41]:
data.take(rw)

Unnamed: 0,0,1,2
0,0,1,2
3,9,10,11
1,3,4,5
2,6,7,8


In [42]:
data.sample()

Unnamed: 0,0,1,2
0,0,1,2


In [43]:
data.sample(n=2)

Unnamed: 0,0,1,2
1,3,4,5
3,9,10,11


## Dummy Variable

In [44]:
data=pd.DataFrame(
    {"letter":["c","b","a","b","b","a"],
                   "number":range(6)})
data

Unnamed: 0,letter,number
0,c,0
1,b,1
2,a,2
3,b,3
4,b,4
5,a,5


In [45]:
pd.get_dummies(data["letter"])

Unnamed: 0,a,b,c
0,0,0,1
1,0,1,0
2,1,0,0
3,0,1,0
4,0,1,0
5,1,0,0


In [46]:
data=np.random.randn(10)
data

array([ 0.31550343, -1.42828778,  0.94417712, -1.56933596, -0.42112944,
        0.3511941 ,  0.66302342,  0.12369103, -1.07922926, -0.23282724])

In [47]:
pd.get_dummies(pd.cut(data,4))

Unnamed: 0,"(-1.572, -0.941]","(-0.941, -0.313]","(-0.313, 0.316]","(0.316, 0.944]"
0,0,0,1,0
1,1,0,0,0
2,0,0,0,1
3,1,0,0,0
4,0,1,0,0
5,0,0,0,1
6,0,0,0,1
7,0,0,1,0
8,1,0,0,0
9,0,0,1,0


Don't forget to follow us on [YouTube](http://youtube.com/@Be.Analyst) | [Medium](https://medium.com/@durgeshanalyst) | [Twitter](https://twitter.com/DurgeshBR?t=2LDCN4pHkZOYIo3rMXvKnw&s=09) | [GitHub](http://github.com/durgeshanalyst) | [Linkedin](https://www.linkedin.com/in/durgeshanalyst/) | [Kaggle](https://www.kaggle.com/durgeshanalyst) 😎