In [1]:
import pandas as pd
import numpy as np

In [14]:
# Sample ages to discretize; note that 11 lies below the lowest bin edge (18) used below.
ages = [20,30,34,25,23,34,94,28,78,97,74,55,65,11]

In [15]:
# Bin edges: 5 edges define 4 intervals -- (18, 25], (25, 35], (35, 60], (60, 100].
bins = [18, 25, 35, 60, 100]

In [16]:
# Assign each age to the interval it falls in; the result exposes .codes and .categories.
cats = pd.cut(ages,bins)

In [17]:
# Integer position of the bin each age landed in; -1 marks a value outside every bin (here, 11).
cats.codes

array([ 0,  1,  1,  0,  0,  1,  3,  1,  3,  3,  3,  2,  3, -1], dtype=int8)

In [18]:
# The intervals themselves; closed='right' means each bin includes its upper edge.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [19]:
# Number of values that fell into each category (bin).
# A square bracket marks an inclusive edge; a parenthesis marks an exclusive one.
cats.value_counts()

(18, 25]     3
(25, 35]     4
(35, 60]     1
(60, 100]    5
dtype: int64

In [22]:
# Pass right=False to make the left edge inclusive and the right edge exclusive,
# e.g. [18, 25) instead of (18, 25] -- so an age of 25 moves up to the next bin.
pd.cut(ages,bins,right=False).value_counts()

[18, 25)     2
[25, 35)     5
[35, 60)     1
[60, 100)    5
dtype: int64

In [30]:
# Note: the number of labels must equal the number of bins,
# i.e. one fewer than the number of bin edges (5 edges -> 4 bins -> 4 labels).
group_names = ['young','young adult','middle aged','senior']

In [31]:
# Same binning, but each interval is reported under its name from group_names.
pd.cut(ages,bins,labels=group_names).value_counts()

young          3
young adult    4
middle aged    1
senior         5
dtype: int64

In [33]:
# 20 uniform random draws. No seed is set, so the values differ on every run.
data = np.random.rand(20)

In [34]:
# Inspect the draws.
data

array([0.82832953, 0.31069025, 0.53071546, 0.59441054, 0.94978088,
       0.05700384, 0.47587921, 0.6439479 , 0.66542692, 0.53095888,
       0.77435257, 0.20608926, 0.79142758, 0.56488209, 0.80093574,
       0.42085743, 0.00947979, 0.9218384 , 0.10216149, 0.94032377])

In [44]:
# Passing an integer instead of edges asks for that many equal-width bins over the data's range.
# precision=2 limits the decimal places used for the bin edges/labels; the data itself is not rounded.
pd.cut(data,4,precision=2).value_counts()

(0.0085, 0.24]    4
(0.24, 0.48]      3
(0.48, 0.71]      6
(0.71, 0.95]      7
dtype: int64

In [47]:
# You can also supply your own, unevenly spaced bin edges.
# These are explicit edges, not quantiles (pd.qcut is the quantile-based counterpart).
# Values below the first edge (0.2) fall into no bin and are left out of the counts.
pd.cut(data,[0.2,0.4,0.8,0.9,1]).value_counts()

(0.2, 0.4]     2
(0.4, 0.8]    10
(0.8, 0.9]     2
(0.9, 1.0]     3
dtype: int64

In [96]:
######## DETECTING AND FILTERING OUTLIERS ############

# 1000 x 4 frame of normally distributed draws. No seed is set, so values change on every run.
# Note: this rebinds `data`, which held a NumPy array in the cells above.
data = pd.DataFrame(np.random.randn(1000,4))

In [65]:
# Preview the frame (the display is truncated to the first and last rows).
data

Unnamed: 0,0,1,2,3
0,-1.889831,1.780530,-0.164444,1.079857
1,0.974172,0.167292,0.795169,2.718738
2,0.571789,0.742655,0.677651,-0.600594
3,-1.853175,1.059893,-0.507528,0.154973
4,2.127534,-0.751087,0.131482,0.685148
...,...,...,...,...
995,-1.189515,-0.624553,-1.094581,1.386863
996,-0.410146,-1.719530,-1.735341,-0.661118
997,0.962053,0.698688,-2.404061,-0.042233
998,-0.718035,0.494356,1.014700,-0.312512


In [66]:
# Per-column summary statistics; min/max show how far the extremes reach.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.009966,0.02751,-0.053475,0.051777
std,0.983791,0.986366,0.993749,1.019902
min,-2.927872,-3.747846,-3.721132,-3.636675
25%,-0.664135,-0.613643,-0.722207,-0.622534
50%,-0.026088,0.055034,-0.0592,0.057919
75%,0.624989,0.69823,0.634227,0.728117
max,2.994025,3.479657,3.001512,3.401778


In [76]:
# Goal: find the values in column 2 whose absolute value exceeds 3.
# Start by selecting the column.
data[2]

0     -0.164444
1      0.795169
2      0.677651
3     -0.507528
4      0.131482
         ...   
995   -1.094581
996   -1.735341
997   -2.404061
998    1.014700
999    0.265366
Name: 2, Length: 1000, dtype: float64

In [80]:
# Next, take absolute values and compare: True wherever |value| > 3.
data[2].abs() > 3

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: 2, Length: 1000, dtype: bool

In [85]:
# Finally, use that boolean mask to pull the offending values out of column 2.
data.loc[data[2].abs() > 3, 2]

368   -3.721132
562    3.001512
Name: 2, dtype: float64

In [86]:
# Select every row in which at least one column exceeds 3 in absolute value.
# axis is passed by keyword: pandas 2.0 made the arguments of DataFrame.any
# keyword-only, so the positional form `.any(1)` raises a TypeError there.
data[(np.abs(data)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
37,-1.70022,3.479657,0.236002,0.719425
317,-0.124081,-3.747846,-0.796916,1.180088
361,0.725134,-3.143969,0.016261,2.536841
368,-1.669058,0.661972,-3.721132,0.033625
420,-1.471452,2.788411,0.674711,-3.636675
562,-0.319202,-0.01496,3.001512,-0.852022
672,0.969873,1.094043,-0.784147,3.182565
736,0.259923,-0.769835,-0.090898,3.401778


In [97]:
# Cap the data at +/-3. np.sign yields -1, 0 or +1 per cell, so sign * 3 is the
# limit on the same side as the original value; only cells beyond the limit are overwritten.
data[data.abs() > 3] = np.sign(data) * 3

In [98]:
# Verify the cap: min and max should now lie within [-3, 3].
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.020548,-0.056363,-0.032793,-0.050622
std,0.990216,0.964763,1.020408,0.992972
min,-3.0,-3.0,-3.0,-2.630485
25%,-0.639073,-0.71022,-0.706074,-0.715732
50%,-0.014642,-0.041688,-0.029876,-0.089555
75%,0.684616,0.561885,0.626704,0.633854
max,3.0,3.0,3.0,3.0


In [106]:
# Rows with any value beyond 2.9 in absolute value: capped cells (exactly +/-3)
# show up next to values that were already just under the limit.
# axis is passed by keyword because DataFrame.any is keyword-only from pandas 2.0 on.
data[(np.abs(data)>2.9).any(axis=1)]

Unnamed: 0,0,1,2,3
34,-3.0,0.139937,0.394483,-1.506567
97,0.099491,0.310745,-3.0,-0.638927
256,3.0,0.666366,1.558274,0.606236
323,-0.614274,0.472733,-3.0,0.758644
383,0.016922,3.0,0.638276,0.790777
466,0.807011,-0.543703,2.977265,2.229398
526,0.108402,-0.337732,-2.997703,1.762807
735,-0.001347,-0.5527,0.423396,3.0
785,-0.434953,-3.0,-0.494839,-0.421786
789,3.0,-1.81618,-1.278896,0.628412


In [108]:
# Tighten the cap to +/-2 using the same sign-times-limit approach.
data[data.abs() > 2] = np.sign(data) * 2

In [109]:
# Verify the tighter cap: min and max should now lie within [-2, 2].
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.021528,-0.055768,-0.028975,-0.050819
std,0.956799,0.935689,0.976377,0.963611
min,-2.0,-2.0,-2.0,-2.0
25%,-0.639073,-0.71022,-0.706074,-0.715732
50%,-0.014642,-0.041688,-0.029876,-0.089555
75%,0.684616,0.561885,0.626704,0.633854
max,2.0,2.0,2.0,2.0


In [114]:
##### PERMUTATION AND RANDOM SAMPLING #########

# 5 x 4 frame holding the integers 0..19, filled row by row.
df = pd.DataFrame(np.arange(20).reshape(5, 4))

In [115]:
# Show the frame before shuffling.
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [122]:
# Random ordering of the row positions. The length is taken from df rather than
# hard-coded, so the permutation stays valid if the frame's row count changes.
sampler = np.random.permutation(len(df))  # e.g. array([3, 1, 2, 0, 4])

In [123]:
# Reorder the rows by position: with sampler = [3, 1, 2, 0, 4], row 3 comes first,
# then row 1, then row 2, and so on.
df.iloc[sampler]

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3
4,16,17,18,19


In [129]:
# Draw a random sample of rows WITHOUT replacement: no row can appear twice.
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
3,12,13,14,15


In [130]:
df.sample(n=3,replace=True) # WITH replacement: the same row may be drawn more than once

Unnamed: 0,0,1,2,3
3,12,13,14,15
3,12,13,14,15
4,16,17,18,19


In [138]:
##### CREATING A DUMMY VARIABLE ######

# Toy frame: a categorical 'key' column next to a running counter 'data1'.
df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})

In [139]:
# Show the frame before encoding.
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [140]:
# One indicator column per distinct key, so the a's, b's and c's become 0/1 flags.
# dtype=int keeps the flags numeric: from pandas 2.0 the default dtype is bool,
# which would display True/False instead of 1/0.
pd.get_dummies(df['key'], dtype=int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [141]:
# prefix= prepends a name to every dummy column (key_a, key_b, key_c).
# dtype=int keeps the flags as 0/1; pandas 2.0+ would otherwise return booleans.
pd.get_dummies(df['key'], prefix='key', dtype=int)

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [142]:
# Keep the prefixed indicator columns for reuse.
# dtype=int keeps the flags as 0/1; pandas 2.0+ would otherwise return booleans.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)

In [143]:
df[['data1']].join(dummies) # attach the dummy columns to data1, matching rows on the shared index

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [147]:
# Store the joined frame (data1 plus the key_* indicator columns) under its own name.
df_with_dummies = df[['data1']].join(dummies)

In [None]:
# Column names, presumably for a movies table loaded in a later cell -- TODO confirm.
mnames = ['movie_id','title','genres']