### Data Cleaning
#### Anomaly Detection

#### median

In [3]:
import pandas as pd
import numpy as np

x = pd.Series([2.1,2.3,4.5,2.2,2.4])
median = np.median(x)
threshold = 2
outliers = []

for i in x:
    if abs(i - median) > threshold:
        outliers.append(i)

outliers, median

([4.5], 2.3)

#### mean


In [13]:
std = round(np.std(x),2)
mean = x.mean()
outliers = []
for i in x:
    if (mean + std) < i or i < (mean - std):
        outliers.append(i) 

outliers

[4.5]

In [15]:
mean - std, mean + std

(1.79, 3.6100000000000003)

#### Z-score based anomaly detection

In [21]:
outliers = []
for i in x:
    z_score = abs(i - mean)/std
    if z_score > 1.5:
        outliers.append(i)
outliers  


[4.5]

#### inquartile range 

In [26]:
'''
FIRST QUARTILE is 25% of the dataset
SECOND QUARTILE is median value of the dataset
THIRD QUARTILE is 75% of the dataset
'''
Q1, Q3 = np.percentile(x,[25,75])
IQR = Q3 - Q1
outliers = []

for j, i in enumerate(x):
    if i < (Q1 - 1.5 * IQR) or i > (Q3 + 1.5 * IQR):
        outliers.append(f'{i} at place {j}')
outliers

['4.5 at place 2']

### Missing values

In [43]:
from numpy import NaN
'''
NaN = Not a number
'''
d1 = {'Name': ['Edison','Edward','James','Neesham','Stuart'],'Age': [28,27,NaN,36,27]}
df = pd.DataFrame(d1)

df.isnull().sum()

Name    0
Age     1
dtype: int64

In [33]:
df.isnull()

Unnamed: 0,Name,Age
0,False,False
1,False,False
2,False,True
3,False,False


#### dropna() : Removes rows which contain null values

In [35]:
df.dropna(inplace= False)

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
3,Neesham,36.0


#### fillna() : Fill empty cells with mean/median/mode value

In [37]:
'''
WITH MEAN VALUE
'''
df.fillna(df.mean(), inplace=True)

  df.fillna(df.mean(), inplace=True)


In [38]:
df

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,30.333333
3,Neesham,36.0


In [48]:
"""
WITH MODE VALUE
"""
df['Age'].fillna(df['Age'].mode()[0],inplace=True)
df

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edward,27.0
2,James,27.0
3,Neesham,36.0
4,Stuart,27.0


### Regular Expressions

In [55]:
import re
x = 'Python good, python cool, Python works'
re.findall(r'[Pp]ython',x)

['Python', 'python', 'Python']

In [54]:
txt = 'Python was released in 1991'
print(re.findall('\d',txt))
print(re.findall(r'\d+', txt))

['1', '9', '9', '1']
['1991']


In [60]:
'''
Gives first match
'''
re.search(r'[Pp]ython', txt)#.span()


<re.Match object; span=(0, 6), match='Python'>

#### replacing words

In [62]:
txt = 'C is my favorite language'
re.sub(pattern= 'C', repl='Python', string = txt)

'Python is my favorite language'

#### Feature Scaling

In [161]:
d = {'Age':[28,27,30,36,27], 'Salary': [10000,15000,11000,11000,13000]}
df = pd.DataFrame(d)
df

Unnamed: 0,Age,Salary
0,28,10000
1,27,15000
2,30,11000
3,36,11000
4,27,13000


In [162]:
df = (df - df.min() / (df.max() - df.min()))
df

Unnamed: 0,Age,Salary
0,25.0,9998.0
1,24.0,14998.0
2,27.0,10998.0
3,33.0,10998.0
4,24.0,12998.0


In [163]:
'''
Feature Scaling
'''
df = (df - df.mean() / df.std())
df

Unnamed: 0,Age,Salary
0,17.965818,9992.001
1,16.965818,14992.001
2,19.965818,10992.001
3,25.965818,10992.001
4,16.965818,12992.001


In [165]:
df.std()

Age          3.781534
Salary    2000.000000
dtype: float64

###### I didn't understand it