In [1]:
# Data extracted from the real world is rarely clean: it contains inconsistencies, incomplete records,
# and irrelevant or missing values that need to be cleaned before analysis.
import pandas as pd
import numpy as np


In [9]:
# Median-based anomaly detection: a value is an outlier when its absolute
# distance from the median exceeds a fixed threshold.
x = pd.Series([2.1, 2.3, 4.5, 2.2, 2.5])

median = np.median(x)
print(median)

# Largest allowed distance from the median; anything farther away is an outlier.
threshold = 2

# Walk the series once and keep only the values lying beyond the threshold.
outlier = [item for item in x if abs(item - median) > threshold]

print(outlier)  # 4.5 is the only anomaly (outlier) in this sample

2.3
[4.5]


In [15]:
# Mean-based anomaly detection: flag every value that lies more than one
# standard deviation away from the mean.
x = pd.Series([2.1, 2.3, 4.5, 2.2, 2.4])

mean = np.mean(x)
print(mean)  # arithmetic mean: 2.7
std = np.std(x)
print(std)  # standard deviation: 0.9055385138137416

# Accepted band is [mean - std, mean + std]. For example 2.1 is kept because
# 2.1 < 2.7 - 0.9055385138137416 is False.
lower_bound = mean - std
upper_bound = mean + std
outliers = [item for item in x if item < lower_bound or item > upper_bound]

outliers

2.7
0.9055385138137416


[4.5]

In [19]:
# Z-score-based anomaly detection, another technique to detect anomalies.
# The z-score is a statistical measure of how many standard deviations a value
# lies from the mean: z = (value - mean) / std.


def zscore_outliers(values, cutoff=1.5):
    """Return the items of ``values`` whose absolute z-score exceeds ``cutoff``.

    Parameters
    ----------
    values : iterable of numbers (list, pandas Series, ...)
        The observations to inspect.
    cutoff : float, default 1.5
        Largest tolerated distance from the mean, in standard deviations.
        1.5 is an arbitrary choice; larger values flag fewer points.

    Returns
    -------
    list
        The flagged items, in their original order. Empty when every value is
        identical (standard deviation of zero), because no z-score exists.
    """
    center = np.mean(values)
    spread = np.std(values)
    if spread == 0:
        # All values are equal: dividing by zero would only produce NaN/inf.
        return []
    # abs() makes the check two-sided, so unusually LOW values are flagged too,
    # not only unusually high ones.
    return [item for item in values if abs((item - center) / spread) > cutoff]


x = pd.Series([2.1, 2.3, 4.5, 2.2, 2.4])

mean = np.mean(x)
print(mean)  # arithmetic mean: 2.7
std = np.std(x)
print(std)  # standard deviation: 0.9055385138137416

# 4.5 is flagged: it lies more than 1.5 standard deviations from the mean.
outliers = zscore_outliers(x)

print(outliers)


2.7
0.9055385138137416
[4.5]


In [21]:
# Interquartile-range (IQR) anomaly detection.
# Quartiles are the three cut points that split the ascending-sorted dataset
# into four intervals; the IQR is the distance between the 3rd and 1st
# quartile: IQR = Q3 - Q1.
x = pd.Series([2.1, 2.3, 4.5, 2.2, 2.5])

Q1, Q3 = np.percentile(x, [25, 75])
IQR = Q3 - Q1

# Anything outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] is treated as an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outlier = [item for item in x if item < lower_fence or item > upper_fence]

outlier

[4.5]

In [29]:
# Dealing with missing values in the cleaning stage.
# Missing entries show up as NULL or NaN ("Not a Number").
# DataFrame.isnull() reports, cell by cell, whether a value is missing;
# chaining .sum() onto it gives the total number of missing values per column.

# np.nan is used instead of `from numpy import NaN`: the `NaN` alias was
# removed in NumPy 2.0, where that import raises ImportError.
data1 = {'Name': ['Edison', 'Edwards', 'James', 'Neesham'], 'Age': [28, 27, np.nan, 36]}

data = pd.DataFrame.from_dict(data1)
data


Unnamed: 0,Name,Age
0,Edison,28.0
1,Edwards,27.0
2,James,
3,Neesham,36.0


In [31]:
# Boolean frame: True marks a cell whose value is missing (NaN), False otherwise.
missing_mask = data.isnull()
missing_mask

Unnamed: 0,Name,Age
0,False,False
1,False,False
2,False,True
3,False,False


In [33]:
# Summing the boolean mask counts the missing values in each column (here Age = 1).
missing_per_column = data.isnull().sum()
missing_per_column

Name    0
Age     1
dtype: int64

In [37]:
# Delete every row that contains a missing value (here the third record, index 2).
# dropna() returns a new frame, so `data` is not mutated: the frame displayed by
# earlier cells stays valid and this cell can be re-run safely.
data_without_missing = data.dropna()
data_without_missing


Unnamed: 0,Name,Age
0,Edison,28.0
1,Edwards,27.0
3,Neesham,36.0


In [67]:
# Replacing missing values with the mean / median / mode keeps every row.
# Note: mean imputation leaves the column mean unchanged, but it shrinks the
# variance, so it is not neutral for every statistic.
# DataFrame.fillna() performs the replacement.
import pandas as pd
import numpy as np

# np.nan is used instead of `from numpy import NaN`: the `NaN` alias was
# removed in NumPy 2.0, where that import raises ImportError.
data1 = {'Name': ['Edison', 'Edwards', 'James', 'Neesham'], 'Age': [28, 27, np.nan, 36]}
data = pd.DataFrame.from_dict(data1)

# Mean of each numeric column; missing entries are skipped by .mean().
numeric_means = data.select_dtypes(include=[np.number]).mean()

data


Unnamed: 0,Name,Age
0,Edison,28.0
1,Edwards,27.0
2,James,
3,Neesham,36.0


In [69]:
# Replace each NaN with the mean of its own column.
# fillna() returns a new frame, so `data` is not mutated: the frame displayed by
# the previous cell stays valid and this cell can be re-run safely.
data_filled = data.fillna(numeric_means)
data_filled

Unnamed: 0,Name,Age
0,Edison,28.0
1,Edwards,27.0
2,James,30.333333
3,Neesham,36.0


In [87]:
# Regular expressions, e.g. re.findall()
import re  # standard-library regular-expression module

txt = 'Python is my favorite programming languague. I love Python. Do not u love Python as well?'

# findall() returns every occurrence of the word 'Python' as a list of strings.
python_pattern = re.compile('Python')
x = python_pattern.findall(txt)
x

['Python', 'Python', 'Python']

In [89]:
# The number of matches is the length of the match list `x`.
occurrences = len(x)
print('The quantity of times the Python word occurred is: ', occurrences)

The quantity of times the Python word occurred is:  3


In [91]:
# A caret (^) in front of the pattern anchors it to the start of the text, so
# only the single 'Python' that opens the text is found.
leading_python = re.compile('^Python')
x = leading_python.findall(txt)
x

['Python']

In [103]:
# '\d' matches one digit, so every digit comes back as its own match.
# The r'' (raw string) prefix keeps the backslash literal; without it Python
# reports '\d' as an invalid escape sequence.
txt = 'Python was released in 1991.'
single_digits = re.findall(r'\d', txt)
single_digits

['1', '9', '9', '1']

In [105]:
# '+' means "one or more", so consecutive digits are returned as one match.
digit_runs = re.findall(r'\d+', txt)
digit_runs

['1991']

In [107]:
# A Series of country names. To search it for a word or phrase with `re`, the
# Series is first rendered as a single string (Series.to_string()).
import pandas as pd
import re

txtList = ['Pakistan', 'Indonesia', 'Jordan', 'Pakistan']
txt = pd.Series(data=txtList)
txt

0     Pakistan
1    Indonesia
2       Jordan
3     Pakistan
dtype: object

In [109]:
# Render the Series as one block of text, then scan that text for 'Pakistan'.
series_text = txt.to_string()
re.findall('Pakistan', series_text)

['Pakistan', 'Pakistan']

In [113]:
# re.search() is quite useful: it returns a Match object when the pattern is
# found in the text.
import pandas as pd
import re

txt = 'Hello, world!'
# Matching is case-sensitive: searching for 'World' or 'WORLD' would not match
# this text (re.search() returns None when nothing matches).
world_pattern = re.compile('world')
match_object = world_pattern.search(txt)
match_object


<re.Match object; span=(7, 12), match='world'>

In [115]:
# span() gives the (start, end) positions of the matched 'world' inside the text.
match_span = match_object.span()
match_span

(7, 12)

In [119]:
# re.sub() returns a copy of the string in which the text matched by the
# pattern has been replaced with a different text.
import pandas as pd
import re

txt = 'C used to be one of my favorites programming languages.'
replaced_txt = re.sub('C', 'C++', txt)
replaced_txt


'C++ used to be one of my favorites programming languages.'