# ***DATA PROCESSING***

## **Analisis Data Menggunakan POWER BI**

![testing](checking-missing-value-sneakers.png "test")

## **Koneksi Ke Database**

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [2]:
load_dotenv()

# Ambil variabel dari .env
user_postgres = os.getenv("USER_POSTGRES")
password_postgres = os.getenv("PASS_POSTGRES")
host_postgres = os.getenv("HOST_POSTGRES")
port_postgres = os.getenv("PORT_POSTGRES")
db_postgres = os.getenv("DB_POSTGRES")

user_mysql = os.getenv("USER_MYSQL")
password_mysql = os.getenv("PASS_MYSQL")
host_mysql = os.getenv("HOST_MYSQL")
port_mysql = os.getenv("PORT_MYSQL")
db_mysql = os.getenv("DB_MYSQL")


postgres_conn = f"postgresql+psycopg2://{user_postgres}:{password_postgres}@{host_postgres}:{port_postgres}/{db_postgres}"

mysql_conn = f"mysql+pymysql://{user_mysql}:{password_mysql}@{host_mysql}:{port_mysql}/{db_mysql}"

postgres_engine = create_engine(postgres_conn)
postgres_engine = create_engine(mysql_conn)

python-dotenv could not parse statement starting at line 13
python-dotenv could not parse statement starting at line 17


In [3]:
mysql_df_iris = pd.read_sql("SELECT * FROM iris_no_class", mysql_conn)
postgres_df_iris = pd.read_sql("SELECT * FROM iris_no_class", postgres_conn)

In [4]:
# mengabungkan 2 database
merge_df = pd.merge(mysql_df_iris, postgres_df_iris, left_on="id", right_on='id', how='outer')

In [5]:
print(merge_df)

      id  sepal_length  sepal_width  petal_length  petal_width
0      1           5.1          3.5           1.4          0.2
1      2           4.9          3.0           1.4          0.2
2      3           4.7          3.2           1.3          0.2
3      4           4.6          3.1           1.5          0.2
4      5           5.0          3.6           1.4          0.2
..   ...           ...          ...           ...          ...
145  146           6.7          3.0           5.2          2.3
146  147           6.3          2.5           5.0          1.9
147  148           6.5          3.0           5.2          2.0
148  149           6.2          3.4           5.4          2.3
149  150           5.9          3.0           5.1          1.8

[150 rows x 5 columns]


In [6]:
table_iris = merge_df.copy()

## **Algoritma ABOD**

In [7]:
from pycaret.anomaly import *

# Setup PyCaret untuk anomaly detection
exp_ano = setup(data=table_iris)

# Buat model ABOD
abod_model = create_model('abod')

# Assign label outlier ke setiap baris
dataset_outliers = assign_model(abod_model)

# Hapus baris yang dianggap outlier
dataset_clean = dataset_outliers[dataset_outliers['Anomaly'] == 1]

dataset_clean

Unnamed: 0,Description,Value
0,Session id,5149
1,Original data shape,"(150, 5)"
2,Transformed data shape,"(150, 5)"
3,Numeric features,5
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
98,99,5.1,2.5,3.0,1.1,1,-0.006736
106,107,4.9,2.5,4.5,1.7,1,-0.001276
117,118,7.7,3.8,6.7,2.2,1,-0.005361
118,119,7.7,2.6,6.9,2.3,1,-0.005204
119,120,6.0,2.2,5.0,1.5,1,-0.00608
122,123,7.7,2.8,6.7,2.0,1,-0.004978
135,136,7.7,3.0,6.1,2.3,1,-0.006185
149,150,5.9,3.0,5.1,1.8,1,-0.006176


## **Algoritma KNN**

In [8]:
from pycaret.anomaly import *

# Setup PyCaret untuk anomaly detection
exp_ano = setup(data=table_iris)

# Buat model ABOD
abod_model = create_model('knn')

# Assign label outlier ke setiap baris
dataset_outliers = assign_model(abod_model)

# Hapus baris yang dianggap outlier
dataset_clean = dataset_outliers[dataset_outliers['Anomaly'] == 1]

dataset_clean

Unnamed: 0,Description,Value
0,Session id,8311
1,Original data shape,"(150, 5)"
2,Transformed data shape,"(150, 5)"
3,Numeric features,5
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,1,5.1,3.5,1.4,0.2,1,5.037857
1,2,4.9,3.0,1.4,0.2,1,4.146083
48,49,5.3,3.7,1.5,0.2,1,4.031129
49,50,5.0,3.3,1.4,0.2,1,4.155719
50,51,7.0,3.2,4.7,1.4,1,4.164132
51,52,6.4,3.2,4.5,1.5,1,4.08534
148,149,6.2,3.4,5.4,2.3,1,4.048456
149,150,5.9,3.0,5.1,1.8,1,5.155579


## **Algoritma LOF**

In [9]:
from pycaret.anomaly import *

# Setup PyCaret untuk anomaly detection
exp_ano = setup(data=table_iris)

# Buat model ABOD
abod_model = create_model('lof')

# Assign label outlier ke setiap baris
dataset_outliers = assign_model(abod_model)

# Hapus baris yang dianggap outlier
dataset_clean = dataset_outliers[dataset_outliers['Anomaly'] == 1]

dataset_clean

Unnamed: 0,Description,Value
0,Session id,8216
1,Original data shape,"(150, 5)"
2,Transformed data shape,"(150, 5)"
3,Numeric features,5
4,Preprocess,True
5,Imputation type,simple
6,Numeric imputation,mean
7,Categorical imputation,mode
8,CPU Jobs,-1
9,Use GPU,False


Unnamed: 0,id,sepal_length,sepal_width,petal_length,petal_width,Anomaly,Anomaly_Score
0,1,5.1,3.5,1.4,0.2,1,1.256153
1,2,4.9,3.0,1.4,0.2,1,1.217657
2,3,4.7,3.2,1.3,0.2,1,1.183005
3,4,4.6,3.1,1.5,0.2,1,1.153191
146,147,6.3,2.5,5.0,1.9,1,1.150096
147,148,6.5,3.0,5.2,2.0,1,1.179429
148,149,6.2,3.4,5.4,2.3,1,1.214298
149,150,5.9,3.0,5.1,1.8,1,1.253521
