In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Shopify order export; every later cell works on this frame.
df = pd.read_csv(filepath_or_buffer='shopify_orders.csv')

In [3]:
# Standardise each order's Total against the whole data set:
# z = (Total - mean) / std. mean_total and std_total are kept as
# notebook-level names because the next cell prints them.
mean_total = df['Total'].mean()
std_total = df['Total'].std()
df['z_score'] = df['Total'].sub(mean_total).div(std_total)

In [4]:
# Report the mean and standard deviation of Total, one per line.
print(
    f'{mean_total} total mean',
    f'{std_total} standard deviation total',
    sep='\n',
)




6176.2 total mean
8846.438898431023 standard deviation total


In [5]:
# Flag suspicious orders: a Total more than 2 standard deviations away from
# the mean (in either direction) is SUSPICIOUS, everything else is OK.
df['Fraud_Flag_ZScore'] = np.where(
    df['z_score'].abs().gt(2),
    'SUSPICIOUS', 'OK'
)


In [7]:
# Print every order the z-score rule marked as SUSPICIOUS.
print(df.loc[df['Fraud_Flag_ZScore'].eq('SUSPICIOUS')])

     Name                      Email Financial Status Fulfillment Status  \
19  #1020     test.buyer777@mail.com             paid        unfulfilled   
42  #1043     test.buyer777@mail.com             paid        unfulfilled   
48  #1049  fakeemail001@tempmail.com             paid        unfulfilled   

      Total  Subtotal  Shipping  Discount Amount           Created at  \
19  25000.0   24200.0     800.0              0.0  2026-01-10 04:10:00   
42  26000.0   25200.0     800.0              0.0  2026-01-19 04:50:00   
48  24000.0   23200.0     800.0              0.0  2026-01-21 01:45:00   

   Billing City Billing Province Billing Country  Shipping City  \
19     San Juan     Metro Manila              PH  Cotabato City   
42     San Juan     Metro Manila              PH        Dipolog   
48  Quezon City     Metro Manila              PH       Valencia   

      Shipping Province Shipping Country Payment Method  Lineitem quantity  \
19          Maguindanao               PH    Credit Card

In [10]:
# Preview the first three rows of the frame.
df.head(n=3)

Unnamed: 0,Name,Email,Financial Status,Fulfillment Status,Total,Subtotal,Shipping,Discount Amount,Created at,Billing City,Billing Province,Billing Country,Shipping City,Shipping Province,Shipping Country,Payment Method,Lineitem quantity,Lineitem price,z_score,Fraud_Flag_ZScore
0,#1001,maria.santos@gmail.com,paid,fulfilled,1250.0,1100.0,150.0,0.0,2026-01-02 10:23:00,Davao City,Davao del Sur,PH,Davao City,Davao del Sur,PH,Credit Card,2,550.0,-0.556857,OK
1,#1002,john.reyes@yahoo.com,paid,fulfilled,850.0,750.0,100.0,0.0,2026-01-02 14:15:00,Cebu City,Cebu,PH,Cebu City,Cebu,PH,Credit Card,1,750.0,-0.602073,OK
2,#1003,anna.cruz@gmail.com,paid,fulfilled,2100.0,1950.0,150.0,0.0,2026-01-03 09:45:00,Manila,Metro Manila,PH,Manila,Metro Manila,PH,PayPal,3,650.0,-0.460773,OK


In [11]:
# Interquartile-range fences on Total: anything below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR is labelled OUTLIER. lower/upper are printed by the next cell.
Q1 = df['Total'].quantile(q=0.25)
Q3 = df['Total'].quantile(q=0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

df['Fraud_Flag_IQR'] = np.where(
    df['Total'].lt(lower) | df['Total'].gt(upper),
    'OUTLIER', 'OK'
)

In [12]:
# Show the IQR fences, then the orders that fall outside them.
print(f"Lower Bound:{lower},Upper Bound:{upper}")
print(df.loc[df['Fraud_Flag_IQR'].eq('OUTLIER')])

Lower Bound:-15712.5,Upper Bound:28547.5
Empty DataFrame
Columns: [Name, Email, Financial Status, Fulfillment Status, Total, Subtotal, Shipping, Discount Amount, Created at, Billing City, Billing Province, Billing Country, Shipping City, Shipping Province, Shipping Country, Payment Method, Lineitem quantity, Lineitem price, z_score, Fraud_Flag_ZScore, Fraud_Flag_IQR]
Index: []

[0 rows x 21 columns]


In [13]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Name,Email,Financial Status,Fulfillment Status,Total,Subtotal,Shipping,Discount Amount,Created at,Billing City,...,Billing Country,Shipping City,Shipping Province,Shipping Country,Payment Method,Lineitem quantity,Lineitem price,z_score,Fraud_Flag_ZScore,Fraud_Flag_IQR
0,#1001,maria.santos@gmail.com,paid,fulfilled,1250.0,1100.0,150.0,0.0,2026-01-02 10:23:00,Davao City,...,PH,Davao City,Davao del Sur,PH,Credit Card,2,550.0,-0.556857,OK,OK
1,#1002,john.reyes@yahoo.com,paid,fulfilled,850.0,750.0,100.0,0.0,2026-01-02 14:15:00,Cebu City,...,PH,Cebu City,Cebu,PH,Credit Card,1,750.0,-0.602073,OK,OK
2,#1003,anna.cruz@gmail.com,paid,fulfilled,2100.0,1950.0,150.0,0.0,2026-01-03 09:45:00,Manila,...,PH,Manila,Metro Manila,PH,PayPal,3,650.0,-0.460773,OK,OK
3,#1004,fakeemail001@tempmail.com,paid,unfulfilled,18500.0,18000.0,500.0,0.0,2026-01-03 02:13:00,Manila,...,PH,Cagayan de Oro,Misamis Oriental,PH,Credit Card,10,1800.0,1.39308,OK,OK
4,#1005,pedro.garcia@gmail.com,paid,fulfilled,1500.0,1350.0,150.0,0.0,2026-01-04 11:30:00,Zamboanga City,...,PH,Zamboanga City,Zamboanga del Sur,PH,GCash,2,675.0,-0.528597,OK,OK


In [16]:
# Count the orders placed per e-mail address and attach that count to each row.
email_counts = df['Email'].value_counts()
df['Order_Count'] = df['Email'].map(email_counts)

# An address with more than 3 orders is labelled FREQUENT.
df['Fraud_Flag_Frequency'] = np.where(
    df['Order_Count'].gt(3),
    'FREQUENT', 'OK'
)

# Print every order that belongs to a FREQUENT address.
print(df.loc[df['Fraud_Flag_Frequency'].eq('FREQUENT')])

     Name                         Email Financial Status Fulfillment Status  \
3   #1004     fakeemail001@tempmail.com             paid        unfulfilled   
6   #1007     fakeemail001@tempmail.com             paid        unfulfilled   
10  #1011  suspicious.buyer99@proton.me             paid        unfulfilled   
13  #1014     fakeemail001@tempmail.com             paid        unfulfilled   
17  #1018  suspicious.buyer99@proton.me             paid        unfulfilled   
22  #1023     fakeemail001@tempmail.com             paid        unfulfilled   
24  #1025  discount.hunter@fakemail.net             paid          fulfilled   
27  #1028  suspicious.buyer99@proton.me             paid        unfulfilled   
28  #1029  discount.hunter@fakemail.net             paid          fulfilled   
34  #1035     fakeemail001@tempmail.com             paid        unfulfilled   
37  #1038  discount.hunter@fakemail.net             paid          fulfilled   
38  #1039  suspicious.buyer99@proton.me             

In [17]:
# Label an order MISMATCH when its billing city differs from its shipping city.
# A plain `!=` evaluates NaN != NaN as True, so an order with BOTH cities
# missing would be reported as a mismatch even though there is nothing to
# compare; those rows are excluded explicitly. A row with only one city
# missing still counts as MISMATCH, as before.
df['Address_Mismatch'] = np.where(
    df['Billing City'].ne(df['Shipping City'])
    & ~(df['Billing City'].isna() & df['Shipping City'].isna()),
    'MISMATCH', 'MATCH'
)

In [21]:
# Parse the creation timestamp (overwriting the column) and pull out the
# hour of day.
df['Created at'] = pd.to_datetime(df['Created at'])
df['Order_Hour'] = df['Created at'].dt.hour

# Hours 1 through 5 inclusive (01:00-05:59) are labelled ODD_HOUR.
df['Fraud_Flag_Time'] = np.where(
    df['Order_Hour'].ge(1) & df['Order_Hour'].le(5),
    'ODD_HOUR', 'OK'
)

# Print every order placed during those hours.
print(df.loc[df['Fraud_Flag_Time'].eq('ODD_HOUR')])

     Name                         Email Financial Status Fulfillment Status  \
3   #1004     fakeemail001@tempmail.com             paid        unfulfilled   
6   #1007     fakeemail001@tempmail.com             paid        unfulfilled   
10  #1011  suspicious.buyer99@proton.me             paid        unfulfilled   
13  #1014     fakeemail001@tempmail.com             paid        unfulfilled   
17  #1018  suspicious.buyer99@proton.me             paid        unfulfilled   
19  #1020        test.buyer777@mail.com             paid        unfulfilled   
22  #1023     fakeemail001@tempmail.com             paid        unfulfilled   
27  #1028  suspicious.buyer99@proton.me             paid        unfulfilled   
31  #1032        test.buyer777@mail.com             paid        unfulfilled   
34  #1035     fakeemail001@tempmail.com             paid        unfulfilled   
38  #1039  suspicious.buyer99@proton.me             paid        unfulfilled   
42  #1043        test.buyer777@mail.com             

In [19]:
# Preview the first three rows of the frame.
df.head(n=3)

Unnamed: 0,Name,Email,Financial Status,Fulfillment Status,Total,Subtotal,Shipping,Discount Amount,Created at,Billing City,...,Shipping Country,Payment Method,Lineitem quantity,Lineitem price,z_score,Fraud_Flag_ZScore,Fraud_Flag_IQR,Order_Count,Fraud_Flag_Frequency,Address_Mismatch
0,#1001,maria.santos@gmail.com,paid,fulfilled,1250.0,1100.0,150.0,0.0,2026-01-02 10:23:00,Davao City,...,PH,Credit Card,2,550.0,-0.556857,OK,OK,1,OK,MATCH
1,#1002,john.reyes@yahoo.com,paid,fulfilled,850.0,750.0,100.0,0.0,2026-01-02 14:15:00,Cebu City,...,PH,Credit Card,1,750.0,-0.602073,OK,OK,1,OK,MATCH
2,#1003,anna.cruz@gmail.com,paid,fulfilled,2100.0,1950.0,150.0,0.0,2026-01-03 09:45:00,Manila,...,PH,PayPal,3,650.0,-0.460773,OK,OK,1,OK,MATCH


In [22]:
# Discount as a fraction of Subtotal; more than 0.3 is labelled HIGH_DISCOUNT.
# NOTE(review): a row with Subtotal == 0 would yield inf/NaN here - confirm
# whether such rows can occur in the export.
df['Discount_Pct'] = df['Discount Amount'].div(df['Subtotal'])

df['Fraud_Flag_Discount'] = np.where(
    df['Discount_Pct'].gt(0.3),
    'HIGH_DISCOUNT', 'OK'
)

In [23]:
# Preview the first three rows of the frame.
df.head(n=3)

Unnamed: 0,Name,Email,Financial Status,Fulfillment Status,Total,Subtotal,Shipping,Discount Amount,Created at,Billing City,...,z_score,Fraud_Flag_ZScore,Fraud_Flag_IQR,Order_Count,Fraud_Flag_Frequency,Address_Mismatch,Order_Hour,Fraud_Flag_Time,Discount_Pct,Fraud_Flag_Discount
0,#1001,maria.santos@gmail.com,paid,fulfilled,1250.0,1100.0,150.0,0.0,2026-01-02 10:23:00,Davao City,...,-0.556857,OK,OK,1,OK,MATCH,10,OK,0.0,OK
1,#1002,john.reyes@yahoo.com,paid,fulfilled,850.0,750.0,100.0,0.0,2026-01-02 14:15:00,Cebu City,...,-0.602073,OK,OK,1,OK,MATCH,14,OK,0.0,OK
2,#1003,anna.cruz@gmail.com,paid,fulfilled,2100.0,1950.0,150.0,0.0,2026-01-03 09:45:00,Manila,...,-0.460773,OK,OK,1,OK,MATCH,9,OK,0.0,OK


In [24]:
# Write the frame, including every derived flag column, to CSV without the
# row index.
df.to_csv(path_or_buf='shopify_orders_clean.csv', index=False)