In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../ecommerce_data_with_trends.csv')
data.head()

Unnamed: 0,transaction_id,timestamp,customer_id,customer_name,city,customer_type,product_name,category,price,quantity,total_amount
0,TX_89a20095-f7be-4bc1-bb58-a305e3fc8313,2023-10-30 03:01:46.571042,6933,David Hays,New Sabrina,B2C,Furniture Product_10,Home & Kitchen > Furniture,246.08,4,984.32
1,TX_a6b15a50-47b9-428c-b297-ce365acb061a,2023-10-30 03:06:07.918040,9328,Adam Oconnell,East Katherineton,B2C,Non-Fiction Product_15,Books > Non-Fiction,792.3,4,3169.2
2,TX_abdde2cb-3752-4399-84f4-c91c098b195f,2023-10-30 03:06:28.475922,6766,Jerry Brown,Lukefort,B2B,Bedding Product_1,Home & Kitchen > Bedding,685.73,40,27429.2
3,TX_ba162310-0807-4dee-818f-fa014f9880ef,2023-10-30 03:06:59.374222,9111,Craig Martinez,South Leonard,B2B,Shoes Product_11,Fashion > Shoes,404.96,96,38876.16
4,TX_60ec44fd-2172-4ffa-8c67-4b399c59ed7c,2023-10-30 03:08:29.580560,1763,David Wood,Jacksonstad,B2B,Supplements Product_5,Health & Personal Care > Supplements,927.67,35,32468.45


In [4]:
# Basic statistical summary
print("Statistical Summary:")
print(data.describe())

# Value counts for categorical columns
print("\nProduct Category Distribution:")
print(data['category'].value_counts())

# Time-based analysis
data['timestamp'] = pd.to_datetime(data['timestamp'])
print("\nTransactions by Month:")
print(data.groupby(data['timestamp'].dt.month)['transaction_id'].count())

# Check for outliers in numerical columns
print("\nChecking for outliers in total_amount:")
Q1 = data['total_amount'].quantile(0.25)
Q3 = data['total_amount'].quantile(0.75)
IQR = Q3 - Q1
outliers = data[(data['total_amount'] < (Q1 - 1.5 * IQR)) | (data['total_amount'] > (Q3 + 1.5 * IQR))]
print(f"Number of potential outliers in total_amount: {len(outliers)}")

# Check for unusual quantities
print("\nQuantity Statistics:")
print(data['quantity'].describe())

Statistical Summary:
          customer_id           price        quantity    total_amount
count  1000000.000000  1000000.000000  1000000.000000  1000000.000000
mean      5002.847309      510.471790       29.922302    15258.971813
std       2888.689217      280.779371       32.558757    20702.528840
min          1.000000        9.880000        1.000000        9.880000
25%       2501.000000      273.140000        3.000000     1174.700000
50%       5009.000000      532.440000       12.000000     3751.000000
75%       7504.000000      751.120000       56.000000    23350.130000
max      10000.000000      998.530000      150.000000   131862.080000

Product Category Distribution:
category
Books > Non-Fiction                       56745
Home & Kitchen > Kitchen Appliances       56714
Books > Fiction                           56418
Health & Personal Care > Supplements      54002
Electronics > Computers                   53972
Electronics > Audio                       53969
Sports & Outdoors > 

In [5]:
print(f"The shape of the data is : {data.shape}")
print(f"The columns of the data are : {data.columns}")
print(f"The data types of the columns are : {data.dtypes}")

The shape of the data is : (1000000, 11)
The columns of the data are : Index(['transaction_id', 'timestamp', 'customer_id', 'customer_name', 'city',
       'customer_type', 'product_name', 'category', 'price', 'quantity',
       'total_amount'],
      dtype='object')
The data types of the columns are : transaction_id            object
timestamp         datetime64[ns]
customer_id                int64
customer_name             object
city                      object
customer_type             object
product_name              object
category                  object
price                    float64
quantity                   int64
total_amount             float64
dtype: object


In [6]:
print(f"The number of missing values in the data are : {data.isnull().sum()}")
#print(f"The number of missing values in the data are : {data.isna().sum()}") # Returns 0 also


The number of missing values in the data are : transaction_id    0
timestamp         0
customer_id       0
customer_name     0
city              0
customer_type     0
product_name      0
category          0
price             0
quantity          0
total_amount      0
dtype: int64


In [7]:
# Count the number of duplicate rows
print(data.duplicated().sum())

0
