In [1]:
# necessary imports
import pandas as pd

In [2]:
# load dataset
file_path = 'dataset.csv'
dataset = pd.read_csv(file_path)

# Display basic information about the dataset
print("Initial Dataset Info:")
print(dataset.info())



Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   InvoiceNo    541909 non-null  object
 1   StockCode    541909 non-null  object
 2   Description  540884 non-null  object
 3   Quantity     541909 non-null  object
 4   InvoiceDate  541909 non-null  object
 5   UnitPrice    541909 non-null  object
 6   CustomerID   433909 non-null  object
 7   Country      541909 non-null  object
dtypes: object(8)
memory usage: 33.1+ MB
None


In [3]:
# Print column names
print("Column Names:")
print(dataset.columns.tolist())

Column Names:
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']


In [9]:
# Check for duplicate rows
duplicates = dataset.duplicated()
dataset = dataset.drop_duplicates()
print(f"Number of duplicate rows: {duplicates.sum()}")


Number of duplicate rows: 0


In [10]:
# Check for missing values
missing_values = dataset.isnull().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
InvoiceNo           0
StockCode           0
Description      1025
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     108000
Country             0
dtype: int64


In [11]:
# Handle missing values
dataset.fillna(value='placeholder_value', inplace=True)

# Check again to confirm missing values are handled
missing_values_after = dataset.isnull().sum()
print("Missing Values After Handling:")
print(missing_values_after)


Missing Values After Handling:
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64


In [12]:
# Check data types before standardization
print("Data Types Before Standardization:")
print(dataset.dtypes)

Data Types Before Standardization:
InvoiceNo      object
StockCode      object
Description    object
Quantity       object
InvoiceDate    object
UnitPrice      object
CustomerID     object
Country        object
dtype: object


In [14]:
# Standardize formats
# Convert 'Quantity' and 'UnitPrice' to numeric types
dataset['Quantity'] = pd.to_numeric(dataset['Quantity'], errors='coerce')
dataset['UnitPrice'] = pd.to_numeric(dataset['UnitPrice'], errors='coerce')

# Convert 'CustomerID' to numeric type
dataset['CustomerID'] = pd.to_numeric(dataset['CustomerID'], errors='coerce')

# Convert 'InvoiceDate' to datetime type
dataset['InvoiceDate'] = pd.to_datetime(dataset['InvoiceDate'], errors='coerce')

# Check data types after standardization
print("Data Types After Standardization:")
print(dataset.dtypes)

Data Types After Standardization:
InvoiceNo              object
StockCode              object
Description            object
Quantity              float64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object


In [15]:
# Save the cleaned dataset to a new file
cleaned_file_path = 'cleaned_dataset.csv'
dataset.to_csv(cleaned_file_path, index=False)

print("Dataset cleaning and standardization complete. Cleaned dataset saved to", cleaned_file_path)

Dataset cleaning and standardization complete. Cleaned dataset saved to cleaned_dataset.csv
