# Data Preparation
This notebook was written to organise customer information and store it in a CSV file. A CSV file is much more useful if one wants to organise information and conduct data analysis.

*Note:* The output has been cleared out for data protection reasons.
## 1. Data Extraction

In [None]:
import csv

# The raw text export and the CSV derived from it share one base name,
# so both paths are built from it.
base_name = "25feb_29Apr"
input_file = base_name + ".txt"
output_file = base_name + ".csv"

In [None]:
# Read the whole export into memory, one list entry per line.
# Mode "r" is sufficient because the file is only read; "r+" would also
# demand write permission and fail on a read-only file.
with open(input_file, "r", encoding="utf-8") as file:
    lines = file.readlines()
print(lines)

# Initialize an empty list to store the modified lines
formatted_lines = []

In [None]:
# Start from a fresh list inside this cell so that re-running the cell
# does not append the same lines a second time.
formatted_lines = []

for raw_line in lines:
    stripped = raw_line.strip()
    # Blank lines carry no information and are skipped.
    if not stripped:
        continue
    # Remove commas embedded in the text: the comma is used as the field
    # separator when the lines are joined below. Trim again in case removing
    # a comma exposed leading or trailing whitespace.
    formatted_lines.append(stripped.replace(",", "").strip())


print(formatted_lines)
print(",".join(formatted_lines))

In [None]:
# Join every cleaned line into one comma-separated string, then cut it at each
# "Retail Sale,Register" marker. The text before the first marker is discarded.
# NOTE: this rebinds `formatted_lines`, so re-running this cell without first
# re-running the previous one gives a different result.
joined_text = ",".join(formatted_lines)
formatted_lines = joined_text.split("Retail Sale,Register")[1:]
formatted_lines

In [None]:
# Build one [customer, total_sale] row per record.
# After splitting a record on commas, field 1 is read as the customer line and
# field 2 as the total-sale line; the labels and currency markers are removed.
data = []
for record in formatted_lines:
    fields = record.split(",")
    customer_name = fields[1].replace("Customer:", "").strip()
    sale_amount = fields[2]
    # The removal order matters and mirrors the original chain of replacements.
    for token in ("E", "￡", ":", "Total Sale"):
        sale_amount = sale_amount.replace(token, "")
    data.append([customer_name, sale_amount.strip()])

print(data)

# Persist the rows; newline="" lets the csv module control line endings.
with open(output_file, "w", newline="", encoding="utf-8-sig") as csv_handle:
    csv_out = csv.writer(csv_handle)
    csv_out.writerow(["customer", "total_sale"])
    csv_out.writerows(data)

# 2. Data Cleaning and Preprocessing

In [None]:
!pip install numpy
!pip install pandas 
import numpy as np
import pandas as pd

df = pd.read_csv("25feb_29Apr.csv")

# Display the DataFrame
print(df)

*Deletion: Remove records with missing values*

In [None]:
# Keep only complete records: a row is dropped if any of its values is missing.
df = df.dropna(axis="index", how="any")

*Handling Duplicates*

In [None]:
# Show every row that has an exact duplicate elsewhere in the frame.
# keep=False flags all members of each duplicate group; keep="last" would
# leave out the final occurrence of each group.
df[df.duplicated(keep=False)]

In [None]:
# Remove duplicate records, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

*Formatting*

In [None]:
# Normalise the columns in one step: title-case the customer names and
# convert the sale totals to floats.
df = df.assign(
    customer=df["customer"].str.title(),
    total_sale=df["total_sale"].astype(float),
)

# 3. Data Enrichment

In [None]:
# Add the enrichment columns, each initialised to 0.
# NOTE(review): "price_sensetivity" looks like a misspelling of
# "price_sensitivity"; it is kept as-is because it becomes a column name in
# the exported CSV.
for column in ("no_item", "boy_age", "girl_age", "price_sensetivity", "other"):
    df[column] = 0

df.head()

# 4. Export the CSV


In [None]:
# Write the prepared frame to disk, leaving out the index column.
export_path = "example.csv"
df.to_csv(export_path, index=False)