In [2]:
import polars as pl

## Sample Data Creation

In [3]:
# Sample purchases written one record per row, then pivoted into the
# column-name -> list-of-values mapping used to build the DataFrame.
sample_columns = (
    "id",
    "product customer purchased",
    "Name of customer",
    "address of customer",
    "email of customer",
    "phone number of customer",
    "time of purchase",
)
sample_rows = [
    (1, "Widget A", "John Doe", "123 Main St", "john.doe@email.com", "123-456-7890", "2023-08-10 09:15:00"),
    # Blank email is an empty string, not a null.
    (2, "Widget B", "Jane Smith", "456 Elm St", "", "987-654-3210", "2023-08-10 10:30:00"),
    (3, "Widget C", "Bob Johnson", "789 Oak St", "bob.johnson@email.com", "555-123-7890", "2023-08-11 14:45:00"),
    (4, "Widget D", "Susan Brown", "234 Pine St", "susan.brown@email.com", "777-888-9999", "2023-08-11 15:30:00"),
    (5, "Widget E", "Mary Wilson", "567 Birch St", "mary.wilson@email.com", "111-222-3333", "2023-08-12 11:20:00"),
    # Exact repeat of the first record, so the frame contains one duplicate row.
    (1, "Widget A", "John Doe", "123 Main St", "john.doe@email.com", "123-456-7890", "2023-08-10 09:15:00"),
]
data = {
    column: list(values)
    for column, values in zip(sample_columns, zip(*sample_rows))
}

In [4]:
# Build the sample frame from the column-oriented dict: each key becomes a column name.
df = pl.DataFrame(data)

In [5]:
# Display the sample frame; its last row is an exact repeat of the first (id 1).
df

id,product customer purchased,Name of customer,address of customer,email of customer,phone number of customer,time of purchase
i64,str,str,str,str,str,str
1,"""Widget A""","""John Doe""","""123 Main St""","""john.doe@email…","""123-456-7890""","""2023-08-10 09:…"
2,"""Widget B""","""Jane Smith""","""456 Elm St""","""""","""987-654-3210""","""2023-08-10 10:…"
3,"""Widget C""","""Bob Johnson""","""789 Oak St""","""bob.johnson@em…","""555-123-7890""","""2023-08-11 14:…"
4,"""Widget D""","""Susan Brown""","""234 Pine St""","""susan.brown@em…","""777-888-9999""","""2023-08-11 15:…"
5,"""Widget E""","""Mary Wilson""","""567 Birch St""","""mary.wilson@em…","""111-222-3333""","""2023-08-12 11:…"
1,"""Widget A""","""John Doe""","""123 Main St""","""john.doe@email…","""123-456-7890""","""2023-08-10 09:…"


In [6]:
# Inspect the inferred dtypes: only "id" is numeric, and "time of purchase"
# is still a plain string rather than a datetime.
df.schema

OrderedDict([('id', Int64),
             ('product customer purchased', String),
             ('Name of customer', String),
             ('address of customer', String),
             ('email of customer', String),
             ('phone number of customer', String),
             ('time of purchase', String)])

In [7]:
# Count nulls per column. Empty strings are not nulls: the blank email for id 2
# is stored as "" and is therefore not counted, so every column reports 0.
df.null_count()

id,product customer purchased,Name of customer,address of customer,email of customer,phone number of customer,time of purchase
u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0


## Getting Unique Records: Aggregating Duplicate Records into a Single Record

In [8]:
# Drop exact duplicate rows (all columns are compared). maintain_order=True keeps
# the surviving rows in their original order, so re-running this cell yields the
# same frame instead of an arbitrary row order.
unique_df = df.unique(maintain_order=True)

In [9]:
# Show the de-duplicated rows ordered by "id". The sorted frame is only displayed;
# it is not assigned back to unique_df.
unique_df.sort("id")

id,product customer purchased,Name of customer,address of customer,email of customer,phone number of customer,time of purchase
i64,str,str,str,str,str,str
1,"""Widget A""","""John Doe""","""123 Main St""","""john.doe@email…","""123-456-7890""","""2023-08-10 09:…"
2,"""Widget B""","""Jane Smith""","""456 Elm St""","""""","""987-654-3210""","""2023-08-10 10:…"
3,"""Widget C""","""Bob Johnson""","""789 Oak St""","""bob.johnson@em…","""555-123-7890""","""2023-08-11 14:…"
4,"""Widget D""","""Susan Brown""","""234 Pine St""","""susan.brown@em…","""777-888-9999""","""2023-08-11 15:…"
5,"""Widget E""","""Mary Wilson""","""567 Birch St""","""mary.wilson@em…","""111-222-3333""","""2023-08-12 11:…"


In [3]:
# Load the CSV sample dataset; the path is relative to the current working directory.
csv_df = pl.read_csv("./data/1/sample_data_1.csv")

In [14]:
# Preview the loaded frame (the repr elides the middle rows with "…").
# Note that "Date of Birth" was read as a string column, not a date.
csv_df

Name,Email,Phone,Address,City,Country,Date of Birth,Salary
str,str,str,str,str,str,str,i64
"""Lisa West""","""andrew23@examp…","""(465)852-8470""","""Unit 5183 Box …","""Perezfort""","""Honduras""","""2001-08-05""",83302
"""Jason Cole""","""kenneth05@exam…","""7783540568""","""73784 Kenneth …","""Port Andrea""","""Bolivia""","""2012-12-04""",31035
"""Juan Crosby""","""zbarnes@exampl…","""(676)328-6244x…","""0580 Mary Row …","""Johnborough""","""South Africa""","""1921-02-02""",60226
"""April Ball""","""michaelmckinne…","""+1-633-685-490…","""6215 Hensley P…","""New Eric""","""Greece""","""2023-05-08""",52502
"""Donald Jackson…","""peterdean@exam…","""779-437-7643""","""156 Macdonald …","""Alyssaborough""","""Namibia""","""2019-03-07""",38687
…,…,…,…,…,…,…,…
"""Willie Shelton…","""jamesdavis@exa…","""277-426-9077x8…","""2277 Kathryn C…","""East Michael""","""Mongolia""","""2017-05-20""",59788
"""Deborah Nichol…","""michelle15@exa…","""954-765-9968x1…","""3179 Young Bro…","""Port Linda""","""Senegal""","""1963-07-25""",61230
"""Vanessa Wu""","""reginaldowens@…","""+1-682-798-156…","""1820 Denise Fo…","""Cassandraview""","""Israel""","""1929-01-18""",70482
"""Robyn Allen""","""chad33@example…","""212-239-3934x5…","""1162 Kathryn E…","""Bruceberg""","""Iran""","""1941-05-17""",73249


### Confirming Duplicate Records

In [23]:
# is_duplicated() yields a boolean mask over the rows of csv_df; filtering with it
# keeps every copy of a repeated row (not just the extras), so each duplicated
# record appears as many times as it occurs in the file.
duplicated_records = csv_df.filter(csv_df.is_duplicated())

In [32]:
# Narrow the duplicates down to one person so the repeated copies can be compared side by side.
filtered_records = duplicated_records.filter(pl.col("Name") == "Scott Stewart")

In [37]:
# Emit each matching record as a plain Python tuple: a type banner, the tuple
# itself, then a separator line, one group per row.
for row in filtered_records.rows():
    print(
        f"This is the row type {type(row)}",
        row,
        "=================================",
        sep="\n",
    )

This is the row type <class 'tuple'>
('Scott Stewart', 'valerie39@example.com', '956.897.0540', '228 Adrian Pine\nNorth Sean, IL 20499', 'Alexandraside', 'New Zealand', '2019-09-30', 81174)
This is the row type <class 'tuple'>
('Scott Stewart', 'valerie39@example.com', '956.897.0540', '228 Adrian Pine\nNorth Sean, IL 20499', 'Alexandraside', 'New Zealand', '2019-09-30', 81174)
This is the row type <class 'tuple'>
('Scott Stewart', 'valerie39@example.com', '956.897.0540', '228 Adrian Pine\nNorth Sean, IL 20499', 'Alexandraside', 'New Zealand', '2019-09-30', 81174)
This is the row type <class 'tuple'>
('Scott Stewart', 'valerie39@example.com', '956.897.0540', '228 Adrian Pine\nNorth Sean, IL 20499', 'Alexandraside', 'New Zealand', '2019-09-30', 81174)
This is the row type <class 'tuple'>
('Scott Stewart', 'valerie39@example.com', '956.897.0540', '228 Adrian Pine\nNorth Sean, IL 20499', 'Alexandraside', 'New Zealand', '2019-09-30', 81174)


### Creating a Unique DataFrame That Aggregates Duplicates into One Record

In [12]:
# Keep one copy of every fully identical row. The result does not follow the
# input row order (the first row shown below differs from csv_df's first row),
# so sort explicitly before relying on row positions.
unique_csv_df = csv_df.unique()

In [15]:
# Preview the de-duplicated frame.
unique_csv_df

Name,Email,Phone,Address,City,Country,Date of Birth,Salary
str,str,str,str,str,str,str,i64
"""Scott Stewart""","""valerie39@exam…","""956.897.0540""","""228 Adrian Pin…","""Alexandraside""","""New Zealand""","""2019-09-30""",81174
"""Deborah Nichol…","""michelle15@exa…","""954-765-9968x1…","""3179 Young Bro…","""Port Linda""","""Senegal""","""1963-07-25""",61230
"""Andrea Brooks""","""billy12@exampl…","""869.593.8622""","""1989 Blake Pas…","""South Steven""","""Azerbaijan""","""1985-04-30""",55049
"""Amanda Haney""","""armstrongmark@…","""848-437-9475""","""98549 Jordan E…","""Lake Nicole""","""Tokelau""","""1957-01-31""",56752
"""Crystal Freema…","""christineberry…","""321-515-6479x4…","""744 Jamie Curv…","""New Edwardview…","""Algeria""","""1918-10-21""",85768
…,…,…,…,…,…,…,…
"""Cameron Reed""","""imendoza@examp…","""(816)923-0541x…","""3156 Lindsey G…","""Jenniferfurt""","""Tajikistan""","""1958-08-21""",41460
"""Mary Wilson""","""emily40@exampl…","""(299)828-6880""","""862 Kimberly R…","""Port Michellev…","""Benin""","""1972-06-29""",83069
"""Michelle Steph…","""fernandezcharl…","""893.974.4623x3…","""12161 Jason St…","""Clarkburgh""","""Fiji""","""1911-06-19""",91835
"""Katherine Barr…","""john54@example…","""(560)427-5037""","""8885 Farley We…","""West Victoria""","""Bahrain""","""1947-06-21""",75146
