In [2]:
!pip install polars



In [3]:
# Run the shared configuration notebook before anything else.
# NOTE(review): presumably this is what defines `raw_path`, `silver_path` and
# `to_snake_case`, which later cells use without defining them — confirm.
%run ../utils/configuration.ipynb

In [4]:

import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import pandas as pd

In [6]:

# Read the raw location workbook with pandas, then convert it to a polars
# DataFrame for the rest of the notebook.
excel_file = f"{raw_path}/Telco_customer_churn_location.xlsx"
pdf = pd.read_excel(excel_file)
df = pl.from_pandas(pdf)

In [7]:
# Quick look at the first three rows (plain-text rendering).
first_rows = df.head(3)
print(first_rows)

shape: (3, 9)
┌────────────┬───────┬────────────┬────────────┬───┬──────────┬────────────┬───────────┬───────────┐
│ Customer   ┆ Count ┆ Country    ┆ State      ┆ … ┆ Zip Code ┆ Lat Long   ┆ Latitude  ┆ Longitude │
│ ID         ┆ ---   ┆ ---        ┆ ---        ┆   ┆ ---      ┆ ---        ┆ ---       ┆ ---       │
│ ---        ┆ i64   ┆ str        ┆ str        ┆   ┆ i64      ┆ str        ┆ f64       ┆ f64       │
│ str        ┆       ┆            ┆            ┆   ┆          ┆            ┆           ┆           │
╞════════════╪═══════╪════════════╪════════════╪═══╪══════════╪════════════╪═══════════╪═══════════╡
│ 8779-QRDMV ┆ 1     ┆ United     ┆ California ┆ … ┆ 90022    ┆ 34.02381,  ┆ 34.02381  ┆ -118.1565 │
│            ┆       ┆ States     ┆            ┆   ┆          ┆ -118.15658 ┆           ┆ 82        │
│            ┆       ┆            ┆            ┆   ┆          ┆ 2          ┆           ┆           │
│ 7495-OOKFY ┆ 1     ┆ United     ┆ California ┆ … ┆ 90063    ┆ 34.044271, ┆ 

In [8]:
# Rich display of the frame; the cell's last expression is rendered as its output.
df

Customer ID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude
str,i64,str,str,str,i64,str,f64,f64
"""8779-QRDMV""",1,"""United States""","""California""","""Los Angeles""",90022,"""34.02381, -118.156582""",34.02381,-118.156582
"""7495-OOKFY""",1,"""United States""","""California""","""Los Angeles""",90063,"""34.044271, -118.185237""",34.044271,-118.185237
"""1658-BYGOY""",1,"""United States""","""California""","""Los Angeles""",90065,"""34.108833, -118.229715""",34.108833,-118.229715
"""4598-XLKNJ""",1,"""United States""","""California""","""Inglewood""",90303,"""33.936291, -118.332639""",33.936291,-118.332639
"""4846-WHAFZ""",1,"""United States""","""California""","""Whittier""",90602,"""33.972119, -118.020188""",33.972119,-118.020188
…,…,…,…,…,…,…,…,…
"""2569-WGERO""",1,"""United States""","""California""","""Landers""",92285,"""34.341737, -116.539416""",34.341737,-116.539416
"""6840-RESVB""",1,"""United States""","""California""","""Adelanto""",92301,"""34.667815, -117.536183""",34.667815,-117.536183
"""2234-XADUH""",1,"""United States""","""California""","""Amboy""",92304,"""34.559882, -115.637164""",34.559882,-115.637164
"""4801-JZAZL""",1,"""United States""","""California""","""Angelus Oaks""",92305,"""34.1678, -116.86433""",34.1678,-116.86433


In [9]:
# Report how many distinct values each column holds, to spot constant
# columns and candidate keys.
for column_name in df.columns:
    distinct_count = df[column_name].n_unique()
    print(f"{column_name}: {distinct_count} unique values")

Customer ID: 7043 unique values
Count: 1 unique values
Country: 1 unique values
State: 1 unique values
City: 1106 unique values
Zip Code: 1626 unique values
Lat Long: 1679 unique values
Latitude: 1626 unique values
Longitude: 1625 unique values


In [10]:
# These columns each contain a single unique value in this dataset, so they
# carry no information and are removed.
constant_columns = ["Count", "Country", "State"]
df = df.drop(constant_columns)

In [11]:
# Validate that every 'Customer ID' follows the pattern: 4 digits, a dash,
# and 5 letters.
import re  # NOTE(review): not used in this cell; kept in case other cells rely on it.
pattern = r"^\d{4}-[A-Za-z]{5}$"
# `str.contains` yields null for a null ID, and `filter` discards rows whose
# predicate is null. Treat null as "does not match" so that a missing ID is
# reported as invalid instead of silently passing the check.
mask = df.select(
    (~pl.col('Customer ID').str.contains(pattern).fill_null(False)).alias('invalid')
)['invalid']
invalid_ids = df.filter(mask)
if invalid_ids.height == 0:
    print("No invalid Customer IDs")
else:
    print(invalid_ids['Customer ID'])

No invalid Customer IDs


In [12]:
# Keep only the rows flagged as duplicated across all columns and show them;
# an empty frame means there are no fully duplicated rows.
duplicate_mask = df.is_duplicated()
duplicates = df.filter(duplicate_mask)
print(duplicates)

shape: (0, 6)
┌─────────────┬──────┬──────────┬──────────┬──────────┬───────────┐
│ Customer ID ┆ City ┆ Zip Code ┆ Lat Long ┆ Latitude ┆ Longitude │
│ ---         ┆ ---  ┆ ---      ┆ ---      ┆ ---      ┆ ---       │
│ str         ┆ str  ┆ i64      ┆ str      ┆ f64      ┆ f64       │
╞═════════════╪══════╪══════════╪══════════╪══════════╪═══════════╡
└─────────────┴──────┴──────────┴──────────┴──────────┴───────────┘


In [13]:
# Print the repr of every column name so stray whitespace or hidden
# characters would be visible.
print(list(map(repr, df.columns)))

["'Customer ID'", "'City'", "'Zip Code'", "'Lat Long'", "'Latitude'", "'Longitude'"]


In [14]:
# Rename every column via `to_snake_case`.
# NOTE(review): `to_snake_case` is not defined in this notebook's visible
# cells; presumably it comes from the configuration notebook — confirm.
snake_case_names = {name: to_snake_case(name) for name in df.columns}
df = df.rename(snake_case_names)

In [15]:
# Confirm the column names after the rename.
df.columns

['customer_id', 'city', 'zip_code', 'lat_long', 'latitude', 'longitude']

In [16]:
# Persist the cleaned location table as a Parquet file under `silver_path`.
output_file = f'{silver_path}/location.parquet'
df.write_parquet(output_file)