In [1]:
import pandas as pd
import re
from datetime import datetime
import ast
from fuzzywuzzy import process, fuzz

import warnings

# NOTE(review): catch_warnings() restores the previous warning filters as soon
# as the `with` block exits, and the block contains nothing but the
# simplefilter() call. The "ignore" filter is therefore discarded immediately
# and does not apply to any later cell. If the intent was to silence warnings
# for the whole notebook, confirm that and set the filter outside a context
# manager (or wrap the specific noisy call in the `with` block instead).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")



In [2]:
DATA_DIR = './library-data'

In [3]:
bks = pd.read_csv(f'{DATA_DIR}/books.csv')
cmrs = pd.read_csv(f'{DATA_DIR}/customers.csv')
lbrs = pd.read_csv(f'{DATA_DIR}/libraries.csv')

ckts = pd.read_csv(f'{DATA_DIR}/checkouts_processed.csv')

In [4]:
ckts.rename(columns={'id':'checkout_id', 'patron_id':'customer_id'}, inplace=True)
ckts.sample(3)

Unnamed: 0,checkout_id,customer_id,library_id,days_out_final,is_late
1422,V74zAQAAMAAJ,639c94b52fd7e94e80a51a8fa41e3fea,23v-222@5xc-jv7-v4v,15,0
1379,vYOofF63aBoC,414fc3d0ff4d938370ff6f85ecda28a6,226-222@5xc-kc4-fpv,9,0
1429,Q14YAAAAYAAJ,66cab1cf0592c25ae0cda7970e2ac4de,zzw-224@5xc-jwv-2rk,18,0


In [5]:
bks = bks.rename(columns=lambda x: f'book_{x}' if x != 'id' else x)
lbrs = lbrs.rename(columns=lambda x: f'library_{x}' if x != 'id' else x)
cmrs = cmrs.rename(columns=lambda x: f'customer_{x}' if x != 'id' else x)

In [6]:
bks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  240 non-null    object
 1   book_title          240 non-null    object
 2   book_authors        173 non-null    object
 3   book_publisher      94 non-null     object
 4   book_publishedDate  238 non-null    object
 5   book_categories     201 non-null    object
 6   book_price          238 non-null    object
 7   book_pages          240 non-null    object
dtypes: object(8)
memory usage: 15.1+ KB


In [7]:
bks.book_authors = bks.book_authors.fillna("['Unknown']")
bks.book_publisher = bks.book_publisher.fillna('Unknown')
bks.book_publishedDate = bks.book_publishedDate.fillna('YYYY-MM-DD')
bks.book_categories = bks.book_categories.fillna("['Unknown']")
bks.book_price = bks.book_price.fillna(-1.0)

In [8]:
bks.book_authors = bks.book_authors.apply(ast.literal_eval)
bks.book_categories = bks.book_categories.apply(ast.literal_eval)

In [9]:
def get_unique_list_items(series):
    """Return the distinct items found across a Series of iterables, sorted.

    Missing entries in ``series`` are dropped first; every remaining entry is
    iterated and its items are pooled into one de-duplicated, sorted list.
    """
    distinct = {item for entry in series.dropna() for item in entry}
    return sorted(distinct)

print(get_unique_list_items(bks.book_authors)[:3])
print(get_unique_list_items(bks.book_categories)[:3])

['A. Kungolos', 'Ahmed F. El-Sayed', 'Akira Ohata']
['Accounting', 'Administrative agencies', 'Advertising']


In [10]:
def clean_fuzzy_column(series, similarity_threshold=90, min_freq=2):
    """Normalise a text (or list-of-text) column and merge near-duplicate values.

    Every value is lower-cased, trimmed, and has '-' / '_' replaced by spaces.
    Normalised values that occur at least ``min_freq`` times are then grouped
    by fuzzy similarity (``fuzz.token_sort_ratio`` >= ``similarity_threshold``)
    and each group is mapped to its shortest member.

    Parameters
    ----------
    series : pandas.Series
        Column of strings, or column of lists of strings. The kind is detected
        from the first non-missing element.
    similarity_threshold : int
        Minimum score for two values to be treated as the same item.
    min_freq : int
        Values seen fewer times than this are normalised but never merged.

    Returns
    -------
    (cleaned_series, canonical_map)
        For a list column, ``cleaned_series`` has one sorted, de-duplicated
        list per input row (non-list rows become ``[]``). For a string column
        it contains only the non-missing rows of ``series``.
        ``canonical_map`` maps each grouped value to its canonical form.
    """

    def normalize_item(item):
        return item.lower().strip().replace('-', ' ').replace('_', ' ')

    non_null = series.dropna()
    # An empty / all-missing column has no first element to inspect; treat it
    # as a plain string column instead of failing on .iloc[0].
    is_list_column = (not non_null.empty) and isinstance(non_null.iloc[0], list)

    if is_list_column:
        flat_items = pd.Series(
            [normalize_item(i) for sublist in non_null for i in sublist],
            dtype=object,
        )
    else:
        flat_items = non_null.astype(str).apply(normalize_item)

    counts = flat_items.value_counts()
    # Only values seen at least min_freq times take part in fuzzy grouping,
    # so rare junk is never used as (or merged into) a canonical form.
    items_to_check = counts[counts >= min_freq].index.tolist()

    canonical_map = {}
    for item in items_to_check:
        if item in canonical_map:
            continue
        # limit=None scores against every candidate; the default returns only
        # the five best matches and would silently truncate larger groups.
        matches = process.extract(
            item, items_to_check, scorer=fuzz.token_sort_ratio, limit=None
        )
        # Leave out values that already belong to an earlier group so their
        # mapping is not overwritten and no canonical form points at a value
        # that is itself mapped elsewhere.
        group = [
            match for match, score in matches
            if score >= similarity_threshold and match not in canonical_map
        ]
        # Guarantee the item maps to something even if it failed to match
        # itself (keeps min() from seeing an empty group).
        if item not in group:
            group.append(item)
        canonical = min(group, key=len)
        for alt in group:
            canonical_map[alt] = canonical

    if is_list_column:
        def clean_list(lst):
            if not isinstance(lst, list):
                return []
            normalized = [normalize_item(i) for i in lst]
            return sorted({canonical_map.get(i, i) for i in normalized})
        cleaned_series = series.apply(clean_list)
    else:
        def clean_str(s):
            s_norm = normalize_item(s)
            return canonical_map.get(s_norm, s_norm)
        cleaned_series = non_null.astype(str).apply(clean_str)

    return cleaned_series, canonical_map

In [11]:
bks['book_authors'], author_mapping = clean_fuzzy_column(bks['book_authors'])
bks['book_categories'], category_mapping = clean_fuzzy_column(bks['book_categories'])

print(author_mapping)

{'unknown': 'unknown', 'philip reeve': 'philip reeve', 'khan': 'khan', 'world bank': 'world bank', 'charles eucharist de medicis sajous': 'charles eucharist de medicis sajous'}


In [12]:
bks['book_title'], title_mapping = clean_fuzzy_column(bks['book_title'])

print(title_mapping)

{'popular mechanics': 'popular mechanics', 'medicine': 'medicine', 'advertising management': 'advertising management', 'financial management': 'financial management', 'advertising and sales promotion': 'advertising and sales promotion', 'advertising': 'advertising', 'mortal engines': 'mortal engines', 'water resources data': 'water resources data', 'water resources paper': 'water resources paper'}


In [13]:
def clean_book_prices(bks):
    """Parse the ``book_price`` column into floats.

    Adds two columns to ``bks`` (the frame is modified in place and also
    returned):

    * ``book_price_clean`` - the parsed price, or missing when nothing
      numeric could be extracted.
    * ``book_price_error`` - True where ``book_price_clean`` is missing.

    Values that are already numeric are converted directly, so their sign is
    kept (a negative placeholder such as -1.0 stays -1.0). Text values keep
    the original rule: the first run of digits, with an optional decimal
    part, is used and any surrounding characters are ignored.
    """
    import numbers  # local import: only needed by this helper

    def try_convert(price):
        # Real numbers need no text parsing. Going through str() + regex would
        # drop the minus sign and turn a -1.0 placeholder into a price of 1.0.
        if isinstance(price, numbers.Real) and not isinstance(price, bool):
            return None if pd.isna(price) else float(price)
        cleaned = re.findall(r"\d+\.\d+|\d+", str(price))
        return float(cleaned[0]) if cleaned else None

    bks['book_price_clean'] = bks['book_price'].apply(try_convert)
    bks['book_price_error'] = bks['book_price_clean'].isnull()

    return bks

In [14]:
bks = clean_book_prices(bks)

bks['book_price_error'].value_counts()

book_price_error
False    240
Name: count, dtype: int64

In [15]:
bks = bks.drop(columns=['book_price', 'book_price_error'])
bks = bks.rename(columns={'book_price_clean':'book_price'})
bks['book_price'].isna().sum()

0

In [16]:
def clean_book_pages(bks):
    """Parse the ``book_pages`` column into integers.

    Adds two columns to ``bks`` (the frame is modified in place and also
    returned):

    * ``book_pages_clean`` - the page count, or missing when the value
      contains no digits.
    * ``book_pages_error`` - True where ``book_pages_clean`` is missing.
    """
    def try_convert(page):
        # A count that passed through a float looks like '353.0'. Remove that
        # trailing '.0' before stripping non-digits, otherwise the digits are
        # concatenated into 3530.
        text = re.sub(r'(?<=\d)\.0+(?!\d)', '', str(page))
        cleaned = re.sub(r'\D', '', text)
        return int(cleaned) if cleaned else None

    bks['book_pages_clean'] = bks['book_pages'].apply(try_convert)

    bks['book_pages_error'] = bks['book_pages_clean'].isnull()

    return bks

In [17]:
bks = clean_book_pages(bks)

bks.book_pages_error.value_counts()

book_pages_error
False    240
Name: count, dtype: int64

In [18]:
bks = bks.drop(columns=['book_pages', 'book_pages_error'])
bks = bks.rename(columns={'book_pages_clean':'book_pages'})
bks['book_pages'].isna().sum()

0

In [19]:
def extract_year(val):
    """Pull a four-digit year (1800-2099) out of ``val``.

    Returns the first standalone year found in the text form of ``val`` as an
    int, or None when ``val`` is missing or contains no such year.
    """
    found = None if pd.isna(val) else re.search(r'\b(18|19|20)\d{2}\b', str(val))
    return int(found.group()) if found else None

current_year = datetime.today().year

bks['book_published_year'] = bks['book_publishedDate'].apply(extract_year)

bks['book_age'] = bks['book_published_year'].apply(
    lambda y: current_year - y if pd.notnull(y) else None
)

bks['book_age'].value_counts()

book_age
21.0     9
10.0     8
16.0     7
18.0     7
13.0     6
        ..
166.0    1
144.0    1
149.0    1
148.0    1
46.0     1
Name: count, Length: 113, dtype: int64

In [20]:
book_age_categories = {
    "New": (0, 1),                  
    "Recent": (2, 5),               
    "Modern": (6, 20),              
    "Vintage": (21, 50),            
    "Classic": (51, float('inf'))   
}

In [21]:
def categorize_book_age(age):
    """Map a book's age in years to a label from ``book_age_categories``.

    Returns the first category whose inclusive (min, max) range contains
    ``age``. None, and any value that falls in no range, gives 'Unknown';
    NaN compares False against every bound, so it ends up there as well.
    """
    if age is None:
        return 'Unknown'
    return next(
        (
            label
            for label, (lower, upper) in book_age_categories.items()
            if lower <= age <= upper
        ),
        'Unknown',
    )

bks['book_age_category'] = bks['book_age'].apply(categorize_book_age)
bks['book_age_category'].value_counts()

book_age_category
Classic    113
Modern      65
Vintage     56
Recent       4
Unknown      2
Name: count, dtype: int64

In [22]:
bks.book_age = bks.book_age.fillna(-1)
bks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   240 non-null    object 
 1   book_title           240 non-null    object 
 2   book_authors         240 non-null    object 
 3   book_publisher       240 non-null    object 
 4   book_publishedDate   240 non-null    object 
 5   book_categories      240 non-null    object 
 6   book_price           240 non-null    float64
 7   book_pages           240 non-null    int64  
 8   book_published_year  238 non-null    float64
 9   book_age             240 non-null    float64
 10  book_age_category    240 non-null    object 
dtypes: float64(3), int64(1), object(7)
memory usage: 20.8+ KB


In [23]:
bks = bks.drop(columns=['book_publishedDate', 'book_published_year'])
bks.sample(3)

Unnamed: 0,id,book_title,book_authors,book_publisher,book_categories,book_price,book_pages,book_age,book_age_category
109,gM80AQAAMAAJ,medicine in modern times,"[british medical association, william stokes]",Unknown,[historical card],149.99,479,156.0,Classic
233,ck-32n6REQgC,the john h. chafee coastal barrier resources s...,"[united states, united states. congress. house...",Unknown,[barrier islands],154.99,442,21.0,Vintage
120,IwsVEAAAQBAJ,financial inclusion,[rajiv prabhakar],Policy Press,[finance],113.0,353,4.0,Recent


In [24]:
bks.to_csv(f'{DATA_DIR}/books_processed.csv', index=False)

In [25]:
lbrs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      18 non-null     object
 1   library_name            18 non-null     object
 2   library_street_address  18 non-null     object
 3   library_city            14 non-null     object
 4   library_region          16 non-null     object
 5   library_postal_code     15 non-null     object
dtypes: object(6)
memory usage: 992.0+ bytes


In [26]:
lbrs.library_city.unique()

array(['Portland ', nan, 'portland', 'Portland', ' Portland', 'PORTLAND'],
      dtype=object)

In [27]:
lbrs.library_city = 'Portland'

In [28]:
lbrs.library_region.unique()

array(['OR ', 'or', 'OR', ' OR', nan], dtype=object)

In [29]:
lbrs.library_region = 'OR'

In [30]:
lbrs.library_postal_code.unique()

array(['97219', nan, '97203', '-97239', ' 97202', '-97206', '97205 ',
       '97204', '97214', '#97233', '97212-', '#97217', '97212 ', '97212_',
       '_97213'], dtype=object)

In [31]:
lbrs[lbrs.library_postal_code.isna()]

Unnamed: 0,id,library_name,library_street_address,library_city,library_region,library_postal_code
1,23v-222@5xc-jv7-v4v,Multnomah County Library Northwest,2300 NW Thurman St,Portland,OR,
9,zzw-222@5xc-knn-c5z,Multnomah County Library Holgate,7905 SE Holgate Blvd,Portland,OR,
12,225-222@5xc-jtz-hkf,MULTNOMAH County Library,216 ne Knott st,Portland,OR,


In [32]:
# These values can be filled in with a simple internet search on the branch's
# street address. Rows are selected by library id rather than by positional
# index label, so the fix still hits the right branch if the CSV is re-ordered
# or filtered.
missing_postal_codes = {
    '23v-222@5xc-jv7-v4v': '97210',  # 2300 NW Thurman St
    'zzw-222@5xc-knn-c5z': '97206',  # 7905 SE Holgate Blvd
    '225-222@5xc-jtz-hkf': '97212',  # 216 NE Knott St
}
for library_id, postal_code in missing_postal_codes.items():
    lbrs.loc[lbrs['id'] == library_id, 'library_postal_code'] = postal_code

In [33]:
def clean_digit_string_column(df, column_name, new_column_name=None, add_error_column=True):
    """Reduce a column of noisy digit strings (e.g. postal codes) to digits only.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place and also returned.
    column_name : str
        Column holding the raw values.
    new_column_name : str, optional
        Name of the output column. Defaults to ``f"{column_name}_clean"``.
    add_error_column : bool
        When True, also add ``f"{column_name}_error"``, which is True for
        rows where no digits could be extracted.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new column(s). Cleaned values are strings, so leading
        zeros are kept; rows with no digits are None.
    """

    if new_column_name is None:
        new_column_name = f"{column_name}_clean"

    def clean_value(val):
        if pd.isna(val):
            return None
        # Codes that passed through a float arrive as e.g. '97211.0'. Drop the
        # trailing '.0' first; otherwise stripping non-digits would glue the
        # zero onto the code and produce '972110'.
        text = re.sub(r'(?<=\d)\.0+(?!\d)', '', str(val))
        digits_only = re.sub(r'\D', '', text)
        return digits_only if digits_only else None

    df[new_column_name] = df[column_name].apply(clean_value)

    if add_error_column:
        error_col = f"{column_name}_error"
        df[error_col] = df[new_column_name].isnull()

    return df

In [34]:
lbrs = clean_digit_string_column(lbrs, 'library_postal_code', 'library_postal_code_clean', True)
lbrs.library_postal_code_error.value_counts()

library_postal_code_error
False    18
Name: count, dtype: int64

In [35]:
lbrs = lbrs.drop(columns=['library_postal_code', 'library_postal_code_error'])
lbrs = lbrs.rename(columns={'library_postal_code_clean':'library_postal_code'})
lbrs['library_postal_code'].isna().sum()

0

In [36]:
lbrs.to_csv(f'{DATA_DIR}/libraries_processed.csv', index=False)

In [37]:
cmrs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       2000 non-null   object
 1   customer_name            2000 non-null   object
 2   customer_street_address  2000 non-null   object
 3   customer_city            1906 non-null   object
 4   customer_state           1898 non-null   object
 5   customer_zipcode         1999 non-null   object
 6   customer_birth_date      1893 non-null   object
 7   customer_gender          1899 non-null   object
 8   customer_education       1896 non-null   object
 9   customer_occupation      1893 non-null   object
dtypes: object(10)
memory usage: 156.4+ KB


In [38]:
cmrs.customer_city.unique()

array(['Portland ', ' Portland', 'Portland', nan, 'Happy Valley',
       'Lake Oswego', 'Vancouver', 'Beaverton', 'PORTLAND', 'Gresham',
       'West Linn', 'Vancouver ', 'Tualatin', 'Oregon City', 'Damascus',
       'portland', 'Clackamas', ' Vancouver', 'Hillsboro', 'Lake  Oswego',
       'Lake OSWEGO', 'Happy  Valley', 'Lake   Oswego', 'VANCOUVER',
       'vancouver', 'Gladstone', 'BEAVERTON', 'HAPPY Valley', 'CLACKAMAS',
       'LAKE OSWEGO', 'Happy  Valley ', ' Beaverton', 'Happy Valley '],
      dtype=object)

In [39]:
cmrs.customer_city.isna().sum()

94

In [40]:
def clean_text_column(df, column_name, new_column_name=None, add_error_column=True):
    """Tidy a free-text categorical column into Title Case labels.

    Whitespace runs are collapsed to one space, the ends are trimmed and the
    result is title-cased. Values that are missing, blank, or contain no
    ASCII letter become 'Unknown'.

    ``df`` is modified in place and returned. The cleaned values go to
    ``new_column_name`` (default ``f"{column_name}_clean"``); when
    ``add_error_column`` is True, ``f"{column_name}_error"`` flags the rows
    that ended up as 'Unknown'.
    """
    target = f"{column_name}_clean" if new_column_name is None else new_column_name

    def clean_value(val):
        if pd.isna(val):
            return 'Unknown'
        text = str(val)
        usable = bool(text.strip()) and re.search(r'[a-zA-Z]', text) is not None
        if not usable:
            return 'Unknown'
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip().title()

    df[target] = df[column_name].apply(clean_value)

    if add_error_column:
        df[f"{column_name}_error"] = df[target] == 'Unknown'

    return df

In [41]:
cmrs = clean_text_column(cmrs, 'customer_city', 'customer_city_clean', True)
cmrs['customer_city_clean'].unique()

array(['Portland', 'Unknown', 'Happy Valley', 'Lake Oswego', 'Vancouver',
       'Beaverton', 'Gresham', 'West Linn', 'Tualatin', 'Oregon City',
       'Damascus', 'Clackamas', 'Hillsboro', 'Gladstone'], dtype=object)

In [42]:
cmrs = cmrs.drop(columns=['customer_city', 'customer_city_error'])
cmrs = cmrs.rename(columns={'customer_city_clean':'customer_city'})
cmrs['customer_city'].isna().sum()

0

In [43]:
cmrs.customer_state.unique()

array(['Oregon', 'OREGON', 'Washington', nan, 'Oregon ', ' Oregon',
       ' Washington', 'oregon', 'Washington ', 'washington'], dtype=object)

In [44]:
cmrs = clean_text_column(cmrs, 'customer_state', 'customer_state_clean', True)
cmrs['customer_state_clean'].unique()

array(['Oregon', 'Washington', 'Unknown'], dtype=object)

In [45]:
cmrs = cmrs.drop(columns=['customer_state', 'customer_state_error'])
cmrs = cmrs.rename(columns={'customer_state_clean':'customer_state'})
cmrs['customer_state'].isna().sum()

0

In [46]:
cmrs.customer_zipcode.unique()[20:50]

array(['97211.0', '97219.0', '97035.0 ', '97203.0', '97216.0', '97045.0',
       '#97034.0', '97202.0', '97267.0', '_97211.0', '97202.0-',
       '97201.0', '97212.0', '97266.0', '97080.0', '97225.0', '97214.0_',
       ' 97202.0', '97080.0#', '97227.0#', '#97211.0', '97008.0',
       ' 97239.0', '97220.0 ', '97035.0', '97221.0', ' 97206.0',
       '97005.0', '97239.0', '97236.0'], dtype=object)

In [47]:
cmrs = clean_digit_string_column(cmrs, 'customer_zipcode', 'customer_zipcode_clean', True)
cmrs.customer_zipcode_error.value_counts()

customer_zipcode_error
False    1999
True        1
Name: count, dtype: int64

In [48]:
cmrs[cmrs.customer_zipcode_clean.isna()]

Unnamed: 0,id,customer_name,customer_street_address,customer_zipcode,customer_birth_date,customer_gender,customer_education,customer_occupation,customer_city,customer_state,customer_zipcode_clean,customer_zipcode_error
1728,21d1bb3a22811b8d490076e81288dfdb,Kendrick Luciano,822 SE Main St,,2109-06-22,male,Others,Admin & Support,Portland,Oregon,,True


In [49]:
# This value can also be filled in from the street address with an internet
# search. The row is selected by customer id rather than by positional index
# label, so the fix still hits the right customer if the CSV is re-ordered.

cmrs.loc[cmrs['id'] == '21d1bb3a22811b8d490076e81288dfdb', 'customer_zipcode_clean'] = '97214'
cmrs[cmrs.customer_zipcode_clean.isna()]

Unnamed: 0,id,customer_name,customer_street_address,customer_zipcode,customer_birth_date,customer_gender,customer_education,customer_occupation,customer_city,customer_state,customer_zipcode_clean,customer_zipcode_error


In [50]:
cmrs = cmrs.drop(columns=['customer_zipcode', 'customer_zipcode_error'])
cmrs = cmrs.rename(columns={'customer_zipcode_clean':'customer_zipcode'})
cmrs['customer_zipcode'].isna().sum()

0

In [51]:
cmrs.customer_occupation.unique()[:10]

array([nan, 'Blue Collar', 'Education & Health', 'SALES', 'Tech', 'Sales',
       'Others', 'Business & Finance', ' Business &  Finance ',
       'Education  & Health'], dtype=object)

In [52]:
cmrs = clean_text_column(cmrs, 'customer_occupation', 'customer_occupation_clean', True)
cmrs['customer_occupation_clean'].unique()

array(['Unknown', 'Blue Collar', 'Education & Health', 'Sales', 'Tech',
       'Others', 'Business & Finance', 'Admin & Support'], dtype=object)

In [53]:
cmrs = cmrs.drop(columns=['customer_occupation', 'customer_occupation_error'])
cmrs = cmrs.rename(columns={'customer_occupation_clean':'customer_occupation'})
cmrs['customer_occupation'].isna().sum()

0

In [54]:
cmrs = clean_text_column(cmrs, 'customer_education', 'customer_education_clean', True)
cmrs['customer_education_clean'].unique()

array(['High School', 'College', 'Graduate Degree', 'Others', 'Unknown'],
      dtype=object)

In [55]:
cmrs = cmrs.drop(columns=['customer_education', 'customer_education_error'])
cmrs = cmrs.rename(columns={'customer_education_clean':'customer_education'})
cmrs['customer_education'].isna().sum()

0

In [56]:
cmrs = clean_text_column(cmrs, 'customer_gender', 'customer_gender_clean', True)
cmrs['customer_gender_clean'].unique()

array(['Female', 'Male', 'Unknown'], dtype=object)

In [57]:
cmrs = cmrs.drop(columns=['customer_gender', 'customer_gender_error'])
cmrs = cmrs.rename(columns={'customer_gender_clean':'customer_gender'})
cmrs['customer_gender'].isna().sum()

0

In [58]:
cmrs.customer_birth_date = cmrs.customer_birth_date.fillna('YYYY-MM-DD')
cmrs.customer_birth_date.value_counts()

customer_birth_date
YYYY-MM-DD    107
1956-12-15      3
1972-06-02      3
1956-06-23      3
1975-08-27      2
             ... 
2116-08-05      1
1974-03-21      1
1983-05-19      1
1996-03-16      1
2001-02-19      1
Name: count, Length: 1831, dtype: int64

In [59]:
def clean_customer_birth_date(date_val):
    """Normalise a raw birth-date value to a 'YYYY-MM-DD' string.

    Parameters
    ----------
    date_val : Any
        Raw cell value; may be missing or carry stray characters around the
        date.

    Returns
    -------
    str or None
        The date formatted as 'YYYY-MM-DD', or None when the value is missing
        or cannot be parsed as a date.
    """
    if pd.isna(date_val):
        return None

    # Strip junk from both ends. '_' counts as a word character, so it has to
    # be listed explicitly or values like '_1956-07-19' are never parsed.
    date_str = re.sub(r'^[\W_]+|[\W_]+$', '', str(date_val))
    if not date_str:
        return None

    def _try_parse(text, **kwargs):
        # Return 'YYYY-MM-DD' for a parseable date, otherwise None.
        try:
            parsed = pd.to_datetime(text, errors='coerce', **kwargs)
        except (ValueError, TypeError, OverflowError):
            return None
        return parsed.strftime('%Y-%m-%d') if pd.notnull(parsed) else None

    # Year-first numeric dates (any single separator, or compact YYYYMMDD) are
    # unambiguous, so parse them with an explicit format. This avoids the
    # day-first heuristic and the warning pandas emits when dayfirst=True is
    # combined with an ISO date.
    year_first = (
        re.fullmatch(r'(\d{4})[\W_](\d{1,2})[\W_](\d{1,2})', date_str)
        or re.fullmatch(r'(\d{4})(\d{2})(\d{2})', date_str)
    )
    if year_first:
        year, month, day = year_first.groups()
        iso_text = f"{year}-{int(month):02d}-{int(day):02d}"
        result = _try_parse(iso_text, format='%Y-%m-%d')
        if result is not None:
            return result

    # Anything else: parse day-first, first as given, then with every
    # separator normalised to '-'.
    for candidate in (date_str, re.sub(r'[\W_]', '-', date_str)):
        result = _try_parse(candidate, dayfirst=True)
        if result is not None:
            return result

    return None

In [60]:
clean_customer_birth_date('1956-07-19'), clean_customer_birth_date('1956-07-19 ')

  parsed_date = pd.to_datetime(date_str, errors='coerce', dayfirst=True)


('1956-07-19', '1956-07-19')

In [61]:
cmrs['customer_birth_date_cleaned_str'] = cmrs['customer_birth_date'].apply(clean_customer_birth_date)

cmrs['customer_birth_date_parsed'] = pd.to_datetime(
    cmrs['customer_birth_date_cleaned_str'],
    errors='coerce'
)

cmrs['customer_birth_date_clean'] = cmrs['customer_birth_date_parsed'].dt.strftime('%Y-%m-%d')
cmrs['customer_birth_date_cleaned_str'][1991], cmrs['customer_birth_date_parsed'][1991]

  parsed_date = pd.to_datetime(date_str, errors='coerce', dayfirst=True)


('1956-07-19', Timestamp('1956-07-19 00:00:00'))

In [62]:
# Using standard categorization of age

age_categories = {
    "Children": (6, 12),
    "Adolescents/Teenagers": (13, 17),
    "Young Adults": (18, 24),
    "Young Professionals": (25, 34),
    "Adults": (35, 44),
    "Middle-aged Adults": (45, 54),
    "Older Adults": (55, 64),
    "Seniors": (65, float('inf'))
}

In [63]:
today = pd.to_datetime(datetime.today().date())

# Age in completed years: the year difference, minus one if this year's
# birthday has not happened yet. Dividing elapsed days by 365.25 can come out
# one year short on the birthday itself.
cmrs['customer_age'] = cmrs['customer_birth_date_parsed'].apply(
    lambda dob: (
        today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
        if pd.notnull(dob) else None
    )
)
def categorize_age(age):
    """Map a customer's age in years to a label from ``age_categories``.

    Returns the first category whose inclusive (min, max) range contains
    ``age``. None, and any value that falls in no range, gives 'Unknown';
    NaN compares False against every bound, so it ends up there as well.
    """
    if age is None:
        return 'Unknown'
    return next(
        (
            label
            for label, (lower, upper) in age_categories.items()
            if lower <= age <= upper
        ),
        'Unknown',
    )

cmrs['customer_age_category'] = cmrs['customer_age'].apply(categorize_age)
cmrs['customer_age_category'].value_counts()

customer_age_category
Seniors                  370
Older Adults             300
Middle-aged Adults       297
Young Professionals      279
Adults                   277
Unknown                  201
Young Adults             193
Adolescents/Teenagers     83
Name: count, dtype: int64

In [64]:
cmrs = cmrs.drop(columns=['customer_birth_date',
       'customer_birth_date_cleaned_str', 'customer_birth_date_parsed'])
cmrs.columns

Index(['id', 'customer_name', 'customer_street_address', 'customer_city',
       'customer_state', 'customer_zipcode', 'customer_occupation',
       'customer_education', 'customer_gender', 'customer_birth_date_clean',
       'customer_age', 'customer_age_category'],
      dtype='object')

In [65]:
cmrs = cmrs.rename(columns={'customer_birth_date_clean': 'customer_birth_date'})
cmrs.customer_birth_date = cmrs.customer_birth_date.fillna('YYYY-MM-DD')
cmrs.customer_age = cmrs.customer_age.fillna(-1)
cmrs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       2000 non-null   object 
 1   customer_name            2000 non-null   object 
 2   customer_street_address  2000 non-null   object 
 3   customer_city            2000 non-null   object 
 4   customer_state           2000 non-null   object 
 5   customer_zipcode         2000 non-null   object 
 6   customer_occupation      2000 non-null   object 
 7   customer_education       2000 non-null   object 
 8   customer_gender          2000 non-null   object 
 9   customer_birth_date      2000 non-null   object 
 10  customer_age             2000 non-null   float64
 11  customer_age_category    2000 non-null   object 
dtypes: float64(1), object(11)
memory usage: 187.6+ KB


In [66]:
cmrs.to_csv(f'{DATA_DIR}/customers_processed.csv', index=False)