In [138]:
# import the required libraries
import numpy as np
import pandas as pd
import re
from pandas import read_csv
# Locations of the two fiscal-year exports, relative to the working directory
pcard2014_name = 'Analytics_mindset_case_studies_PCard_FY2014.csv'
pcard2015_name = 'Analytics_mindset_case_studies_PCard_FY2015.csv'

# Column labels to assign, in file order.
# read_csv's `names` parameter takes a sequence of labels, so a plain list is
# used here rather than a mapping.
names = [
    'AgencyNumber',
    'AgencyName',
    'CardholderLastName',
    'CardholderFirstInitial',
    'Description',
    'Amount',
    'Vendor',
    'TransactionDate',
    'PostedDate',
    'MCC',
]

# Read both files into DataFrames.
# header=0 tells pandas the first row is a header to be replaced by `names`.
# NOTE(review): the saved cell output shows a warning pointing at the FY2015
# read; confirm what it reports (e.g. mixed dtypes) before relying on the
# inferred column types.
pcard2014 = read_csv(pcard2014_name, header=0, names=names, encoding='UTF-8')
pcard2015 = read_csv(pcard2015_name, header=0, names=names, encoding='UTF-8')

  pcard2015 = read_csv(pcard2015_name, header=0, names=names, encoding='UTF-8')


In [139]:
# Stack the FY2014 and FY2015 frames into one.
# ignore_index=True gives the result a fresh 0..n-1 index; otherwise each
# file's own row labels are carried over and repeat in the combined frame.
pcard = pd.concat([pcard2014, pcard2015], ignore_index=True)

In [140]:
# Renumber the rows 0..n-1 so that index labels are unique.
# drop=True discards the old index instead of inserting it as a column.
# The result is assigned back (no inplace=True) so the cell reads as a plain
# "new value from old value" step.
pcard = pcard.reset_index(drop=True)

In [141]:
# (rows, columns) of the combined frame
pcard.shape

(880037, 10)

In [142]:
# Column dtypes and non-null counts before any cleaning is applied
pcard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880037 entries, 0 to 880036
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   AgencyNumber            880037 non-null  int64 
 1   AgencyName              880037 non-null  object
 2   CardholderLastName      880037 non-null  object
 3   CardholderFirstInitial  880037 non-null  object
 4   Description             880037 non-null  object
 5   Amount                  880037 non-null  object
 6   Vendor                  880037 non-null  object
 7   TransactionDate         657156 non-null  object
 8   PostedDate              657156 non-null  object
 9   MCC                     880037 non-null  object
dtypes: int64(1), object(9)
memory usage: 67.1+ MB


## Task 3: Data Preprocessing (Extract, Transform, and Load)

### Information Schema

| Column Name             | Data Type   | Description                       |
|--------------------------|-------------|-----------------------------------|
| **AgencyNumber**        | `int`       | Unique identifier for the agency |
| **AgencyName**          | `str`       | Name of the agency               |
| **CardholderLastName**  | `str`       | Last name of the cardholder      |
| **CardholderFirstInitial** | `str`    | First initial of the cardholder  |
| **Description**         | `str`       | Description of the transaction   |
| **Amount**              | `float`     | Transaction amount               |
| **Vendor**              | `str`       | Vendor associated with the transaction |
| **TransactionDate**     | `datetime`  | Date the transaction occurred    |
| **PostedDate**          | `datetime`  | Date the transaction was posted  |
| **MCC**                 | `str`       | Merchant Category Code           |


### Transforming TransactionDate and PostedDate

In [143]:
# Parse both date columns from 'month/day/year hour:minute' text into datetime values
date_format = '%m/%d/%Y %H:%M'
for date_col in ('TransactionDate', 'PostedDate'):
    pcard[date_col] = pd.to_datetime(pcard[date_col], format=date_format)

### Transforming Amount

In [144]:
# Cast Amount to str so the string-cleaning steps (.str methods) can be applied to every value
pcard['Amount'] = pcard['Amount'].astype(str)

In [145]:
# Trim leading and trailing whitespace from each Amount string
pcard['Amount'] = pcard['Amount'].str.strip()

In [146]:
# Remove the dollar sign from Amount.
# regex=False forces a literal match: as a regular expression '$' is the
# end-of-string anchor, and the default for `regex` differs between pandas versions.
pcard['Amount'] = pcard['Amount'].str.replace('$', '', regex=False)

In [147]:
# Remove every comma (e.g. thousands separators) from Amount.
# regex=False keeps this a literal match, consistent with the other clean-up steps.
pcard['Amount'] = pcard['Amount'].str.replace(',', '', regex=False)

In [148]:
# Turn parenthesised amounts into negative numbers:
# '(' becomes a leading '-' and ')' is removed, so '(12.50)' -> '-12.50'.
# regex=False is required for a literal match because '(' and ')' are regex
# metacharacters.
pcard['Amount'] = pcard['Amount'].str.replace('(', '-', regex=False)
pcard['Amount'] = pcard['Amount'].str.replace(')', '', regex=False)

In [149]:
# Convert the cleaned Amount strings to float.
# astype raises if any value is still not valid numeric text.
pcard['Amount'] = pcard['Amount'].astype(float)

### Mapping AgencyNumber to correct AgencyName

In [150]:
# Count the distinct AgencyName values recorded under each AgencyNumber,
# then display only the numbers that carry more than one name
names_by_number = pcard.groupby('AgencyNumber')['AgencyName']
mult_agency = names_by_number.nunique()
mult_agency.loc[mult_agency.gt(1)]

AgencyNumber
4000     3
19000    2
21500    2
26500    2
30900    2
34000    2
34200    2
50900    2
58800    2
60600    2
Name: AgencyName, dtype: int64

In [151]:
# Canonical AgencyName for every AgencyNumber that appears under more than one name
agency_mapping_table = {
    4000: 'DEPARTMENT OF AGRICULTURE FOOD & FORESTRY',
    19000: 'OKLAHOMA STATE BOARD OF COSMETOLOGY AND BARBERING',
    21500: 'OKLAHOMA STATE BOARD OF DENTISTRY',
    26500: 'DEPARTMENT OF EDUCATION',
    30900: 'DEPARTMENT OF EMERGENCY MANAGEMENT',
    34000: 'OKLAHOMA STATE DEPARTMENT OF HEALTH',
    34200: 'OFFICE OF THE CHIEF MEDICAL EXAMINER',
    50900: 'BOARD OF EXAM. FOR LT CARE ADMIN',
    58800: 'OKLAHOMA REAL ESTATE COMMISSION',
    60600: 'UNIVERSITY CENTER OF SOUTHERN OKLAHOMA'
}

# Overwrite AgencyName with the canonical name wherever the AgencyNumber is in the table.
# .map() looks each AgencyNumber up in agency_mapping_table and yields NaN for
# numbers that are not listed.
# .fillna() then restores the original AgencyName for those unlisted numbers.
pcard['AgencyName'] = pcard['AgencyNumber'].map(agency_mapping_table).fillna(pcard['AgencyName'])

In [152]:
# Re-run the distinct-name count; an empty result means every AgencyNumber
# now has exactly one AgencyName
names_by_number = pcard.groupby('AgencyNumber')['AgencyName']
mult_agency = names_by_number.nunique()
mult_agency.loc[mult_agency.gt(1)]

Series([], Name: AgencyName, dtype: int64)

### Fix formatting for MCC, Description, and Vendor

In [153]:
# Function to add spaces after punctuation and strip extra spaces
def fix_punctuation(name):
    """Normalise the spacing around punctuation in a text value.

    Steps, in order:
      1. Insert one space after any of , : ; ! ? that is directly followed by
         a non-space character.
      2. Remove whitespace that sits immediately before any of . , : ; ! ?
      3. Collapse every run of whitespace to a single space and trim both ends.

    Note that step 1 also applies inside numbers and times, so '1,000'
    becomes '1, 000'.

    Parameters
    ----------
    name : str or any
        Value to clean. Non-string values (for example NaN or None from a
        missing cell) are returned unchanged instead of raising TypeError.

    Returns
    -------
    str or any
        The cleaned string, or the input itself when it is not a string.
    """
    # re.sub only accepts text; let missing/non-text cells pass through untouched
    if not isinstance(name, str):
        return name

    # Add a single space after punctuation if there is none
    name = re.sub(r'([,:;!?])(?=\S)', r'\1 ', name)

    # Remove whitespace before punctuation
    name = re.sub(r'\s+([.,:;!?])', r'\1', name)

    # Collapse repeated whitespace and strip leading/trailing spaces
    return re.sub(r'\s+', ' ', name).strip()

In [154]:
# Run the punctuation/spacing clean-up over each free-text column in turn
for text_col in ('Vendor', 'MCC', 'Description'):
    pcard[text_col] = pcard[text_col].apply(fix_punctuation)

In [155]:
# Re-inspect dtypes and non-null counts after the cleaning steps
pcard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880037 entries, 0 to 880036
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   AgencyNumber            880037 non-null  int64         
 1   AgencyName              880037 non-null  object        
 2   CardholderLastName      880037 non-null  object        
 3   CardholderFirstInitial  880037 non-null  object        
 4   Description             880037 non-null  object        
 5   Amount                  880037 non-null  float64       
 6   Vendor                  880037 non-null  object        
 7   TransactionDate         657156 non-null  datetime64[ns]
 8   PostedDate              657156 non-null  datetime64[ns]
 9   MCC                     880037 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(6)
memory usage: 67.1+ MB


### Filtering Data

**Necessary Filters:**
1. Remove NA Values in TransactionDate/PostedDate
2. Filter Agency for only OSU transactions
3. Filter TransactionDate for 2014 calendar year transactions


In [156]:
# Work on an independent copy so the filtering steps leave `pcard` itself unchanged
pcard_filtered = pcard.copy()

In [157]:
# (rows, columns) of the copy before any filter is applied
pcard_filtered.shape

(880037, 10)

In [158]:
# Dtypes and non-null counts of the copy before any filter is applied
pcard_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880037 entries, 0 to 880036
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   AgencyNumber            880037 non-null  int64         
 1   AgencyName              880037 non-null  object        
 2   CardholderLastName      880037 non-null  object        
 3   CardholderFirstInitial  880037 non-null  object        
 4   Description             880037 non-null  object        
 5   Amount                  880037 non-null  float64       
 6   Vendor                  880037 non-null  object        
 7   TransactionDate         657156 non-null  datetime64[ns]
 8   PostedDate              657156 non-null  datetime64[ns]
 9   MCC                     880037 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(6)
memory usage: 67.1+ MB


In [159]:
# Step 1: keep only rows in which both date columns are populated
required_dates = ['TransactionDate', 'PostedDate']
pcard_filtered = pcard_filtered.dropna(subset=required_dates)

In [160]:
# Step 2: keep only rows whose AgencyName is exactly 'OKLAHOMA STATE UNIVERSITY'
is_osu = pcard_filtered['AgencyName'].eq('OKLAHOMA STATE UNIVERSITY')
pcard_filtered = pcard_filtered[is_osu]

In [162]:
# Step 3: keep only transactions whose TransactionDate falls in calendar year 2014
in_2014 = pcard_filtered['TransactionDate'].dt.year.eq(2014)
pcard_filtered = pcard_filtered[in_2014]

In [167]:
# Row and column counts remaining after the filter steps
pcard_filtered.shape

(116031, 10)

In [168]:
# Total of Amount over the filtered rows (negative amounts reduce the total)
pcard_filtered['Amount'].sum()

np.float64(33504148.340000004)

In [169]:
# Save the filtered data to a new CSV file in the working directory.
# index=False leaves the DataFrame's row index out of the file.
pcard_filtered.to_csv('pcard_filtered.csv', index=False)

### Final Output

- **Number of Rows**: `116031`
- **Total $ Amount**: `$33504148.34`
