In [26]:
# import the required libraries
import numpy as np # numpy for numerical operations
import pandas as pd # pandas for data manipulation
import re # re for string manipulations
from pandas import read_csv # read_csv for reading csv files

# set the path to the data files (resolved relative to the notebook's working directory)
pcard2014_name = 'Analytics_mindset_case_studies_PCard_FY2014.csv'
pcard2015_name = 'Analytics_mindset_case_studies_PCard_FY2015.csv'

# set desired column names; they replace the header row of each CSV (see header=0 below)
names = [
    'AgencyNumber',
    'AgencyName',
    'CardholderLastName',
    'CardholderFirstInitial',
    'Description',
    'Amount',
    'Vendor',
    'TransactionDate',
    'PostedDate',
    'MCC',
]

# read the data files into pandas dataframes
# header=0 consumes each file's own header line and names= substitutes the labels above.
# low_memory=False makes pandas infer every column's dtype from the whole file instead of
# chunk by chunk, so a column cannot end up holding a mix of parsed numbers and raw strings.
# NOTE(review): the echoed read_csv line in this cell's output looks like a DtypeWarning
# from chunked inference - confirm it disappears after re-running.
pcard2014 = read_csv(pcard2014_name, header=0, names=names, encoding='UTF-8', low_memory=False)
pcard2015 = read_csv(pcard2015_name, header=0, names=names, encoding='UTF-8', low_memory=False)

  pcard2015 = read_csv(pcard2015_name, header=0, names=names, encoding='UTF-8')


In [27]:
# Stack the FY2014 rows on top of the FY2015 rows in one dataframe
df_pcard = pd.concat(objs=[pcard2014, pcard2015], axis=0)

In [28]:
# Give the combined frame a fresh 0..n-1 index so index values are not duplicated.
# drop=True discards the old index instead of keeping it as a column; the result is
# assigned back to the same name rather than mutating the frame in place.
df_pcard = df_pcard.reset_index(drop=True)

## Task 3: Data Preprocessing (Extract, Transform, and Load)

### Information Schema

| Column Name             | Data Type   | Description                       |
|--------------------------|-------------|-----------------------------------|
| **AgencyNumber**        | `int`       | Unique identifier for the agency |
| **AgencyName**          | `str`       | Name of the agency               |
| **CardholderLastName**  | `str`       | Last name of the cardholder      |
| **CardholderFirstInitial** | `str`    | First initial of the cardholder  |
| **Description**         | `str`       | Description of the transaction   |
| **Amount**              | `float`     | Transaction amount               |
| **Vendor**              | `str`       | Vendor associated with the transaction |
| **TransactionDate**     | `datetime`  | Date the transaction occurred    |
| **PostedDate**          | `datetime`  | Date the transaction was posted  |
| **MCC**                 | `str`       | Merchant Category Code           |


In [29]:
# Inspect column dtypes and non-null counts of the combined frame before any cleaning
df_pcard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880037 entries, 0 to 880036
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   AgencyNumber            880037 non-null  int64 
 1   AgencyName              880037 non-null  object
 2   CardholderLastName      880037 non-null  object
 3   CardholderFirstInitial  880037 non-null  object
 4   Description             880037 non-null  object
 5   Amount                  880037 non-null  object
 6   Vendor                  880037 non-null  object
 7   TransactionDate         657156 non-null  object
 8   PostedDate              657156 non-null  object
 9   MCC                     880037 non-null  object
dtypes: int64(1), object(9)
memory usage: 67.1+ MB


### Transforming TransactionDate and PostedDate

In [None]:
# Parse the TransactionDate strings (month/day/year hour:minute) into datetime values
df_pcard['TransactionDate'] = pd.to_datetime(
    arg=df_pcard['TransactionDate'],
    format='%m/%d/%Y %H:%M',
)

In [50]:
# Parse the PostedDate strings (month/day/year hour:minute) into datetime values
df_pcard['PostedDate'] = pd.to_datetime(
    arg=df_pcard['PostedDate'],
    format='%m/%d/%Y %H:%M',
)

### Transforming Amount

In [32]:
# Cast every Amount value to text so the string clean-up steps below can use the .str accessor
df_pcard['Amount'] = df_pcard['Amount'].astype('str')

In [33]:
# Trim leading and trailing whitespace from each Amount string
df_pcard['Amount'] = df_pcard['Amount'].str.strip(to_strip=None)

In [34]:
# Removing $ from Amount
# '$' is the end-of-string anchor in a regular expression, so request a literal match
# explicitly instead of relying on the pandas default for `regex`.
df_pcard['Amount'] = df_pcard['Amount'].str.replace('$', '', regex=False)

In [35]:
# Drop the thousands separators from Amount
df_pcard['Amount'] = df_pcard['Amount'].str.replace(pat=',', repl='')

In [36]:
# Changing () to - in Amount: "(12.50)" becomes "-12.50"
# '(' and ')' are regex metacharacters, so request literal matching explicitly
# instead of relying on the pandas default for `regex`.
df_pcard['Amount'] = df_pcard['Amount'].str.replace('(', '-', regex=False)
df_pcard['Amount'] = df_pcard['Amount'].str.replace(')', '', regex=False)

In [37]:
# Convert the cleaned Amount strings into floating-point numbers
df_pcard['Amount'] = df_pcard['Amount'].astype(np.float64)

In [38]:
# Display the Amount column to confirm it is now numeric
df_pcard['Amount']

0          890.00
1          368.96
2          165.82
3           96.39
4          125.96
           ...   
880032     377.35
880033     259.16
880034     317.71
880035    7992.96
880036       9.92
Name: Amount, Length: 880037, dtype: float64

In [39]:
# Re-inspect dtypes and non-null counts after the date and Amount conversions
df_pcard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880037 entries, 0 to 880036
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   AgencyNumber            880037 non-null  int64         
 1   AgencyName              880037 non-null  object        
 2   CardholderLastName      880037 non-null  object        
 3   CardholderFirstInitial  880037 non-null  object        
 4   Description             880037 non-null  object        
 5   Amount                  880037 non-null  float64       
 6   Vendor                  880037 non-null  object        
 7   TransactionDate         657156 non-null  datetime64[ns]
 8   PostedDate              657156 non-null  datetime64[ns]
 9   MCC                     880037 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(6)
memory usage: 67.1+ MB


### Mapping AgencyNumber to correct AgencyName

In [40]:
# Show full cell contents so long lists of agency names are not cut off
pd.set_option('display.max_colwidth', None)

# For every AgencyNumber, collect the distinct AgencyName spellings it appears with
agency_mapping = df_pcard.groupby('AgencyNumber')['AgencyName'].unique()

# Keep only the numbers that carry more than one spelling
multiple_agency_names = agency_mapping.loc[agency_mapping.map(len) > 1]

# Flatten to a two-column table (AgencyNumber, AgencyName) for display
multiple_agency_names_df = multiple_agency_names.to_frame().reset_index()
multiple_agency_names_df


Unnamed: 0,AgencyNumber,AgencyName
0,4000,"[DEPARTMENT OF AGRICULTURE, DEPARTMENT OF AGRICULTURE, FOOD, ANF FOR, DEPT OF AGRICULTURE FOOD & FORESTRY]"
1,19000,"[COSMETOLOGY BOARD, ST BD OF COSMETOLOGY AND BARBERING]"
2,21500,"[BOARD OF DENTISTRY, OKLAHOMA ST.BOARD OF DENTISTRY]"
3,26500,"[`DEPARTMENT OF EDUCATION, DEPARTMENT OF EDUCATION]"
4,30900,"[DEPARTMENT OF EMERGENCY MANAGEMEN, DEPARTMENT OF EMERGENCY MANAGEMENT]"
5,34000,"[STATE DEPARTMENT OF HEALTH, OKLAHOMA STATE DEPARTMENT OF HEALTH]"
6,34200,"[BD. OF MEDICOLEGAL INVESTIGATIONS, OFFICE OF THE CHIEF MEDICAL EXAMINER]"
7,50900,"[BD. OF EXAM. FOR LT CARE ADMIN, BD. OF EXAM. FOR LT CARE ADMIN.]"
8,58800,"[OKLA. REAL ESTATE COMM., OKLAHOMA REAL ESTATE COMM.]"
9,60600,"[ARDMORE HIGHER EDUCATION CENTER, UNIVERSITY CENTER OF SOUTHERN OKLAHOMA]"


In [41]:
# Creating mapping table for duplicate AgencyNames:
# one canonical name for every AgencyNumber that appears under several spellings
agency_mapping_table = {
    4000: 'DEPARTMENT OF AGRICULTURE FOOD & FORESTRY',
    19000: 'OKLAHOMA STATE BOARD OF COSMETOLOGY AND BARBERING',
    21500: 'OKLAHOMA STATE BOARD OF DENTISTRY',  # was misspelled 'OLKAHOMA'
    26500: 'DEPARTMENT OF EDUCATION',
    30900: 'DEPARTMENT OF EMERGENCY MANAGEMENT',
    34000: 'OKLAHOMA STATE DEPARTMENT OF HEALTH',
    34200: 'OFFICE OF THE CHIEF MEDICAL EXAMINER',
    50900: 'BOARD OF EXAM. FOR LT CARE ADMIN',
    58800: 'OKLAHOMA REAL ESTATE COMMISSION',
    60600: 'UNIVERSITY CENTER OF SOUTHERN OKLAHOMA'
}

# Replacing duplicate AgencyNames with correct AgencyName
# .map() yields NaN for agency numbers missing from the table; fillna() then restores
# the original AgencyName for those rows, so only the listed agencies change.
df_pcard['AgencyName'] = df_pcard['AgencyNumber'].map(agency_mapping_table).fillna(df_pcard['AgencyName'])



In [42]:
# Show full cell contents so long lists of agency names are not cut off
pd.set_option('display.max_colwidth', None)

# Re-run the check after applying the mapping table:
# gather the distinct AgencyName values seen for each AgencyNumber
agency_mapping = df_pcard.groupby('AgencyNumber')['AgencyName'].unique()

# Any AgencyNumber still listed here has more than one AgencyName
multiple_agency_names = agency_mapping.loc[agency_mapping.map(len) > 1]
multiple_agency_names_df = multiple_agency_names.to_frame().reset_index()
multiple_agency_names_df


Unnamed: 0,AgencyNumber,AgencyName


### Standardizing naming convention of AgencyName

In [43]:
# Function to find and replace common patterns in AgencyName
def standardize_agency_name(agency_name):
    """Return an upper-case agency name with common abbreviations expanded.

    Steps, in order:
      1. Upper-case the text so the case-sensitive patterns below also match
         mixed- or lower-case input.
      2. Normalise spacing around periods and collapse repeated whitespace.
      3. Expand the whole-word abbreviations OK/OKLA, DEPT, BD, ST, UNIV and COMM.
      4. Rewrite a leading "COMMISSION OF X" as "X COMMISSION".
      5. Drop the period left directly after an expanded word ("BOARD." -> "BOARD").

    Args:
        agency_name: The raw agency name as a string.

    Returns:
        The standardized, upper-case agency name.
    """
    # Upper-case first: the replacement patterns are case-sensitive
    agency_name = agency_name.upper()

    # Add spaces after punctuation, but avoid duplicates
    agency_name = re.sub(r"\.(?!\s)", ". ", agency_name)  # Add a space after a period if not already present
    agency_name = re.sub(r"\s+\.", ".", agency_name)  # Remove spaces before periods

    # Remove extra spaces
    agency_name = re.sub(r"\s+", " ", agency_name).strip()

    # Whole-word abbreviations and their expansions. After the spacing step a period is
    # never followed directly by a word character, so a period cannot be matched here;
    # the period trailing an abbreviation is removed in the final step instead.
    replacements = {
        r"\b(?:OK|OKLA)\b": 'OKLAHOMA',
        r"\bDEPT\b": 'DEPARTMENT',
        r"\bBD\b": 'BOARD',
        r"\bST\b": 'STATE',
        r"\bUNIV\b": 'UNIVERSITY',
        r"\bCOMM\b": 'COMMISSION',
    }

    # Loop to replace words identified in replacements dictionary
    for pattern, replacement in replacements.items():
        agency_name = re.sub(pattern, replacement, agency_name)

    # Ensure standardized formatting for COMMISSION: move a leading
    # "COMMISSION OF " to the end. Only the prefix is removed, so a later
    # occurrence of the same phrase inside the name is left alone.
    commission_prefix = 'COMMISSION OF '
    if agency_name.startswith(commission_prefix):
        agency_name = agency_name[len(commission_prefix):] + ' COMMISSION'

    # Remove extra spaces again after replacing patterns
    agency_name = re.sub(r"\s+", " ", agency_name).strip()

    # Remove trailing periods from adjusted words. The leading \b keeps words that
    # merely end in one of them (e.g. "ESTATE.") from losing their period.
    agency_name = re.sub(r"\b(OKLAHOMA|DEPARTMENT|BOARD|STATE|UNIVERSITY|COMMISSION)\.", r"\1", agency_name)

    return agency_name
    
# Run every AgencyName value through standardize_agency_name and store the result
df_pcard['AgencyName'] = df_pcard['AgencyName'].map(standardize_agency_name)


### Fix formatting for MCC, Description, and Vendor

In [44]:
# Add spaces after punctuation and stripping extra spaces

# Function to fix spacing after punctuation
def fix_punctuation_spacing(name):
    """Normalise the whitespace around punctuation in a text value.

    The rules run in order:
      1. Insert one space after , : ; ! ? when a non-space character follows
         (periods are deliberately excluded, so "X.COM" stays intact; note that
         this also turns "1,000" into "1, 000").
      2. Delete whitespace that sits directly before . , : ; ! ?
      3. Collapse every run of whitespace to a single space.
    Leading and trailing whitespace is stripped at the end.

    Args:
        name: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'([,:;!?])(?=\S)', r'\1 '),  # space after punctuation (not periods)
        (r'\s+([.,:;!?])', r'\1'),     # no space before punctuation
        (r'\s+', ' '),                 # single spaces only
    )

    cleaned = name
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [45]:
# Normalise punctuation spacing in every Vendor value
df_pcard['Vendor'] = df_pcard['Vendor'].map(fix_punctuation_spacing)

In [46]:
# Normalise punctuation spacing in every MCC value
df_pcard['MCC'] = df_pcard['MCC'].map(fix_punctuation_spacing)

In [47]:
# Normalise punctuation spacing in every Description value
df_pcard['Description'] = df_pcard['Description'].map(fix_punctuation_spacing)

In [49]:
# Keep only the rows whose TransactionDate falls in calendar year 2014
# (this is a row filter, not a group-by; rows with a missing date are excluded)
by_year = df_pcard.loc[df_pcard['TransactionDate'].dt.year.eq(2014)]
by_year

Unnamed: 0,AgencyNumber,AgencyName,CardholderLastName,CardholderFirstInitial,Description,Amount,Vendor,TransactionDate,PostedDate,MCC
56333,1000,OKLAHOMA STATE UNIVERSITY,Reddington,D,GENERAL PURCHASE,735.00,ABSA,2014-01-03,2014-01-06,SCHOOLS AND EDUCATIONAL SERVICES NOT ELSEWHERE CLASSIFIED
56334,1000,OKLAHOMA STATE UNIVERSITY,Reece,S,GENERAL PURCHASE,304.04,ORSCHELN DURANT 83,2014-01-03,2014-01-06,HARDWARE STORES
56335,1000,OKLAHOMA STATE UNIVERSITY,Reece,S,GENERAL PURCHASE,39.19,ORSCHELN DURANT 83,2014-01-03,2014-01-06,HARDWARE STORES
56336,1000,OKLAHOMA STATE UNIVERSITY,Rendina,E,GENERAL PURCHASE,260.00,CELLSIGNAL.COM,2014-01-02,2014-01-06,"DRUGS, DRUG PROPRIETARIES, AND DRUGGISTS SUNDRIES"
56337,1000,OKLAHOMA STATE UNIVERSITY,Rex,T,GENERAL PURCHASE,80.00,THERMACUBE 00 OF 00,2014-01-02,2014-01-06,BUSINESS SERVICES NOT ELSEWHERE CLASSIFIED
...,...,...,...,...,...,...,...,...,...,...
657151,98000,GRAND RIVER DAM AUTH.,Prince,S,14 DYMDR WKLY PLNR BLK DSK EA|2015 MNTH DAYMNDR PL,626.69,STAPLES,2014-12-02,2014-12-03,"STATIONERY, OFFICE SUPPLIES, PRINTING AND WRITING PAPER"
657152,98000,GRAND RIVER DAM AUTH.,Stroup,P,GDY A40_HY-T PLUS V- BELT PCE|GDY B40_HY-T PLUS V-,3955.04,KIT KAMAN-PRY 483,2014-12-02,2014-12-03,INDUSTRIAL SUPPLIES NOT ELSEWHERE CLASSIFIED
657153,98000,GRAND RIVER DAM AUTH.,Stroup,P,GENERAL PURCHASE,701.05,T & L SUPPLY,2014-12-02,2014-12-03,INDUSTRIAL SUPPLIES NOT ELSEWHERE CLASSIFIED
657154,98000,GRAND RIVER DAM AUTH.,Stroup,P,GENERAL PURCHASE,1395.50,MATHESON-308,2014-12-02,2014-12-03,CHEMICALS AND ALLIED PRODUCTS NOT ELSEWHERE CLASSIFIED
