# Optimizing DataFrames and Processing in Chunks

This project shows how processing data in chunks makes it possible to work with datasets that are too large to fit in the memory available on your machine.

In [1]:
# Read only the first five rows so the columns can be inspected for obvious
# errors without loading the whole file.
import pandas as pd
first_5_rows = pd.read_csv("loans_2007.csv", nrows=5)

In [2]:
first_5_rows # display the five-row sample to look for data issues

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,policy_code,application_type,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599.0,5000.0,5000.0,4975.0,36 months,10.65%,162.87,B,B2,...,171.62,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
1,1077430,1314167.0,2500.0,2500.0,2500.0,60 months,15.27%,59.83,C,C4,...,119.66,Sep-2013,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
2,1077175,1313524.0,2400.0,2400.0,2400.0,36 months,15.96%,84.33,C,C5,...,649.91,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
3,1076863,1277178.0,10000.0,10000.0,10000.0,36 months,13.49%,339.31,C,C1,...,357.48,Apr-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0
4,1075358,1311748.0,3000.0,3000.0,3000.0,60 months,12.69%,67.79,B,B5,...,67.79,Jun-2016,0.0,1.0,INDIVIDUAL,0.0,0.0,0.0,0.0,0.0


No apparent issues with the data after examining the first 5 rows.

In [3]:
# Load a 3,250-row sample; its memory footprint is measured in the next cell.
first_3250_rows = pd.read_csv("loans_2007.csv", nrows=3250)

In [4]:
# Deep memory usage of the 3,250-row sample, converted from bytes to megabytes
# (1024 ** 2 bytes). The row count was tuned to land just under 5 MB.
first_3250_rows.memory_usage(deep=True).sum() / 1024 ** 2

np.float64(4.962096214294434)

In [5]:
# Chunk reader yielding 3,250 rows at a time, the sample size measured above
# at just under 5 MB.
data_chunks = pd.read_csv("loans_2007.csv", chunksize=3250)

## Determining the numeric and object/string columns in each chunk

In [6]:
# Report, chunk by chunk, which columns were read with the object (string)
# dtype and how many there are. `object_cols` and `object_count` remain bound
# to the values from the final chunk once the loop ends.
for chunk in data_chunks:
    object_cols = chunk.select_dtypes(include=["object"]).columns
    object_count = len(object_cols)
    # One call, same text as four separate prints: the "\n" items together
    # with the "\n" separator reproduce the blank lines after each value.
    print(object_cols, "\n", object_count, "\n", sep="\n")

Index(['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state',
       'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d',
       'last_credit_pull_d', 'application_type'],
      dtype='object')


21


Index(['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state',
       'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d',
       'last_credit_pull_d', 'application_type'],
      dtype='object')


21


Index(['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'loan_status',
       'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state',
       'earliest_cr_line', 'rev

In [7]:
# Fresh chunk reader (3,250 rows per chunk, under 5 MB each): the previous
# loop iterated the earlier reader to the end.
data_chunks = pd.read_csv("loans_2007.csv", chunksize=3250)

In [8]:
# Report, chunk by chunk, which columns were read with a numeric dtype and how
# many there are. `numeric_cols` and `numeric_count` remain bound to the
# values from the final chunk once the loop ends.
for chunk in data_chunks:
    numeric_cols = chunk.select_dtypes(include=["number"]).columns
    numeric_count = len(numeric_cols)
    # One call, same text as four separate prints: the "\n" items together
    # with the "\n" separator reproduce the blank lines after each value.
    print(numeric_cols, "\n", numeric_count, "\n", sep="\n")

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
       'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies',
       'tax_liens'],
      dtype='object')


31


Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pym

It appears that `id` switches from numeric to string in a few chunks. I believe a string makes more sense for an identifier, so there are a total of 30 numeric columns and 22 object/string columns.

## Reviewing string columns for possible categorical conversions

In [9]:
# Fresh chunk reader (3,250 rows per chunk, under 5 MB each): the previous
# loop iterated the earlier reader to the end.
data_chunks = pd.read_csv("loans_2007.csv", chunksize=3250)

In [10]:
# Maps each string column name to a list holding one distinct-value count per
# chunk in which that column was read with the object dtype.
uniques = {}

for chunk in data_chunks:
    string_chunk = chunk.select_dtypes(include=['object'])

    for col in string_chunk.columns:
        # len(unique()) counts a missing value (NaN) as one distinct value.
        total_unique = len(string_chunk[col].unique())
        # setdefault creates the list the first time a column is seen.
        uniques.setdefault(col, []).append(total_unique)
            

In [11]:
# Largest distinct-value count that any single chunk produced for each column.
max_dict = {key: max(value) for key, value in uniques.items()}
# Express that maximum as a fraction of the 3,250-row chunk size.
divided_dict = {key: value / 3250 for key, value in max_dict.items()}

In [12]:
# Keep only the columns whose distinct-value ratio is below one half.
filtered_dict = {
    col: ratio
    for col, ratio in divided_dict.items()
    if ratio < 0.5
}

In [13]:
filtered_dict  # display each retained column with its distinct-value ratio


{'term': 0.0009230769230769231,
 'int_rate': 0.08492307692307692,
 'grade': 0.0024615384615384616,
 'sub_grade': 0.011076923076923076,
 'emp_length': 0.0036923076923076922,
 'home_ownership': 0.0018461538461538461,
 'verification_status': 0.0012307692307692308,
 'issue_d': 0.01353846153846154,
 'loan_status': 0.0018461538461538461,
 'pymnt_plan': 0.0006153846153846154,
 'purpose': 0.004615384615384616,
 'zip_code': 0.1926153846153846,
 'addr_state': 0.015692307692307693,
 'earliest_cr_line': 0.12553846153846154,
 'revol_util': 0.30030769230769233,
 'initial_list_status': 0.0006153846153846154,
 'last_pymnt_d': 0.029846153846153845,
 'last_credit_pull_d': 0.03292307692307692,
 'application_type': 0.0006153846153846154}

In [14]:
# Number of string columns whose distinct-value ratio is below 0.5.
print(len(filtered_dict))

19


There are 19 string columns in which the unique values make up less than 50% of the entries in a chunk. They are listed above as candidates for conversion to the category dtype if needed.

## Potential float column candidates for integer conversion

In [15]:
# Fresh chunk reader (3,250 rows per chunk, under 5 MB each): the previous
# loop iterated the earlier reader to the end.
data_chunks = pd.read_csv("loans_2007.csv", chunksize=3250)

In [16]:
# One Series of null counts (indexed by column name) per chunk, restricted to
# the columns read as floats in that chunk.
nulls_per_chunk = []
for chunk in data_chunks:
    float_frame = chunk.select_dtypes(include=['float'])
    nulls_per_chunk.append(float_frame.isnull().sum())

# Stack the per-chunk counts, add up the entries that share a column name and
# order the result from fewest to most nulls.
total_nulls = pd.concat(nulls_per_chunk).groupby(level=0).sum().sort_values()

total_nulls

collection_recovery_fee          3
dti                              3
loan_amnt                        3
member_id                        3
last_pymnt_amnt                  3
installment                      3
funded_amnt_inv                  3
funded_amnt                      3
total_pymnt                      3
total_pymnt_inv                  3
total_rec_int                    3
revol_bal                        3
recoveries                       3
policy_code                      3
out_prncp_inv                    3
out_prncp                        3
total_rec_prncp                  3
total_rec_late_fee               3
annual_inc                       7
open_acc                        32
delinq_amnt                     32
delinq_2yrs                     32
acc_now_delinq                  32
inq_last_6mths                  32
pub_rec                         32
total_acc                       32
tax_liens                      108
collections_12_mths_ex_med     148
chargeoff_within_12_

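The series above shows that every float column contains null values, although some counts are very low. Because a plain integer dtype cannot hold nulls, none of these columns can be converted to integers as-is.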

## Calculating the total memory usage across all chunks

In [17]:
# Deep memory usage of every chunk in megabytes (1024 ** 2 bytes), followed by
# the total across the whole file.
data_chunks = pd.read_csv("loans_2007.csv", chunksize=3250)

mbs = [
    chunk.memory_usage(deep=True).sum() / 1024 ** 2
    for chunk in data_chunks
]

sum(mbs)

np.float64(65.2423849105835)

The total dataset uses approximately 65 MB of memory.

In [18]:
object_cols # still bound to the string columns of the last chunk from the earlier per-chunk loop

Index(['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'verification_status', 'issue_d',
       'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code',
       'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status',
       'last_pymnt_d', 'last_credit_pull_d', 'application_type'],
      dtype='object')

## Creating value counts across chunks to understand how to optimize data

In [19]:
# Build a dictionary mapping each string column to a list of value-count
# Series, one per chunk (the chunks are not merged at this stage).
data_chunks = pd.read_csv("loans_2007.csv", chunksize=3250)

object_values = {}

for chunk in data_chunks:
    for col in object_cols:
        # setdefault creates the list the first time a column is seen.
        object_values.setdefault(col, []).append(chunk[col].value_counts())
    

In [20]:
# Merge the per-chunk value counts: stack each column's Series and add up the
# counts that share the same value label.
combined_object_values = {
    col: pd.concat(counts).groupby(level=0).sum()
    for col, counts in object_values.items()
}
    

In [21]:
combined_object_values  # display the value counts per column, summed over all chunks

{'id': id
 54734                                              1
 55742                                              1
 57245                                              1
 57416                                              1
 58915                                              1
                                                   ..
 99982                                              1
 99987                                              1
 Loans that do not meet the credit policy           1
 Total amount funded in policy code 1: 471701350    1
 Total amount funded in policy code 2: 0            1
 Name: count, Length: 42538, dtype: int64,
 'term': term
 36 months    31534
 60 months    11001
 Name: count, dtype: int64,
 'int_rate': int_rate
  5.42%    573
  5.79%    410
  5.99%    347
  6.00%     19
  6.03%    447
          ... 
 23.59%      4
 23.91%     11
 24.11%      3
 24.40%      1
 24.59%      1
 Name: count, Length: 394, dtype: int64,
 'grade': grade
 A    10183
 B    12389
 C 

## Converting string columns to category and (where applicable) numeric datatypes

In [22]:
# String columns to parse as numbers; the conversion cell strips a trailing
# "%" from their values before parsing.
convert_to_num_cols = ["revol_util", "int_rate"]

# String columns to store with the category dtype.
convert_to_category_cols = [
    "grade",
    "sub_grade",
    "term",
    "home_ownership",
    "verification_status",
    "loan_status",
    "pymnt_plan",
    "purpose",
    "addr_state",
]

In [23]:
# Convert the percentage strings to numbers and the selected string columns to
# categories in every chunk, recording each chunk's memory usage in megabytes
# so the total can be compared with the unconverted dataset.
new_mbs = []
data_chunks = pd.read_csv('loans_2007.csv', chunksize=3250)
for chunk in data_chunks:
    for col in convert_to_num_cols:
        # Drop the trailing "%" so the remaining text parses as a number.
        cleaned = chunk[col].str.rstrip("%")
        chunk[col] = pd.to_numeric(cleaned)
    for col in convert_to_category_cols:
        chunk[col] = chunk[col].astype('category')

    new_mbs.append(chunk.memory_usage(deep=True).sum() / 1024 ** 2)

# Float columns of the last converted chunk, kept for a later null-count step
# so the dataset does not have to be reloaded just to list them. Only the
# final value was ever used, so it is computed once after the loop instead of
# on every iteration.
numeric = chunk.select_dtypes(include=['float']).columns

In [24]:
chunk.dtypes # confirm the updated dtypes on the last chunk processed by the conversion loop

id                              object
member_id                      float64
loan_amnt                      float64
funded_amnt                    float64
funded_amnt_inv                float64
term                          category
int_rate                       float64
installment                    float64
grade                         category
sub_grade                     category
emp_title                       object
emp_length                      object
home_ownership                category
annual_inc                     float64
verification_status           category
issue_d                         object
loan_status                   category
pymnt_plan                    category
purpose                       category
title                           object
zip_code                        object
addr_state                    category
dti                            float64
delinq_2yrs                    float64
earliest_cr_line                object
inq_last_6mths           

In [25]:
# Header line, then the total for the original dtypes, then the total after
# the category / numeric conversions, each on its own line.
print(
    "Total mbs with no datatypes changed vs total with categories and strings to numerics updated",
    sum(mbs),
    sum(new_mbs),
    sep="\n",
)

Total mbs with no datatypes changed vs total with categories and strings to numerics updated
65.2423849105835
38.07561779022217


## Calculating null values across numerics

In [26]:
numeric  # float columns captured from the last chunk of the conversion loop

Index(['member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
       'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
       'chargeoff_within_12_mths', 'delinq_amnt', 'pub_rec_bankruptcies',
       'tax_liens'],
      dtype='object')

In [36]:
# Sum the null values of every column listed in `numeric` across all chunks.
# The chunks are re-read from the file here, so the conversions applied in the
# earlier loop are not present; isnull() counts missing entries either way.
data_chunks = pd.read_csv('loans_2007.csv', chunksize=3250)
null_totals = []
for chunk in data_chunks:
    # One Series per chunk: null count for each column, indexed by column name.
    null_totals.append(chunk[numeric].isnull().sum())

# Combine once after the loop: stack the per-chunk counts and add up the
# entries that share a column name. Doing this inside the loop re-aggregated
# the whole growing list on every chunk.
combined = pd.concat(null_totals).groupby(level=0).sum()

In [37]:
combined  # display the null count per numeric column, summed over all chunks

acc_now_delinq                  32
annual_inc                       7
chargeoff_within_12_mths       148
collection_recovery_fee          3
collections_12_mths_ex_med     148
delinq_2yrs                     32
delinq_amnt                     32
dti                              3
funded_amnt                      3
funded_amnt_inv                  3
inq_last_6mths                  32
installment                      3
int_rate                         3
last_pymnt_amnt                  3
loan_amnt                        3
member_id                        3
open_acc                        32
out_prncp                        3
out_prncp_inv                    3
policy_code                      3
pub_rec                         32
pub_rec_bankruptcies          1368
recoveries                       3
revol_bal                        3
revol_util                      93
tax_liens                      108
total_acc                       32
total_pymnt                      3
total_pymnt_inv     

No numeric column is completely free of null values, and none has a large enough share of nulls to justify an optimization based on them.

An additional idea for saving space is to review the range of values in each numeric column to see whether a more efficient datatype (an integer or a smaller float) can be used. Dataquest also suggested creating functions to automate more of the process.