# Practice Optimizing Dataframes and Processing in Chunks

We'll begin by loading the loan data and measuring how much memory its rows use. The goal is to find the largest number of rows per chunk that keeps memory usage under 5 megabytes.

In [133]:
import pandas as pd
import numpy as np
# Let wide frames show up to 99 columns instead of being elided.
pd.set_option("display.max_columns", 99)

In [134]:
# Read only the first five rows to get a quick look at the columns and
# the raw value formats before processing the file in chunks.
first_five = pd.read_csv('loans_2007.csv', nrows=5)
print(first_five)

        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  1077501  1296599.0     5000.0       5000.0           4975.0   36 months   
1  1077430  1314167.0     2500.0       2500.0           2500.0   60 months   
2  1077175  1313524.0     2400.0       2400.0           2400.0   36 months   
3  1076863  1277178.0    10000.0      10000.0          10000.0   36 months   
4  1075358  1311748.0     3000.0       3000.0           3000.0   60 months   

  int_rate  installment grade sub_grade                 emp_title emp_length  \
0   10.65%       162.87     B        B2                       NaN  10+ years   
1   15.27%        59.83     C        C4                     Ryder   < 1 year   
2   15.96%        84.33     C        C5                       NaN  10+ years   
3   13.49%       339.31     C        C1       AIR RESOURCES BOARD  10+ years   
4   12.69%        67.79     B        B5  University Medical Group     1 year   

  home_ownership  annual_inc verification_status  

In [135]:
# NOTE: despite its name, `first_thousand` holds the first 3,100 rows.
first_thousand = pd.read_csv('loans_2007.csv', nrows=3100)
# Deep memory footprint of those rows, converted from bytes to megabytes.
first_thousand_bytes = first_thousand.memory_usage(deep=True).sum()
first_thousand_bytes / (1024 * 1024)

4.804134368896484

## Exploring the Data in Chunks

For each chunk:
* How many columns have a numeric type? How many have a string type?
* How many unique values are there in each string column? How many of the string columns contain values that are less than 50% unique?
* Which float columns have no missing values and could be candidates for conversion to the integer type?

Finally, calculate the total memory usage across all of the chunks.


In [136]:
# Print the deep memory footprint (in megabytes) of every 3,100-row chunk.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3100)
for chunk in chunk_iter:
    chunk_bytes = chunk.memory_usage(deep=True).sum()
    print(chunk_bytes / (1024 * 1024))

4.804134368896484
4.79917049407959
4.80189323425293
4.8021345138549805
4.798679351806641
4.800777435302734
4.800665855407715
4.801664352416992
4.79929256439209
4.804698944091797
4.808723449707031
4.817438125610352
4.995901107788086
3.67452335357666


In [137]:
# Count the rows in the file by summing the length of each chunk.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3100)
total_rows = sum(len(chunk) for chunk in chunk_iter)
print(total_rows)

42538


In [138]:
# For every chunk, record how many columns are numeric and how many are
# strings (object dtype), so differences between chunks become visible.
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3100)

numeric, string = [], []
for chunk in chunk_iter:
    numeric.append(len(chunk.select_dtypes(include=[np.number]).columns))
    string.append(len(chunk.select_dtypes(include=['object']).columns))

print(numeric)
print(string)

[31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30]
[21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22]


In [139]:
# Check whether every chunk reports the same set of string (object) columns.
obj_cols = []
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3100)

for chunk in chunk_iter:
    chunk_obj_cols = list(chunk.select_dtypes(include=['object']).columns)
    if not obj_cols:
        # Nothing recorded yet: this chunk's columns become the baseline.
        obj_cols = chunk_obj_cols
        continue
    if obj_cols != chunk_obj_cols:
        # Report each chunk whose string columns differ from the baseline.
        print("overall obj cols:", obj_cols, "\n")
        print("chunk obj cols:", chunk_obj_cols, "\n")

overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] 

chunk obj cols: ['id', 'term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] 

overall obj cols: ['term', 'int_rate', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'purpose', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'revol_util', 'initial_list_status', 'last_pymnt_d', 'last_credit_pull_d', 'application_type'] 



In [140]:
## How many unique values are there in each string column, and which string
## columns are low-cardinality enough to be worth optimizing?
## NOTE: the exercise prompt asks about "less than 50% unique"; this cell
## deliberately applies a stricter cutoff (MAX_UNIQUE_RATIO, 20% of all rows).
MAX_UNIQUE_RATIO = 0.2

chunk_iter = pd.read_csv('loans_2007.csv',chunksize=3100)

# uniques maps column name -> list holding one value_counts() Series per chunk.
# A column that is object-typed in only some chunks is counted over those
# chunks only, so its unique count can understate its real cardinality.
uniques = {}
# Rows are counted in the same pass so the ratio below does not depend on a
# hard-coded file length.
row_count = 0
for chunk in chunk_iter:
    row_count += len(chunk)
    strings_only = chunk.select_dtypes(include=['object'])
    for c in strings_only.columns:
        uniques.setdefault(c, []).append(strings_only[c].value_counts())

# uniques_combined maps column name -> value counts summed over all chunks.
uniques_combined = {}
# Kept as in the original cell; nothing in this cell fills it in.
unique_stats = {
    'column_name': [],
    'total_values': [],
    'unique_values': [],
}

useful_obj_cols = []

for col in uniques:
    # Stack the per-chunk counts, then add up the counts of identical values.
    u_concat = pd.concat(uniques[col])
    u_group = u_concat.groupby(u_concat.index).sum()
    uniques_combined[col] = u_group
    # Number of distinct values relative to the number of rows in the file.
    if (u_group.shape[0] / row_count) < MAX_UNIQUE_RATIO:
        useful_obj_cols.append(col)
        print(col, u_group.shape[0])

home_ownership 5
grade 7
initial_list_status 1
issue_d 55
application_type 1
revol_util 1119
pymnt_plan 2
emp_length 11
id 5338
purpose 14
loan_status 9
int_rate 394
term 2
last_credit_pull_d 108
zip_code 837
sub_grade 35
earliest_cr_line 530
last_pymnt_d 103
verification_status 3
addr_state 50


In [141]:
# Count missing values per float column in each chunk, then add the
# per-chunk counts together and list the columns from fewest to most missing.
loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3100)

missing = []
for loans_chunk in loans_chunks:
    float_cols = loans_chunk.select_dtypes(include=['float'])
    missing.append(float_cols.isnull().sum())

combined_missing = pd.concat(missing)
combined_missing.groupby(combined_missing.index).sum().sort_values()

member_id                        3
total_rec_int                    3
total_pymnt_inv                  3
total_pymnt                      3
revol_bal                        3
recoveries                       3
policy_code                      3
out_prncp_inv                    3
out_prncp                        3
total_rec_late_fee               3
loan_amnt                        3
last_pymnt_amnt                  3
total_rec_prncp                  3
funded_amnt_inv                  3
funded_amnt                      3
dti                              3
collection_recovery_fee          3
installment                      3
annual_inc                       7
inq_last_6mths                  32
total_acc                       32
delinq_2yrs                     32
pub_rec                         32
delinq_amnt                     32
open_acc                        32
acc_now_delinq                  32
tax_liens                      108
collections_12_mths_ex_med     148
chargeoff_within_12_

In [142]:
# Baseline: deep memory footprint of every unoptimized chunk, in megabytes.
loans_chunks = pd.read_csv('loans_2007.csv',chunksize=3100)

mem_usage = [
    loans_chunk.memory_usage(deep=True).sum() / 1024 ** 2
    for loans_chunk in loans_chunks
]

# Total across all chunks.
sum(mem_usage)

66.30969715118408

## Optimizing String Columns

In [143]:
# Show the string columns that passed the unique-value ratio check earlier.
useful_obj_cols

['home_ownership',
 'grade',
 'initial_list_status',
 'issue_d',
 'application_type',
 'revol_util',
 'pymnt_plan',
 'emp_length',
 'id',
 'purpose',
 'loan_status',
 'int_rate',
 'term',
 'last_credit_pull_d',
 'zip_code',
 'sub_grade',
 'earliest_cr_line',
 'last_pymnt_d',
 'verification_status',
 'addr_state']

In [144]:

## Collect each string column's value counts chunk by chunk
## (key: column name, value: list with one value_counts() Series per chunk).
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3100)
str_cols_vc = {}
for chunk in chunk_iter:
    str_cols = chunk.select_dtypes(include=['object'])
    for col in str_cols.columns:
        str_cols_vc.setdefault(col, []).append(str_cols[col].value_counts())

In [145]:
## Merge the per-chunk value counts into one Series per column.
combined_vcs = {}

for col, per_chunk_counts in str_cols_vc.items():
    stacked = pd.concat(per_chunk_counts)
    # Identical values from different chunks share an index label; sum them.
    combined_vcs[col] = stacked.groupby(stacked.index).sum()

In [146]:
# Print every selected column's name, its combined value counts and a divider.
for col in useful_obj_cols:
    print(col, combined_vcs[col], "-----------", sep="\n")

home_ownership
MORTGAGE    18959
NONE            8
OTHER         136
OWN          3251
RENT        20181
Name: home_ownership, dtype: int64
-----------
grade
A    10183
B    12389
C     8740
D     6016
E     3394
F     1301
G      512
Name: grade, dtype: int64
-----------
initial_list_status
f    42535
Name: initial_list_status, dtype: int64
-----------
issue_d
Apr-2008     259
Apr-2009     333
Apr-2010     912
Apr-2011    1563
Aug-2007      74
Aug-2008     100
Aug-2009     446
Aug-2010    1175
Aug-2011    1934
Dec-2007     172
Dec-2008     253
Dec-2009     658
Dec-2010    1335
Dec-2011    2267
Feb-2008     306
Feb-2009     302
Feb-2010     682
Feb-2011    1298
Jan-2008     305
Jan-2009     269
Jan-2010     662
Jan-2011    1380
Jul-2007      63
Jul-2008     141
Jul-2009     411
Jul-2010    1204
Jul-2011    1875
Jun-2007      24
Jun-2008     124
Jun-2009     406
Jun-2010    1105
Jun-2011    1835
Mar-2008     402
Mar-2009     324
Mar-2010     828
Mar-2011    1448
May-2008     115
May-200

**Convert relevant columns to more efficient types**

Let's convert the following columns to category:
* grade
* sub_grade
* home_ownership
* verification_status
* purpose
* initial_list_status
* application_type
* pymnt_plan
* emp_length
* id
* loan_status
* addr_state

Let's convert the following to numeric:
* term
* revol_util
* int_rate
* zip_code

Let's convert the following to datetime:
* issue_d
* last_credit_pull_d
* earliest_cr_line
* last_pymnt_d


In [147]:
# String columns to be read directly as pandas categoricals.
# NOTE(review): `id` only passed the unique-ratio check because it was a
# string column in the last two chunks, where it appears to be distinct on
# every row (5338 values over 3100 + 2238 rows) - confirm that storing it as
# a category actually saves memory.
category_columns = [
    "sub_grade", "home_ownership",
    "verification_status", "purpose",
    "grade", "initial_list_status",
    "application_type", "pymnt_plan",
    "emp_length", "id", "loan_status",
    "addr_state",
]
convert_col_dtypes = dict.fromkeys(category_columns, "category")




In [154]:
# Re-read the file with the optimized dtypes: categoricals via `dtype`,
# datetimes via `parse_dates`, then numeric conversion of the formatted
# string columns, recording each chunk's new footprint in megabytes.
date_columns = ["issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d"]
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3100,
                         dtype=convert_col_dtypes, parse_dates=date_columns)
new_mem_usage = []

for chunk in chunk_iter:
    # str.lstrip/str.rstrip remove any of the given characters from the end
    # of each value (they take a character set, not a literal prefix/suffix).
    chunk['term'] = pd.to_numeric(chunk['term'].str.lstrip(" ").str.rstrip(" months"))
    chunk['revol_util'] = pd.to_numeric(chunk['revol_util'].str.rstrip("%"))
    chunk['int_rate'] = pd.to_numeric(chunk['int_rate'].str.rstrip("%"))
    chunk['zip_code'] = pd.to_numeric(chunk['zip_code'].str.rstrip("xx"))

    # Footprint of the converted chunk.
    chunk_mb = chunk.memory_usage(deep=True).sum() / 1024 ** 2
    new_mem_usage.append(chunk_mb)


**Let's compare the total memory usage**

In [155]:
# Report the summed per-chunk footprints (megabytes) before and after optimizing.
for label, per_chunk_mb in (("Old Memory Usage: ", mem_usage),
                            ("New Memory Usage: ", new_mem_usage)):
    print(label, sum(per_chunk_mb))


Old Memory Usage:  66.30969715118408
New Memory Usage:  22.63568115234375


In [157]:
# Repeat the missing-value count on the optimized chunks so the newly numeric
# columns (term, revol_util, int_rate, zip_code) are included.
date_columns = ["issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d"]
chunk_iter = pd.read_csv('loans_2007.csv', chunksize=3100,
                         dtype=convert_col_dtypes, parse_dates=date_columns)
missing = []

for chunk in chunk_iter:
    # Strip the formatting characters first; str.lstrip/str.rstrip remove any
    # of the given characters from the end of each value.
    cleaned_columns = [
        ('term', chunk['term'].str.lstrip(" ").str.rstrip(" months")),
        ('revol_util', chunk['revol_util'].str.rstrip("%")),
        ('int_rate', chunk['int_rate'].str.rstrip("%")),
        ('zip_code', chunk['zip_code'].str.rstrip("xx")),
    ]
    # Then replace each column with its numeric version.
    for column_name, cleaned_values in cleaned_columns:
        chunk[column_name] = pd.to_numeric(cleaned_values)

    float_cols = chunk.select_dtypes(include=['float'])
    missing.append(float_cols.isnull().sum())

combined_missing = pd.concat(missing)
combined_missing.groupby(combined_missing.index).sum().sort_values()

zip_code                         3
term                             3
policy_code                      3
out_prncp_inv                    3
out_prncp                        3
total_rec_prncp                  3
member_id                        3
loan_amnt                        3
last_pymnt_amnt                  3
int_rate                         3
revol_bal                        3
installment                      3
funded_amnt_inv                  3
funded_amnt                      3
dti                              3
total_pymnt                      3
total_pymnt_inv                  3
total_rec_int                    3
collection_recovery_fee          3
total_rec_late_fee               3
recoveries                       3
annual_inc                       7
total_acc                       32
acc_now_delinq                  32
pub_rec                         32
inq_last_6mths                  32
delinq_amnt                     32
delinq_2yrs                     32
open_acc            