### Patch

In [1]:
from pandas.io.sql import SQLTable

def _execute_insert(self, conn, keys, data_iter):
    #print("Using monkey-patched _execute_insert")
    data = [dict((k, v) for k, v in zip(keys, row)) for row in data_iter]
    conn.execute(self.insert_statement().values(data))

SQLTable._execute_insert = _execute_insert

In [2]:
import pandas as pd
import json
import pymysql.cursors
import os
import time
import shutil

from sqlalchemy import create_engine

In [3]:
# MySQL Parameters
DB_HOST = 'localhost'
DB_USER = 'root'
DB_PASS = 'ceg2ececeg2ece'
DB_NAME = 'hmda'
DB_SECURE_FILE_PRIV = 'D:/sqldata/Uploads/'
# Files location
folder = "F:/data/HMDA/"

In [4]:
# Create column format dictionary
coldf = pd.read_csv(folder + 'HMDA_columns_1990_2004.csv')
cols = json.loads(coldf.to_json(orient='records'))

In [5]:
cols

[{'LENGTH': 4, 'NAME': 'ASOF_DATE', 'START': 0, 'STOP': 4},
 {'LENGTH': 10, 'NAME': 'RESP_ID', 'START': 4, 'STOP': 14},
 {'LENGTH': 1, 'NAME': 'AGENCY_CODE', 'START': 14, 'STOP': 15},
 {'LENGTH': 1, 'NAME': 'LOAN_TYPE', 'START': 15, 'STOP': 16},
 {'LENGTH': 1, 'NAME': 'LOAN_PURPOSE', 'START': 16, 'STOP': 17},
 {'LENGTH': 1, 'NAME': 'OCCUPANCY', 'START': 17, 'STOP': 18},
 {'LENGTH': 5, 'NAME': 'LOAN_AMOUNT', 'START': 18, 'STOP': 23},
 {'LENGTH': 1, 'NAME': 'ACTION_TYPE', 'START': 23, 'STOP': 24},
 {'LENGTH': 4, 'NAME': 'PROPERTY_MSA', 'START': 24, 'STOP': 28},
 {'LENGTH': 2, 'NAME': 'STATE_CODE', 'START': 28, 'STOP': 30},
 {'LENGTH': 3, 'NAME': 'COUNTY_CODE', 'START': 30, 'STOP': 33},
 {'LENGTH': 7, 'NAME': 'CENSUS_TRACT_NUMBER', 'START': 33, 'STOP': 40},
 {'LENGTH': 1, 'NAME': 'RACE_APPLICANT', 'START': 40, 'STOP': 41},
 {'LENGTH': 1, 'NAME': 'RACE_COAPPLICANT', 'START': 41, 'STOP': 42},
 {'LENGTH': 1, 'NAME': 'SEX_APPLICANT', 'START': 42, 'STOP': 43},
 {'LENGTH': 1, 'NAME': 'SEX_COAPP

In [6]:
#datarange = [x for x in range(1990, 2001, 1)]
datarange = [x for x in range(1990, 2015, 1)]
datarange

[1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014]

In [7]:

def UploadDatatoDB(df):
    # Connect to the database
    connection = pymysql.connect(host=DB_HOST,
                                 user=DB_USER,
                                 password=DB_PASS,
                                 db=DB_NAME,
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

#    engine = create_engine('mysql+pymysql://root:@localhost:3306/hmda', echo = False)
    
    #df.to_sql(name = 'my_table', con = engine, if_exists = 'append', index = False)
    
    #df.to_sql(name='lars', con=connection, if_exists = 'append', index=False)

    try:
        sql = "INSERT INTO lars ("
        for i in cols:
            sql = sql + i["NAME"] + ", "

        sql = sql[:-2]
        sql = sql + ") VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

        for i in df.iterrows():

            # create tuple of data
            data = [ i[1][x['NAME']].strip() for x in cols]
            data = tuple(data) 

            with connection.cursor() as cursor:
                result_row = cursor.execute(sql, data)
                connection.commit()

            if i[0] % 10000 == 0:
                print(i[0])

        with connection.cursor() as cursor:
            # Read a single record
            sql = "SELECT * FROM lars LIMIT 1"
            cursor.execute(sql)
            result = cursor.fetchone()
            print(result)
    finally:
        connection.close()
        
    return "Upload ok"


In [8]:
def readDataFile(filename):
    
    df = pd.read_csv(filename, header=None, delimiter="\t", names=["original"])

    for col in cols:
        #print(col['NAME'])
        df[col['NAME']] = df.original.map(lambda x: x[col["START"]:col["STOP"]])

    df = df.drop(['original'], axis=1)
    
    
    return df


In [67]:
def verifyData(df):
    
    """
    Verifies every column to ensure proper data formatting.
    """

    # --- no need for length check now, only replace '\\'
    
#     # TODO: drop indices from errordf
    
#     errordf = pd.DataFrame([], columns = df.columns)
#   #  cleandf = pd.DataFrame([], columns = df.columns)
    
#     for i in cols:
#         errordf = pd.concat([errordf, df[df[i['NAME']].str.len() > i['LENGTH']]])
#    #     cleandf = pd.concat([cleandf, df[df[i['NAME']].str.len() <= i['LENGTH']]])
    
#     print("Errors")
#     print(errordf)

    cleandf = df.replace('\\', '|')
    #cleandf2 = cleandf.replace('\r', '0')
    cleandf.loc[cleandf['SEQUENCE_NUMBER'] == '', 'SEQUENCE_NUMBER'] = 0
    
    return cleandf
    


def manualUpload(filename):
    connection = pymysql.connect(host=DB_HOST,
                                 user=DB_USER,
                                 password=DB_PASS,
                                 db=DB_NAME,
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    
    # To comply with the --secure-file-priv flag, first move file to designated
    # upload folder.
    newfile = DB_SECURE_FILE_PRIV + os.path.basename(filename)
    shutil.copy2(filename, newfile)

    
    sql = " LOAD DATA INFILE '" + newfile +  "' \
            INTO TABLE hmda.lars \
            FIELDS TERMINATED BY ',' \
            ENCLOSED BY \'\"\'    \
            LINES TERMINATED BY \'\\n\'   \
            IGNORE 1 ROWS           \
            (ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,  \
             LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,COUNTY_CODE,     \
             CENSUS_TRACT_NUMBER,RACE_APPLICANT,RACE_COAPPLICANT,SEX_APPLICANT,  \
             SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,   \
             DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER);"
    
    #print(sql)
    try:

        with connection.cursor() as cursor:
            result_row = cursor.execute(sql)
            connection.commit()

        with connection.cursor() as cursor:
            # Read a single record
            sql = "SELECT count(*) FROM lars"
            cursor.execute(sql)
            result = cursor.fetchone()
            print(result)
    finally:
        connection.close()


# Check number of lines per file

In [26]:
for i in datarange:
    
    totalstart = time.time()
    print(i)
    filename = folder + "HMS.U" + str(i) + ".LARS"
    
    # Specific file names
    if i == 2001:
        filename = filename + ".PUBLIC.DATA"

    if i == 2004:
        filename = folder + "u2004lar.public.dat"

    if i >= 2005:
        
        files = {   2005:"LARS.FINAL.2005.DAT",
                    2006:"LARS.FINAL.2006.DAT",
                    2007:"lars.ultimate.2007.dat", 
                    2008:"lars.ultimate.2008.dat", 
                    2009:"2009_Ultimate_PUBLIC_LAR.dat",
                    2010:"Lars.ultimate.2010.dat",
                    2011:"Lars.ultimate.2011.dat",
                    2012:"Lars.ultimate.2012.dat",
                    2013:"Lars.ultimate.2013.dat",
                    2014:"Lars_ultimate_2014.txt"}
        filename = folder + files[i]
        

    num_lines = sum(1 for line in open(filename))
    print(i, ":", num_lines)

1990
1990 : 6707650
1991
1991 : 7940024
1992
1992 : 12026809
1993
1993 : 15477323
1994
1994 : 12215807
1995
1995 : 11271664
1996
1996 : 14865058
1997
1997 : 16421046
1998
1998 : 24701216
1999
1999 : 22911425
2000
2000 : 19250597
2001
2001 : 27643163
2002
2002 : 31310410
2003
2003 : 41579149
2004
2004 : 33630474
2005
2005 : 36439157
2006
2006 : 34105441
2007
2007 : 26702092
2008
2008 : 17531240
2009
2009 : 19574492
2010
2010 : 16751980
2011
2011 : 14906446
2012
2012 : 18752061
2013
2013 : 17013337
2014
2014 : 12086507


# Read file, parse, save, upload, delete

In [27]:
# check for --secure_file_priv flag:

connection = pymysql.connect(host=DB_HOST,
                             user=DB_USER,
                             password=DB_PASS,
                             db=DB_NAME,
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
try:

    with connection.cursor() as cursor:
        # Read a single record
        sql = 'SHOW VARIABLES LIKE "secure_file_priv";'
        cursor.execute(sql)
        result = cursor.fetchone()
        print(result)
finally:
    connection.close()

{'Value': 'D:\\sqldata\\Uploads\\', 'Variable_name': 'secure_file_priv'}


In [None]:
for i in datarange:
    
    totalstart = time.time()
    print(i)
    filename = folder + "HMS.U" + str(i) + ".LARS"
    
    # Specific file names
    if i == 2001:
        filename = filename + ".PUBLIC.DATA"

    if i == 2004:
        filename = folder + "u2004lar.public.dat"

    if i >= 2005:
        
        files = {   2005:"LARS.FINAL.2005.DAT",
                    2006:"LARS.FINAL.2006.DAT",
                    2007:"lars.ultimate.2007.dat", 
                    2008:"lars.ultimate.2008.dat", 
                    2009:"2009_Ultimate_PUBLIC_LAR.dat",
                    2010:"Lars.ultimate.2010.dat",
                    2011:"Lars.ultimate.2011.dat",
                    2012:"Lars.ultimate.2012.dat",
                    2013:"Lars.ultimate.2013.dat",
                    2014:"Lars_ultimate_2014.txt"}
        filename = folder + files[i]
        
        
    # Read and parse
    print("Reading and parsing file...", end='')
    parsestart = time.time()
    df = readDataFile(filename)
    print("done.")
    print("Total lines:")
    print(len(df.index))
    parsedone = time.time()
    elapsed = parsedone - parsestart
    print("Parse:", str(elapsed), " sec")
    
    # verify data
    print("Cleaning data...")
    cleanstart = time.time()
    
    df = verifyData(df)
    cleandone = time.time()
    elapsed = cleandone - cleanstart
    print("Data cleaned.")
    print("Clean:", str(elapsed), " sec")

    
    
    # save
    print("Saving file...", end='')
    df.to_csv(filename + '.csv', index=False)
    print("done.")
    
    
    # upload
    print("Uploading file...", end='')
    uploadstart = time.time()
    
    manualUpload(filename + '.csv')
    uploaddone = time.time()
    elapsed = uploaddone - uploadstart
    print("done.")
    print("Upload:", str(elapsed), " sec")
    
    # remove file
    #os.remove(filename + '.csv')
    
    totaldone = time.time()
    elapsed = totaldone - totalstart
    print("Total:", str(elapsed), " sec")
    

1990
Reading and parsing file...done.
Total lines:
6707650
Parse: 49.02228569984436  sec
Cleaning data...
Data cleaned.
Clean: 8.993188619613647  sec
Saving file...done.
Uploading file...{'count(*)': 6707650}
done.
Upload: 230.5758502483368  sec
Total: 313.8776857852936  sec
1991
Reading and parsing file...done.
Total lines:
7940024
Parse: 57.38112783432007  sec
Cleaning data...
Data cleaned.
Clean: 11.031574487686157  sec
Saving file...done.
Uploading file...{'count(*)': 14647674}
done.
Upload: 391.32878613471985  sec
Total: 489.5847282409668  sec
1992
Reading and parsing file...done.
Total lines:
12026809
Parse: 88.34206986427307  sec
Cleaning data...
Data cleaned.
Clean: 17.642287015914917  sec
Saving file...done.
Uploading file...{'count(*)': 26674483}
done.
Upload: 663.5623817443848  sec
Total: 814.8168108463287  sec
1993
Reading and parsing file...done.
Total lines:
15477323
Parse: 113.01582551002502  sec
Cleaning data...
Data cleaned.
Clean: 24.015493392944336  sec
Saving file..

In [14]:
df[df['RACE_APPLICANT'] == '\\']

Unnamed: 0,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
240714,1990,0000001338,1,1,1,1,00085,6,,09,...,8,4,4,,0,,,,,1040
243109,1990,0000001338,1,1,1,1,00030,6,,25,...,8,4,4,,8,,,,,3435
244211,1990,0000001338,1,1,1,1,00100,6,,25,...,8,4,4,,8,,,,,4537
244419,1990,0000001338,1,1,1,1,00179,6,,25,...,8,4,4,,8,,,,,4745
245252,1990,0000001338,1,1,1,1,00114,6,,25,...,8,4,4,,8,,,,,5578
245434,1990,0000001338,1,1,1,1,00030,6,,25,...,8,4,4,,8,,,,,5760
245797,1990,0000001338,1,1,1,1,00014,6,,25,...,8,4,4,,8,,,,,6123
248524,1990,0000001338,1,1,1,1,00030,6,,25,...,8,4,4,,8,,,,,8850
250079,1990,0000001338,1,2,1,1,00023,6,,25,...,8,4,4,,8,,,,,10405
240714,1990,0000001338,1,1,1,1,00085,6,,09,...,8,4,4,,0,,,,,1040


# Connect to MySQL

### Parse file

In [26]:
df.head()

Unnamed: 0,original,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
0,19900000035301B121000101NA NANA NA 582400...,1990,35301,B,1,2,1,10,1,,...,8,2,4,39,0,,,,,1
1,19900000035301B121000041NA NANA NA 581400...,1990,35301,B,1,2,1,4,1,,...,8,1,4,32,0,,,,,2
2,19900000035301B121000081NA NANA NA 582400...,1990,35301,B,1,2,1,8,1,,...,8,2,4,35,0,,,,,3
3,19900000035301B121000051NA NANA NA 581400...,1990,35301,B,1,2,1,5,1,,...,8,1,4,30,0,,,,,4
4,19900000035301B121000111NA NANA NA 551201...,1990,35301,B,1,2,1,11,1,,...,5,1,2,130,0,,,,,5


In [28]:
df2 = df.drop(['original'], axis=1)

In [29]:
df2.head()

Unnamed: 0,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
0,1990,35301,B,1,2,1,10,1,,,...,8,2,4,39,0,,,,,1
1,1990,35301,B,1,2,1,4,1,,,...,8,1,4,32,0,,,,,2
2,1990,35301,B,1,2,1,8,1,,,...,8,2,4,35,0,,,,,3
3,1990,35301,B,1,2,1,5,1,,,...,8,1,4,30,0,,,,,4
4,1990,35301,B,1,2,1,11,1,,,...,5,1,2,130,0,,,,,5


In [49]:
len(df.index)

7940024

In [9]:
import MySQLdb


In [10]:
df = df.drop(['original'], axis=1)

In [16]:
from tqdm import tqdm

def chunker(seq, size):
    # from http://stackoverflow.com/a/434328
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

def insert_with_progress(df, engine):
    chunksize = int(len(df) / 1000) # 10k chunks seems to be largest
    with tqdm(total=len(df)) as pbar:
        for i, cdf in enumerate(chunker(df, chunksize)):
            replace = "replace" if i == 0 else "append"
            
            cdf.to_sql(name='lars', con=engine, if_exists = 'append', index=False)
            
            pbar.update(chunksize)

# ---
engine = create_engine('mysql+mysqldb://root:@localhost:3306/hmda', echo = False)

insert_with_progress(df, engine)

 39%|███████████████████████████▎                                          | 2615730/6707650 [11:34<18:00, 3787.70it/s]Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x0000026D100AB840>
Traceback (most recent call last):
  File "C:\Users\Matt\AppData\Local\conda\conda\envs\lsci-gpu\lib\weakref.py", line 357, in remove
    self = selfref()
KeyboardInterrupt
 45%|███████████████████████████████▊                                      | 3044978/6707650 [13:30<15:58, 3822.80it/s]


KeyboardInterrupt: 

In [17]:
df.to_csv(filename + '.csv', index=False)

### Upload file directly

In [22]:
ndf = pd.DataFrame([], columns = df.columns)

In [30]:
df[df['SEX_COAPPLICANT'].str.len() > 0]

Unnamed: 0,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
0,1990,0000035301,B,1,2,1,00010,1,,,...,8,2,4,0039,0,,,,,1
1,1990,0000035301,B,1,2,1,00004,1,,,...,8,1,4,0032,0,,,,,2
2,1990,0000035301,B,1,2,1,00008,1,,,...,8,2,4,0035,0,,,,,3
3,1990,0000035301,B,1,2,1,00005,1,,,...,8,1,4,0030,0,,,,,4
4,1990,0000035301,B,1,2,1,00011,1,,,...,5,1,2,0130,0,,,,,5
5,1990,0000035301,B,1,2,1,00084,1,,,...,5,1,2,0250,0,,,,,6
6,1990,0000035301,B,1,2,1,00005,1,,25,...,8,1,4,0039,0,,,,,7
7,1990,0000035301,B,1,2,1,00001,1,,25,...,8,1,4,0035,0,,,,,8
8,1990,0000035301,B,1,2,1,00006,3,1120,25,...,5,2,1,0024,0,3,,,,9
9,1990,0000035301,B,1,2,1,00030,1,1120,25,...,5,1,2,0082,0,,,,,10


In [16]:
errordf = pd.DataFrame([], columns = df.columns)
cleandf = pd.DataFrame([], columns = df.columns)

errordf = df[df['RACE_APPLICANT'].str.len() > 1]
#cleandf = df[df['RACE_APPLICANT'].str.len() <= 1]


print("Errors")
print(errordf)

    

MemoryError: 

In [29]:
df = df.replace('\\', '|')

In [32]:
for n,i in df.loc[1025734].items():
    print(n, i, len(i))

ASOF_DATE 1990 4
RESP_ID 0000015080 10
AGENCY_CODE 1 1
LOAN_TYPE 1 1
LOAN_PURPOSE 2 1
OCCUPANCY 1 1
LOAN_AMOUNT 00045 5
ACTION_TYPE 1 1
PROPERTY_MSA NA   4
STATE_CODE NA 2
COUNTY_CODE  0
CENSUS_TRACT_NUMBER  0
RACE_APPLICANT  0
RACE_COAPPLICANT  0
SEX_APPLICANT  0
SEX_COAPPLICANT  0
APPLICANT_INCOME  0
PURCHASER_TYPE  0
DENIAL_REASON_1  0
DENIAL_REASON_2  0
DENIAL_REASON_3  0
EDIT_STATUS  0
SEQUENCE_NUMBER  0


In [33]:
df.loc[1025734]

ASOF_DATE                    1990
RESP_ID                0000015080
AGENCY_CODE                     1
LOAN_TYPE                       1
LOAN_PURPOSE                    2
OCCUPANCY                       1
LOAN_AMOUNT                 00045
ACTION_TYPE                     1
PROPERTY_MSA                 NA  
STATE_CODE                     NA
COUNTY_CODE                      
CENSUS_TRACT_NUMBER              
RACE_APPLICANT                   
RACE_COAPPLICANT                 
SEX_APPLICANT                    
SEX_COAPPLICANT                  
APPLICANT_INCOME                 
PURCHASER_TYPE                   
DENIAL_REASON_1                  
DENIAL_REASON_2                  
DENIAL_REASON_3                  
EDIT_STATUS                      
SEQUENCE_NUMBER                  
Name: 1025734, dtype: object

In [40]:
df.loc[df['SEQUENCE_NUMBER'] == '', 'SEQUENCE_NUMBER']

1025734    
1025735    
1025736    
1287040    
2197287    
2199711    
2203975    
2211015    
2213216    
2235183    
2235529    
2250298    
2262021    
2276945    
2279648    
2282130    
2571885    
2572284    
2572719    
2572845    
2573300    
2579585    
2579594    
Name: SEQUENCE_NUMBER, dtype: object

In [43]:
df

Unnamed: 0,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
0,1990,0000035301,B,1,2,1,00010,1,,,...,8,2,4,0039,0,,,,,1
1,1990,0000035301,B,1,2,1,00004,1,,,...,8,1,4,0032,0,,,,,2
2,1990,0000035301,B,1,2,1,00008,1,,,...,8,2,4,0035,0,,,,,3
3,1990,0000035301,B,1,2,1,00005,1,,,...,8,1,4,0030,0,,,,,4
4,1990,0000035301,B,1,2,1,00011,1,,,...,5,1,2,0130,0,,,,,5
5,1990,0000035301,B,1,2,1,00084,1,,,...,5,1,2,0250,0,,,,,6
6,1990,0000035301,B,1,2,1,00005,1,,25,...,8,1,4,0039,0,,,,,7
7,1990,0000035301,B,1,2,1,00001,1,,25,...,8,1,4,0035,0,,,,,8
8,1990,0000035301,B,1,2,1,00006,3,1120,25,...,5,2,1,0024,0,3,,,,9
9,1990,0000035301,B,1,2,1,00030,1,1120,25,...,5,1,2,0082,0,,,,,10


In [65]:
cleandf = df.copy()
cleandf.replace('\\', '|', inplace=True)
display(cleandf)
cleandf.loc[cleandf['SEQUENCE_NUMBER'] == '', 'SEQUENCE_NUMBER'] = 0
cleandf

Unnamed: 0,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
0,1990,0000035301,B,1,2,1,00010,1,,,...,8,2,4,0039,0,,,,,1
1,1990,0000035301,B,1,2,1,00004,1,,,...,8,1,4,0032,0,,,,,2
2,1990,0000035301,B,1,2,1,00008,1,,,...,8,2,4,0035,0,,,,,3
3,1990,0000035301,B,1,2,1,00005,1,,,...,8,1,4,0030,0,,,,,4
4,1990,0000035301,B,1,2,1,00011,1,,,...,5,1,2,0130,0,,,,,5
5,1990,0000035301,B,1,2,1,00084,1,,,...,5,1,2,0250,0,,,,,6
6,1990,0000035301,B,1,2,1,00005,1,,25,...,8,1,4,0039,0,,,,,7
7,1990,0000035301,B,1,2,1,00001,1,,25,...,8,1,4,0035,0,,,,,8
8,1990,0000035301,B,1,2,1,00006,3,1120,25,...,5,2,1,0024,0,3,,,,9
9,1990,0000035301,B,1,2,1,00030,1,1120,25,...,5,1,2,0082,0,,,,,10


Unnamed: 0,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
0,1990,0000035301,B,1,2,1,00010,1,,,...,8,2,4,0039,0,,,,,1
1,1990,0000035301,B,1,2,1,00004,1,,,...,8,1,4,0032,0,,,,,2
2,1990,0000035301,B,1,2,1,00008,1,,,...,8,2,4,0035,0,,,,,3
3,1990,0000035301,B,1,2,1,00005,1,,,...,8,1,4,0030,0,,,,,4
4,1990,0000035301,B,1,2,1,00011,1,,,...,5,1,2,0130,0,,,,,5
5,1990,0000035301,B,1,2,1,00084,1,,,...,5,1,2,0250,0,,,,,6
6,1990,0000035301,B,1,2,1,00005,1,,25,...,8,1,4,0039,0,,,,,7
7,1990,0000035301,B,1,2,1,00001,1,,25,...,8,1,4,0035,0,,,,,8
8,1990,0000035301,B,1,2,1,00006,3,1120,25,...,5,2,1,0024,0,3,,,,9
9,1990,0000035301,B,1,2,1,00030,1,1120,25,...,5,1,2,0082,0,,,,,10


In [55]:
cleandf.iloc[cleandf['SEQUENCE_NUMBER'] == '', 'SEQUENCE_NUMBER']

AttributeError: 'int' object has no attribute 'iloc'

In [66]:
cleandf[cleandf['SEQUENCE_NUMBER'] == '']

Unnamed: 0,ASOF_DATE,RESP_ID,AGENCY_CODE,LOAN_TYPE,LOAN_PURPOSE,OCCUPANCY,LOAN_AMOUNT,ACTION_TYPE,PROPERTY_MSA,STATE_CODE,...,RACE_COAPPLICANT,SEX_APPLICANT,SEX_COAPPLICANT,APPLICANT_INCOME,PURCHASER_TYPE,DENIAL_REASON_1,DENIAL_REASON_2,DENIAL_REASON_3,EDIT_STATUS,SEQUENCE_NUMBER
