In [1]:
import pandas as pd
import numpy as np
import re
import string

# Read CSV To Dataframe

In [18]:
df = pd.read_csv('Data/library_json_title_database_use.csv', sep='\t')
df.head()


Unnamed: 0,titleId,workTitle,subjects,publicationDate,notes,thumbnail,holdings
0,2,"Komplex, Archetypus, Symbol in der Psychologie...","Jung, C. G. (Carl Gustav), 1875-1961, Psychoan...",1959,General Note: Includes index.,https://files.lib.byu.edu/search/thumbnails/ca...,BF 174 .J85 J2936
1,5242885,Hexen- und Dämonenglaube im Lande Braunschweig...,"Witchcraft -- Germany -- Braunschweig Region, ...",©1997,,,BF 1583 .L44 1997
2,5505056,Qualitative strategies for ethnocultural resea...,"Psychology -- Qualitative research, Ethnology ...",©2012,Choice Review: This is an important book edite...,https://www.syndetics.com/index.php?isbn=97814...,BF 76.5 .Q355 2012
3,2228271,"art of persuading people,","Selling, Psychology, Applied",1938,"General Note: ""First edition.""",,BF 636 .W6
4,5505076,Trauma and organizations /,"Associations, institutions, etc, Stress (Psych...",2012,Summary: This collection of new contributions ...,https://www.syndetics.com/index.php?isbn=97818...,BF 175.5 .P75 T73x 2012


## Drop all columns excpet 'subjects' and 'holdings', Rename columns

In [19]:
df = df.drop(['titleId', 'workTitle', 'publicationDate', 'notes', 'thumbnail'], axis=1)
df = df.rename(columns={'subjects':'Subject_Headers', 'holdings':'Call_Number'})
df.head()

Unnamed: 0,Subject_Headers,Call_Number
0,"Jung, C. G. (Carl Gustav), 1875-1961, Psychoan...",BF 174 .J85 J2936
1,"Witchcraft -- Germany -- Braunschweig Region, ...",BF 1583 .L44 1997
2,"Psychology -- Qualitative research, Ethnology ...",BF 76.5 .Q355 2012
3,"Selling, Psychology, Applied",BF 636 .W6
4,"Associations, institutions, etc, Stress (Psych...",BF 175.5 .P75 T73x 2012


14587


## Optional: Split on first subject header.

In [8]:
for i, row in df.iterrows():
    subj = row['Subject_Headers']

    res = re.split(', |-- ', subj)
    print(res[0])
    
    df.at[i, 'Subject_Headers'] = res[0]

Jung
Witchcraft 
Psychology 
Selling
Associations
Freud
Infant psychology
Underachievers
Fludd
Behavioral Electronics
Temperament in children 
Phrenology
Genetic psychology
Personality tests 
Cognitive psychology 
Social perception in children
Expectation (Psychology)
Cognition
Developmental psychology
Conditioned response
Intellect 
None
Psychological tests
Success
Authority
Psychoanalysis
Witchcraft 
Brainwashing
Human behavior
Psychoanalysis and philosophy
Trials (Witchcraft) 
Social skills in children 
Electronic behavior control
Freud
Psychoanalysis 
Psychology
Developmental psychology
Maturation (Psychology) 
Ability
Personality
Intellect
Short-term memory
Factor analysis
Self
Cognitive styles
Clairvoyance
Creative ability 
Children 
Self-actualization (Psychology)
Consciousness
Psychology 
Comprehension 
Freud
Decision making
Christian life
Motor learning
Aerobic exercises 
Cognition
Memory
Psychopharmacology 
Clarke
Clinical child psychology 
Interpersonal communication
Verbal 

In [9]:
df.head()

Unnamed: 0,Subject_Headers,Call_Number
0,Jung,BF 174 .J85 J2936
1,Witchcraft,BF 1583 .L44 1997
2,Psychology,BF 76.5 .Q355 2012
3,Selling,BF 636 .W6
4,Associations,BF 175.5 .P75 T73x 2012


## Clean up data (Remove punctuation, standardize capitals, etc.)

In [10]:
# Remove all punctuation from Subject Headers and lower case all
df['Subject_Headers'] = df['Subject_Headers'].str.replace('[{}]'.format(string.punctuation), '')
df['Subject_Headers'] = df['Subject_Headers'].str.lower()
df.head()

  df['Subject_Headers'] = df['Subject_Headers'].str.replace('[{}]'.format(string.punctuation), '')


Unnamed: 0,Subject_Headers,Call_Number
0,jung,BF 174 .J85 J2936
1,witchcraft,BF 1583 .L44 1997
2,psychology,BF 76.5 .Q355 2012
3,selling,BF 636 .W6
4,associations,BF 175.5 .P75 T73x 2012


In [11]:
# Drop any entries where call number = none
df = df.drop(df[df.Call_Number == "None"].index)

In [12]:
# Remove call numbers that are not BF's
remove_indicies = []

for i, row in df.iterrows():

    curr_num = row['Call_Number']
    #print(curr_num)

    if curr_num[0] + curr_num[1] != "BF":
        print("Non-BF Call Number found, removing from dataset")
        print(curr_num)
        remove_indicies.append(i)
    else: # Fix spacing of BF without space for regex standardization
        if curr_num[2] != " ":
            df.at[i, 'Call_Number'] = curr_num[:2] + " " + curr_num[2:]

df = df.drop(remove_indicies)

Non-BF Call Number found, removing from dataset
HD 38 .S386 1981
Non-BF Call Number found, removing from dataset
B 105 .R25 C44
Non-BF Call Number found, removing from dataset
170 Sm44L 1887
Non-BF Call Number found, removing from dataset
BD 236 .I42 1995
Non-BF Call Number found, removing from dataset
RB 127 .P85 1986
Non-BF Call Number found, removing from dataset
ND 1495 .P8 S95 2003
Non-BF Call Number found, removing from dataset
AC 1 .G72 1982 vol.53
Non-BF Call Number found, removing from dataset
HQ 1064 .U5 A643 1985
Non-BF Call Number found, removing from dataset
C197do
Non-BF Call Number found, removing from dataset
RA565 .E47
Non-BF Call Number found, removing from dataset
BX 8645.6 .T86 2005
Non-BF Call Number found, removing from dataset
AC 1 .G72 1982 vol.53
Non-BF Call Number found, removing from dataset
BL 1135 .P7 A22 1972 vol.1 pt.1
Non-BF Call Number found, removing from dataset
QL 750 .T562x
Non-BF Call Number found, removing from dataset
AC 1 .G72 1982 vol.53
Non-BF

In [13]:
# Ensure Cutter number is spaced 
for i, row in df.iterrows():
    curr_num = row['Call_Number']

    bf_split = curr_num.split(' ')[1]

    check_split = bf_split.split('.')

    # If len <= 1, ignore this entry
    if len(check_split) > 1:
        # Rewrite this entry to separate cutter number from this entry
        if check_split[1][0].isalpha():
            print("Found entry to change")
            print(curr_num)

            new_call = ""
            for j, split in enumerate(curr_num.split(' ')):
                if j == 0:
                    new_call += split + ' '
                elif j != 1:
                    new_call += ' ' + split
                else:
                    new_call += check_split[0] + ' .' + check_split[1]

            df.at[i, 'Call_Number'] = new_call


Found entry to change
BF 720.V57 H34
Found entry to change
BF 720.S63 I53
Found entry to change
BF 720.M68 E84 1992
Found entry to change
BF 109.F74 G76 1991
Found entry to change
BF 108.R8 B3
Found entry to change
BF 720.S48 R64
Found entry to change
BF 108.S65 K68 1984
Found entry to change
BF 789.D4 A58x
Found entry to change
BF 720.P37 E67
Found entry to change
BF 463.S64 K6x 1987
Found entry to change
BF 720.S63 S625
Found entry to change
BF 789.D4 S33 1986
Found entry to change
BF 720.C63 I53x
Found entry to change
BF 463.S64 G6713 1992
Found entry to change
BF 789.D5 S31x 1992
Found entry to change
BF 720.V57 B76
Found entry to change
BF 789.S6 A83 1982
Found entry to change
BF 456.R2 I54
Found entry to change
BF 720.M45 I54 1984
Found entry to change
BF 720.A92 N53 1985
Found entry to change
BF 456.R2 O77x
Found entry to change
BF 789.F5 S34 1975
Found entry to change
BF 456.R2 G63 v.1
Found entry to change
BF 789.S8 B37x
Found entry to change
BF 456.R2 P41981
Found entry to ch

In [14]:
# Rewrite Call_Numbers to be just numeric value up to cutter number
for i, row in df.iterrows():
    curr_num = row['Call_Number']
    print(curr_num)

df.head()

BF 174 .J85 J2936
BF 1583 .L44 1997
BF 76.5 .Q355 2012
BF 636 .W6
BF 175.5 .P75 T73x 2012
BF 121 .N94 1991
BF 723 .I6 W5
BF 723 .P365 R56x 1986
BF 1598 .F58 H83 1988
BF 210 .S72x 1972
BF 723 .T53 K345 2004
BF 870 .F7x 1847
BF 711 .D4
BF 698.5 .P38 2004
BF 201 .U88 2004
BF 723 .S6 D86x 1988b
BF 323 .E8 R43 2004
BF 311 .A93
BF 713 .M332 2005
BF 319 .P68
BF 431 .L45x 1985
BF 176 .F86 1986
BF 637 .S8 T518 1986
BF 18.02 .H84 1990
BF 175 .B87x 2003
BF 1583 .W65 1995
BF 633 .M4
BF 632.5 .C37
BF 175 .B67x 1985
BF 1576 .H633x 1996
BF 723 .S62 J25 1983 vol.1
BF 210 .U846x 1993
BF 173 .F85 W8x 1985
BF 173 .H45x 1985
BF 121 .C58 1974
BF 713 .S84 1986
BF 710 .K53 1988
BF 431 .A579 1993
BF 698 .P47x
BF 431 .G244 1983
BF 378 .S54 B33 1986
BF 697 .H373x 1998
BF 311 .R535x 1998
BF 1325 .B48
BF 408 .G33 1993
BF 432 .C48 S28 1990
BF 637 .S4 S29x 1988
BF 311 .N487x 1992
BF 113 .S35x 1959
BF 325 .P68 1992
BF 173 .P7756 1992
BF 448 .P56 1993
BF 639 .G6
BF 295 .D75
BF 311 .M433 1984
BF 371 .K533 1984
BF 207 

Unnamed: 0,Subject_Headers,Call_Number
0,jung,BF 174 .J85 J2936
1,witchcraft,BF 1583 .L44 1997
2,psychology,BF 76.5 .Q355 2012
3,selling,BF 636 .W6
4,associations,BF 175.5 .P75 T73x 2012


In [15]:
# Rewrite Call_Numbers to be just numeric value up to cutter number
for i, row in df.iterrows():
    curr_num = row['Call_Number']

    df.at[i, 'Call_Number'] = curr_num.split(' ')[1]

df.head()


Unnamed: 0,Subject_Headers,Call_Number
0,jung,174.0
1,witchcraft,1583.0
2,psychology,76.5
3,selling,636.0
4,associations,175.5


## Write output to CSV File for Training

In [16]:
df = df.rename(columns={'Subject_Headers':'Title', 'Call_Number':'Call_Number'})
df.head()

Unnamed: 0,Title,Call_Number
0,jung,174.0
1,witchcraft,1583.0
2,psychology,76.5
3,selling,636.0
4,associations,175.5


In [17]:
df.to_csv('Data/BF_Single_Subject_Clean_Call_Number.csv', sep='\t')