In [1]:
import numpy as np
import pandas as pd
import string
import re
from fuzzywuzzy import fuzz 
from nltk.metrics import * 



<h2>Reading in the datasets</h2>

In [2]:
dna = pd.read_csv("../data/working/validcompaniesdictionary.csv", index_col = [0])

In [3]:
fda = pd.read_excel("../data/original/fda_companies.xlsx")

In [47]:
ndc = pd.read_excel("../data/original/BI DSPG Company Datasets/NDC_Company_Dataset.xls")

<h2>Neil's code for cleaning</h2>

In [5]:
removeset=string.punctuation
removeset=removeset.replace("-","") #Don't remove dashes
removeset=removeset.replace("&","") #Don't remove ampersand
removeset=removeset.replace("_","") #Don't remove underscore
removeset=removeset.replace("%","") #Don't remove percent
removeset=removeset.replace("$","") #Don't remove dollar
   
print(removeset)

!"#'()*+,./:;<=>?@[\]^`{|}~


In [183]:
# remove all single characters (This step is done first, because later there are single chars we want to retain.)
#document = re.sub(r'\s+[a-zA-Z]\s+', ' ', str(X[sen]))
string = "hello i world"
string = re.sub(r'\s+[a-zA-Z]\s+', ' ', string)
print(string)

# remove all numbers
#document = re.sub(r'[0-9]','', document)
string2 = "h3ll0"
string2 = re.sub(r'[0-9]','', string)
print(string2)

# Substituting multiple spaces with single space
#document = re.sub(r'\s+', ' ', document, flags=re.I)
string3 = "hello      world"
string3 = re.sub(r'\s+', ' ', string3, flags=re.I)
print(string3)

#Converting to lowercase
string4 = "HEllo WORLD"
string4 = string4.lower()
print(string4)

#Removing prefixed 'b'
#document = re.sub(r'^b\s+', '', document)
string5 = "b hello world"
string5 = re.sub(r'^b\s+', '', string5)
print(string5)

#Make dashes into combined words
#document = re.sub(r'\s-\s+', '-', document)
string6 = "hello - world"
string6 = re.sub(r'\s-\s+', '-', string6)
print(string6)

#Make ampersand into combined words
#document = re.sub(r'\s&\s+', '&', document)
string7 = "hel & lo & world"
string7 = re.sub(r'\s&\s+', '&', string7)
print(string7)

#Make underscore into combined words
#document = re.sub(r'\s_\s+', '_', document)
string8 = "hel _ lo wo _ rld"
string8 = re.sub(r'\s_\s+', '_', string8)
print(string8)

hello world
hello world
hello world
hello world
hello world
hello-world
hel&lo&world
hel_lo wo_rld


In [77]:
#removes all punctuation in string that is in removeset
document = "Johnson+;Johnson!"
for i in removeset:
    document=re.sub(re.escape(i),"",document)
print(document)

JohnsonJohnson


<h2>Cleaning NDC</h2>

In [48]:
ndc['originalRow'] = np.arange(0,len(ndc))

<h4>Removing the first 25 since they are just numbers</h4>

In [49]:
#Getting rid of the first 25 since those are just numbers
ndc = ndc.iloc[25:]
#renaming column
ndc = ndc.rename(columns = {'Row Labels':'company'})

In [50]:
ndc['row'] = np.arange(0,len(ndc))
ndc.set_index('row', inplace = True)

In [51]:
ndc.head()

Unnamed: 0_level_0,company,originalRow
row,Unnamed: 1_level_1,Unnamed: 2_level_1
0,SPIRONOLACTONE 2%,25
1,-L'Oreal USA Products Inc,26
2,.Cardinal Health,27
3,.Church & Dwight Canada Corp,28
4,{Preferred Pharmaeutials INC.,29


<h4>Lowercase everything</h4>

In [52]:
#Converting to lower first
ndc.company = ndc.company.str.lower()

<h4>Remove content that are in parentheses</h4>

In [53]:
#Function for removing parentheses content
def removeParenthesesContent(string):
    return re.sub(r'\([^)]*\)', '', string)


In [54]:
ndc['companiesWithoutParensContent'] = ndc['company'].apply(removeParenthesesContent)

In [55]:
del ndc['company']

<h4>Remove Unwanted Punctuation</h4>

In [56]:
#function that gets rid of unwanted punctuation
#This does get rid of ' within a string (ex. l'oreal becomes l oreal) so maybe recheck?
def removeUnwantedPunc(string):
    return re.sub('[!"#\'()*+,./:;%<=>?@[\]^`{|}~]', '', string)

In [57]:
ndc['companiesWithoutUnwantedPunc'] = ndc['companiesWithoutParensContent'].apply(removeUnwantedPunc)

In [58]:
del ndc['companiesWithoutParensContent']

In [59]:
ndc.rename(columns = {"companiesWithoutUnwantedPunc": "company"}, inplace = True)

In [60]:
#At this point all the companies are lowercased, don't have parenthetical content, and don't have unwanted punctuation
ndc.head()

Unnamed: 0_level_0,originalRow,company
row,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,spironolactone 2
1,26,-loreal usa products inc
2,27,cardinal health
3,28,church & dwight canada corp
4,29,preferred pharmaeutials inc


<h4>Removing numbers</h4>

In [61]:
#Function that will remove numbers
def removeNumbers(string):
    return re.sub(r'[0-9+]','', string)

In [62]:
ndc['companiesWithNoNumbers'] = ndc.company.apply(removeNumbers)

In [63]:
del ndc['company']

In [64]:
ndc = ndc.rename(columns = {"companiesWithNoNumbers": 'company'})

In [65]:
ndc.head()

Unnamed: 0_level_0,originalRow,company
row,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,spironolactone
1,26,-loreal usa products inc
2,27,cardinal health
3,28,church & dwight canada corp
4,29,preferred pharmaeutials inc


<h4>Removing Single Chars</h4>

In [66]:
def removeSingleChar(string):
    return re.sub(r'\s+[a-zA-Z]\s+', ' ', string)

In [67]:
ndc['companiesWithNoSingleChar'] = ndc.company.apply(removeSingleChar)

In [68]:
del ndc['company']
ndc = ndc.rename(columns={"companiesWithNoSingleChar": 'company'})

<h4>Substituting Multiple Spaces with Single Space</h4>

In [69]:
def subMultipleSpacesForOne(string):
    return re.sub(r'\s+', ' ', string, flags=re.I)

In [70]:
ndc['noMultipleSpaces'] = ndc.company.apply(subMultipleSpacesForOne)

In [71]:
del ndc['company']
ndc = ndc.rename(columns = {"noMultipleSpaces": 'company'})

<h4>Removing Prefix b</h4>

In [72]:
def removePrefix(string):
    return re.sub(r'^b\s+', '', string)

In [73]:
ndc['removedPrefix'] = ndc.company.apply(removePrefix)

In [74]:
ndc.columns

Index(['originalRow', 'company', 'removedPrefix'], dtype='object')

In [75]:
del ndc['company']
ndc = ndc.rename(columns = {"removedPrefix":"company"})

<h4>Make dashes into combined words</h4>

In [76]:
#start here
def makeDashCombined(string):
    return re.sub(r'\s-\s+', '-', string)

In [77]:
ndc['combinedDash'] = ndc.company.apply(makeDashCombined)


In [78]:
del ndc['company']
ndc = ndc.rename(columns = {"combinedDash": 'company'})

<h4>Combine Ampersand</h4>

In [79]:
def combineAmpersand(string):
    return re.sub(r'\s&\s+', '&', string)

In [80]:
ndc['combineAmp'] = ndc.company.apply(combineAmpersand)

In [81]:
ndc.columns

Index(['originalRow', 'company', 'combineAmp'], dtype='object')

In [82]:
del ndc['company']
ndc = ndc.rename(columns = {"combineAmp": 'company'})

In [83]:
ndc.columns

Index(['originalRow', 'company'], dtype='object')

<h4>Combine '_'</h4>

In [84]:
def combinedUnderScore(string):
    return re.sub(r'\s_\s+', '_', string)

In [85]:
ndc.columns

Index(['originalRow', 'company'], dtype='object')

In [86]:
ndc['combinedUnder'] = ndc.company.apply(combinedUnderScore)

In [87]:
del ndc['company']
ndc = ndc.rename(columns = {"combinedUnder": 'company'})

In [88]:
ndc.head()

Unnamed: 0_level_0,originalRow,company
row,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25,spironolactone
1,26,-loreal usa products inc
2,27,cardinal health
3,28,church&dwight canada corp
4,29,preferred pharmaeutials inc


<h2>End Initial Cleaning</h2>

<h4>Grabbing the list of legal entities from os github</h4>

In [89]:
legalEntities = pd.read_csv("https://raw.githubusercontent.com/DSPG-Young-Scholars-Program/dspg20oss/danBranch/ossPy/keyFiles/curatedLegalEntitesRaw.csv", quotechar = "'",header = None)
legalEntities.head()

Unnamed: 0,0
0,(?i) Inc\b
1,(?i) Ltd\b
2,(?i) LLC\b
3,(?i) GmbH\b
4,(?i) Corporation\b


<h4>Using Daniel's code to get remove legal entities</h4>

In [90]:
def eraseFromColumn(inputColumn, eraseList):
    "iteratively delete regex query matches from input list"
    
    """
    inputColumn -- a column from a pandas dataframe, this will be the set of
    target words/entries that deletions will be made from
    eraseList -- a column containing strings (regex expressions) which will be
    deleted from the inputColumn, in an iterative fashion
    """
    eraseList['changeNum'] = 0
    eraseList['changeIndexes'] = ''
    
    inputColumn = inputColumn.replace(regex=True, to_replace = "\\\\", value='/')
    
    for index, row in eraseList.iterrows():
        curReplaceVal = row[0]
        currentRegexExpression=re.compile(curReplaceVal)
        CurrentBoolVec=inputColumn.str.contains(currentRegexExpression, na= False)
        eraseList['changeIndexes'].iloc[index]=[i for i, x in enumerate(CurrentBoolVec) if x]
        eraseList['changeNum'].iloc[index] = len(eraseList['changeIndexes'].iloc[index])
        inputColumn.replace(regex=True, to_replace=currentRegexExpression,value='', inplace = True)
    
    return inputColumn, eraseList

In [91]:
output, eraseList = eraseFromColumn(ndc.company, legalEntities)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [92]:
#This dataframe is lowercased, doesn't have any of the unwanted punctuation, and has removed all legal entity labels
output= pd.DataFrame(output)

<h4>Counting Unique String Tokens using Daniel's code, then adding more to the list from Isabel and Susweta's lists</h4>

In [93]:
ndc_cleaned = output

In [94]:
longString = ndc_cleaned['company'].str.cat(sep = " ")

In [95]:
longStringSeperated = longString.split(' ')

In [96]:
uniqueSubTokenFrame = pd.DataFrame(longStringSeperated)

In [97]:
columnUniqueCounts = uniqueSubTokenFrame.iloc[:,0].value_counts()

In [98]:
tableUniqueCounts = columnUniqueCounts.reset_index()

In [99]:
tableUniqueCounts.rename(columns = {0: "count", "index":"token"}, inplace = True)

In [100]:
top20 = tableUniqueCounts.head(20).token.tolist()

In [101]:
#top 20 tokens in ndc
top20

['pharmaceuticals',
 'medical',
 'products',
 'laboratories',
 'pharma',
 'anda',
 'supply',
 'health',
 'pharmaceutical',
 'usa',
 'international',
 'care',
 'and',
 'nda',
 'coltd',
 'the',
 'home',
 'healthcare',
 '',
 'of']

In [102]:
#top occuring tokens from fda and dna
top20.append("group")
top20.append("holdings")
top20.append("capital")
top20.append("technologies")
top20.append("association")
top20.append('us')
top20.append('services')
top20.append("university")
top20.append("bank")
top20.append("partners")
top20.append("energy")
top20.append("systems")
top20.append("intl")
top20.append("pharms")
top20.append("american")
top20.append("national")
top20.append("biosciences")

In [103]:
#Top 20 occuring tokens in ndc plus some top occuring tokens in dna and fda
top20

['pharmaceuticals',
 'medical',
 'products',
 'laboratories',
 'pharma',
 'anda',
 'supply',
 'health',
 'pharmaceutical',
 'usa',
 'international',
 'care',
 'and',
 'nda',
 'coltd',
 'the',
 'home',
 'healthcare',
 '',
 'of',
 'group',
 'holdings',
 'capital',
 'technologies',
 'association',
 'us',
 'services',
 'university',
 'bank',
 'partners',
 'energy',
 'systems',
 'intl',
 'pharms',
 'american',
 'national',
 'biosciences']

<h4>Getting rid of top occuring tokens</h4>

In [104]:
ndc_cleaned = ndc_cleaned['company'].apply(lambda x: ' '.join([word for word in x.split() if word not in (top20)])) #Isabel's code

In [105]:
ndc_cleaned = pd.DataFrame(ndc_cleaned)
ndc_cleaned.head()

Unnamed: 0_level_0,company
row,Unnamed: 1_level_1
0,spironolactone
1,-loreal
2,cardinal
3,church&dwight canada
4,preferred pharmaeutials


<h4>Adding csv to working data</h4>

In [106]:
#match these datasets together

In [107]:
og = pd.read_excel("../data/original/BI DSPG Company Datasets/NDC_Company_Dataset.xls")

In [108]:
og['originalRow'] = np.arange(0,len(og))

In [110]:
og = og.iloc[25:]

In [111]:
og['row'] = np.arange(0, len(og))
og.set_index('row', inplace = True)

In [112]:
og['cleaned_name'] = ndc_cleaned.company.tolist()

In [114]:
og.tail(20)

Unnamed: 0_level_0,Row Labels,originalRow,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7006,"Zinka, Inc.",7031,zinka
7007,ZION HEALTH,7032,zion
7008,"Zivon Cosmetic Co., Ltd.",7033,zivon cosmetic
7009,"ZO Skin Health, Inc.",7034,zo skin
7010,Zoe Processing,7035,zoe processing
7011,"Zogics, LLC",7036,zogics
7012,ZOOM PRODUCTS LLC,7037,zoom
7013,Zoono USA,7038,zoono
7014,"ZOONO USA, LLC",7039,zoono
7015,"ZRG DETOX, INC.",7040,zrg detox


In [115]:
og = og.rename(columns = {"Row Labels": "original_company"})

In [116]:
og.tail()

Unnamed: 0_level_0,original_company,originalRow,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7021,ZYGONE,7046,zygone
7022,Zyla Life Sciences US Inc.,7047,zyla life sciences
7023,#NAME?,7048,name
7024,(blank),7049,
7025,Grand Total,7050,grand total


In [117]:
len(og)

7026

In [118]:
og.drop([7023], inplace = True)

In [119]:
og.drop([7024], inplace= True)

In [120]:
og.drop([7025], inplace = True)

In [124]:
og.head()

Unnamed: 0_level_0,original_company,originalRow,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,SPIRONOLACTONE 2%,25,spironolactone
1,-L'Oreal USA Products Inc,26,-loreal
2,.Cardinal Health,27,cardinal
3,.Church & Dwight Canada Corp,28,church&dwight canada
4,{Preferred Pharmaeutials INC.,29,preferred pharmaeutials


Row Labels     SPIRONOLACTONE 2% 
Name: 25, dtype: object

In [126]:
og.to_csv("../data/working/ndc_clean.csv")

In [127]:
x = pd.read_csv("../data/working/ndc_clean.csv", index_col = [0])

In [128]:
x.tail()

Unnamed: 0_level_0,original_company,originalRow,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7018,Zydus Pharmaceuticals USA Inc,7043,zydus
7019,Zydus Pharmaceuticals USA Inc.,7044,zydus
7020,Zydus Technologies Limited,7045,zydus
7021,ZYGONE,7046,zygone
7022,Zyla Life Sciences US Inc.,7047,zyla life sciences


In [129]:
og[og['original_company'].str.contains("Biosciences")]

Unnamed: 0_level_0,original_company,originalRow,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
407,Amneal Biosciences,432,amneal
408,Amneal Biosciences LLC,433,amneal
2745,"Greenwich Biosciences, Inc.",2770,greenwich
2875,"Harmony Biosciences, LLC",2900,harmony
3684,"Leadiant Biosciences, Inc.",3709,leadiant
4481,"Neurocrine Biosciences, Inc.",4506,neurocrine
4768,Owen Biosciences Inc,4793,owen
4769,Owen Biosciences Inc.,4794,owen
4770,"Owen Biosciences, Inc.",4795,owen


In [130]:
orig =  pd.read_excel("../data/original/BI DSPG Company Datasets/NDC_Company_Dataset.xls")

In [131]:
orig.iloc[726]

Row Labels    Arbor Pharmaceuticals
Name: 726, dtype: object