In [1]:
import numpy as np
import pandas as pd
import json
import gzip
from collections import Counter 

In [2]:
df = pd.read_json('../data/original/DNA_DATA_FULL.gz', compression='gzip')

<h2><b>Getting only the columns that deal with the company codes</b></h2>

In [3]:
#Looking only at the company columns
companies = df[['company_codes', 'company_codes_occur', 'company_codes_about', 'company_codes_lineage', 'company_codes_association', 'company_codes_relevance']]

In [4]:
#There are no values in this column so it will not be part of the validating process
print(companies['company_codes_association'].value_counts())
companies = df[['company_codes', 'company_codes_occur', 'company_codes_about', 'company_codes_lineage', 'company_codes_relevance']]

    1942855
Name: company_codes_association, dtype: int64


<h2><b>Creating a profile table for validity</b></h2>

In [5]:
#For validating, I will be taking each unique company code in all of the columns and checking to see if each one is in the company codes dictionary
#The dataframe below will keep track of the % of valid company codes
profile = pd.DataFrame({"Validity": np.zeros(len(companies.columns))}).set_index(companies.columns)
profile

Unnamed: 0,Validity
company_codes,0.0
company_codes_occur,0.0
company_codes_about,0.0
company_codes_lineage,0.0
company_codes_relevance,0.0


<h2><b>Creating validity function</b></h2>

In [8]:
#Here is the validity function I will be using
#returns the sum of True and divides by the length of the unique list
def checkValidity(ls, col = code_dict.code.tolist()):
    return sum([code in col for code in ls]) / len(ls)

<h2><b>Getting unique codes for each column</b></h2>

In [9]:
#Getting the unique company codes
unique_company_codes = set()
for value in companies['company_codes']:
    unique_company_codes.update(value.split(","))

#Convert set back to list
unique_company_codes = list(unique_company_codes)
unique_company_codes = unique_company_codes[1:] #The first element was '', so I didn't include it in the final list
unique_company_codes = [word.upper() for word in unique_company_codes]
print(unique_company_codes[0:10])
print("There are {} unique company codes".format(len(unique_company_codes)))

['ELECNE', 'KUKXBV', 'FRWHTH', 'PFROGI', 'CHSUN', 'LZBIOC', 'TISST', 'MINDA', 'ITDRST', 'WSGRPU']
There are 73688 unique company codes


In [13]:
#Unique companies from company_codes_occur
unique_companies_occur = set()

for value in df['company_codes_occur']:
    unique_companies_occur.update(value.split(","))

unique_companies_occur = list(unique_companies_occur)
unique_companies_occur = unique_companies_occur[1:]
unique_companies_occur = [word.upper() for word in unique_companies_occur]
print(unique_companies_occur[0:10])
print("There are {} unique companies in unique_companies_occur".format(len(unique_companies_occur))) 

['USACMM', 'SXCHIC', 'HCAHN', 'HATTFN', 'DDEBCI', 'WHREIT', 'UFEDTE', 'NYXGGL', 'ADISIG', 'HKELEC']
There are 62381 unique companies in unique_companies_occur


In [14]:
#unique companies from company_codes_about
unique_companies_about = set()

for value in df['company_codes_about']:
    unique_companies_about.update(value.split(","))

unique_companies_about = list(unique_companies_about)
unique_companies_about = unique_companies_about[1:]
unique_companies_about = [word.upper() for word in unique_companies_about]
print(unique_companies_about[0:10])
print("There are {} unique companies in unique_companies_about".format(len(unique_companies_about)))

['USACMM', 'BRKLC', 'SXCHIC', 'HATTFN', 'WHREIT', 'UFEDTE', 'AMRTSL', 'DHOSPC', 'PANCHN', 'UNGLOC']
There are 30780 unique companies in unique_companies_about


In [42]:
#unique companies from company_codes_relevance
unique_companies_relevance = set()

for value in df['company_codes_relevance']:
    unique_companies_relevance.update(value.split(","))

unique_companies_relevance = list(unique_companies_relevance)
unique_companies_relevance = unique_companies_relevance[1:]
unique_companies_relevance = [word.upper() for word in unique_companies_relevance]
print(unique_companies_relevance[0:10])
print("There are {} unique companies in unique_companies_relevance".format(len(unique_companies_relevance)))


['USACMM', 'SXCHIC', 'HCAHN', 'HATTFN', 'EESYSI', 'CYTVAB', 'WHREIT', 'UFEDTE', 'QMTLCI', 'NYXGGL']
There are 66451 unique companies in unique_companies_relevance


In [16]:
#unique companies from company_codes_lineage
unique_companies_lineage = set()

for value in df['company_codes_lineage']:
    unique_companies_lineage.update(value.split(","))

unique_companies_lineage = list(unique_companies_lineage)
unique_companies_lineage = unique_companies_lineage[1:]

#Convert to uppercase bc data dictionary has all codes in upper case
unique_companies_lineage = [word.upper() for word in unique_companies_lineage]
print(unique_companies_lineage[0:10])
print("There are {} unique companies in unique_companies_lineage".format(len(unique_companies_lineage)))

['MMRRDC', 'TUDB', 'CHINAC', 'MZZNML', 'BARDCR', 'TMMHLP', 'HILIND', 'HEALLC', 'GLBLHC', 'LINPL']
There are 3467 unique companies in unique_companies_lineage


<h2><b>Loading in company code dictionary</b></h2>


In [7]:
#Uploading the data dictionary into a dataframe
code_dict = pd.read_csv("../data/original/companies.csv")

<h2><b>Checking validity for each column and applying the result to the profile table</b></h2>

In [23]:
#print(checkValidity(unique_companies_lineage))
profile.iloc[3] = checkValidity(unique_companies_lineage)

In [17]:
profile.iloc[0] = checkValidity(unique_company_codes)

In [21]:
profile.iloc[2] = checkValidity(unique_companies_about)

In [19]:
profile.iloc[1] = checkValidity(unique_companies_occur)

In [43]:
profile.iloc[4] = checkValidity(unique_companies_relevance)

<h2><b>Company Code Validity Results</b></h2>

In [81]:
profile

Unnamed: 0,Validity
company_codes,0.868595
company_codes_occur,0.89165
company_codes_about,0.994087
company_codes_lineage,0.993654
company_codes_relevance,0.860634


<h2><b>Getting the invalid company codes</b></h2>

In [10]:
#Getting all the invalid company codes
invalid_company_codes = np.array([])
for co in unique_company_codes:
    if co not in code_dict.code.tolist():
        invalid_company_codes = np.append(invalid_company_codes, co)

In [11]:
print("There are {} company codes in the company codes column that are not in the dictionary".format(len(invalid_company_codes)))

There are 9683 company codes in the company codes column that are not in the dictionary


<h2><b>Double checking to make sure that no codes in the invalid_company_codes list are valid (Should get 0% valid)</b></h2>

In [15]:
print(checkValidity(invalid_company_codes))

0.0


<h2><b>Listing some of the invalid codes (aka codes in the dataset but not in the dictionary)</b></h2>

In [14]:
invalid_company_codes[0:30]

array(['KUKXBV', 'AMSFFRA', 'LUNFCI', 'GANCMM', 'INSTLC', 'BZHTAKX',
       'BFLMII', 'LVDTIN', 'DALNGEZ', 'CUCMAL', 'CRPZJHJ', 'AHMUCPH',
       'CADRHL', 'RHALUM', 'AWTRA', 'AKVYHCU', 'ORTHVT', 'KLINGC',
       'KLICO', 'REESSQ', 'WELHGL', 'MOCENU', 'APPUDYG', 'COJZCJA',
       'CSUFFR', 'OCOUHD', 'GNTXUI', 'NOVPLL', 'NIKOUI', 'PITTOI'],
      dtype='<U32')

In [18]:
df['Row'] = np.arange(0, len(df)) 

<h2><b>Filtering through the dataset and keeping track of rows with at least one invalid company in them</b></h2>

In [160]:
#subset1 = df[0:100000]

In [164]:
invalid_row = []
for row in df.itertuples():
    for code in row.company_codes.split(","):
        if code.upper() in invalid_company_codes:
            invalid_row.append(row.Row)

In [188]:
print("There are {} rows with at least one invalid company code in the company_codes column, which is about {}% of the entire dataset".format(len(invalid_row),  len(invalid_row) / len(df) * 100))

There are 231025 rows with at least one invalid company code in the company_codes column, which is about 11.891005762138708% of the entire dataset


In [168]:
#First 100 invalid rows
print(invalid_row[0:100])

[17, 19, 35, 35, 38, 59, 67, 68, 70, 70, 72, 94, 94, 108, 121, 134, 139, 146, 146, 159, 161, 161, 164, 184, 184, 201, 203, 211, 211, 251, 251, 255, 263, 263, 263, 263, 277, 283, 316, 322, 331, 341, 347, 379, 379, 390, 412, 428, 434, 434, 434, 434, 439, 458, 460, 462, 462, 464, 509, 519, 532, 538, 547, 560, 577, 612, 613, 629, 639, 639, 645, 653, 653, 653, 677, 695, 703, 714, 714, 735, 746, 746, 761, 770, 780, 780, 831, 831, 832, 832, 872, 878, 884, 890, 890, 890, 906, 906, 921, 923]
