This is a DataLoader that helps you read and clean data from: Stock ticker excel files(company name, ticker, exchange) and Patent applicant CSV files(companies applying for patents). Exploration Phase

In [2]:

import pandas as pd
import logging
from pathlib import Path
import re, unicodedata

pd.set_option("display.max_columns", None)


In [3]:
#All the file paths 
BASE_DIR      = Path.cwd().parents[0]      #set the cuurent directory to the root of the project folder
RAW_DATA_DIR  = BASE_DIR / "data" / "raw"    
PROCESSED_DIR = BASE_DIR / "data" / "processed"

stock_xlsx  = RAW_DATA_DIR / "Yahoo Ticker Symbols - September 2017.xlsx"
patent_csv  = PROCESSED_DIR / "table1.csv"

stock_xlsx, patent_csv








(WindowsPath('c:/Users/shresthn/Desktop/patent_value_project/data/raw/Yahoo Ticker Symbols - September 2017.xlsx'),
 WindowsPath('c:/Users/shresthn/Desktop/patent_value_project/data/processed/table1.csv'))

Play Around with the Stock Data/Clean 


In [4]:
raw_stock = pd.read_excel(stock_xlsx, header= None, nrows = 6) 
raw_stock

Unnamed: 0,0,1,2,3,4,5,6,7
0,Yahoo Stock Tickers,,,,,,,
1,http://investexcel.net,,,,,,,
2,,,,,,,,
3,Ticker,Name,Exchange,Category Name,Country,,,
4,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA,,,Samir Khan
5,AAPL,Apple Inc.,NMS,Electronic Equipment,USA,,,simulationconsultant@gmail.com


In [5]:
#Load the sheet correctly
stocks =  pd.read_excel(stock_xlsx, header=3)   #name of the column starts from row 3
print(stocks.shape)
stocks.head()

(106328, 8)


Unnamed: 0,Ticker,Name,Exchange,Category Name,Country,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA,,,Samir Khan
1,AAPL,Apple Inc.,NMS,Electronic Equipment,USA,,,simulationconsultant@gmail.com
2,BAC,Bank of America Corporation,NYQ,Money Center Banks,USA,,,
3,AMZN,"Amazon.com, Inc.",NMS,Catalog & Mail Order Houses,USA,,,This ticker symbol list was downloaded from
4,T,AT&T Inc.,NYQ,Telecom Services - Domestic,USA,,,http://investexcel.net/all-yahoo-finance-stock...


In [6]:
col_we_want =  ["Ticker", "Name", "Exchange", "Category Name", "Country"]
stocks = stocks[col_we_want].copy()
stocks.head()

Unnamed: 0,Ticker,Name,Exchange,Category Name,Country
0,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA
1,AAPL,Apple Inc.,NMS,Electronic Equipment,USA
2,BAC,Bank of America Corporation,NYQ,Money Center Banks,USA
3,AMZN,"Amazon.com, Inc.",NMS,Catalog & Mail Order Houses,USA
4,T,AT&T Inc.,NYQ,Telecom Services - Domestic,USA


In [7]:
#rename the columns 

stocks.columns = ["ticker", "company_name", "exchange", "category", "country"]
stocks.head()

Unnamed: 0,ticker,company_name,exchange,category,country
0,OEDV,"Osage Exploration and Development, Inc.",PNK,,USA
1,AAPL,Apple Inc.,NMS,Electronic Equipment,USA
2,BAC,Bank of America Corporation,NYQ,Money Center Banks,USA
3,AMZN,"Amazon.com, Inc.",NMS,Catalog & Mail Order Houses,USA
4,T,AT&T Inc.,NYQ,Telecom Services - Domestic,USA


In [8]:
# Drop empty or NaN tickers/companies
before = len(stocks)
print(before)
stocks = stocks.dropna(subset=["ticker","company_name"])
stocks = stocks[(stocks['ticker'].str.strip() != '') & (stocks['company_name'].str.strip() != '')]

after = len(stocks)
print(after)

106328
96676


In [9]:
#How many companies per exchange 
stocks['exchange'].value_counts().head()


#Which country dominates
stocks['country'].value_counts().head()






country
USA        22169
Germany    21365
France     11176
India       8984
Canada      4145
Name: count, dtype: int64

Play Around with the Patent Data/Clean 


In [10]:
patents = pd.read_csv(patent_csv)
print(patents.shape)
patents.head()

(556194, 5)


Unnamed: 0,Company,Patents_Applied_2010,Patents_Applied_2011,Patents_Applied_2012,Patents_Applied_2013
0,"!MAGINETHIS RENOVATIONS, LLC",1,0,0,0
1,#3248362 NOVA SCOTIA LIMITED,0,1,0,0
2,"& DEVELOPMENT CENTRE, METAL INDUSTRIES RESEARCH",1,0,0,0
3,"& DEVELOPMENT FOUNDATION LTD., TECHNION RESEARCH",1,0,0,0
4,"& TECHNOLOGY CORPORATION, TOSHIBA LIGHTING",1,0,0,0


In [11]:

#Sum the yearly columns 
year_cols = [c for c in patents.columns if c.startswith("P")]
patents["total_patents"] = patents[year_cols].sum(axis=1)
patents

Unnamed: 0,Company,Patents_Applied_2010,Patents_Applied_2011,Patents_Applied_2012,Patents_Applied_2013,total_patents
0,"!MAGINETHIS RENOVATIONS, LLC",1,0,0,0,1
1,#3248362 NOVA SCOTIA LIMITED,0,1,0,0,1
2,"& DEVELOPMENT CENTRE, METAL INDUSTRIES RESEARCH",1,0,0,0,1
3,"& DEVELOPMENT FOUNDATION LTD., TECHNION RESEARCH",1,0,0,0,1
4,"& TECHNOLOGY CORPORATION, TOSHIBA LIGHTING",1,0,0,0,1
...,...,...,...,...,...,...
556189,"Åman Bergkvist, Ivar",1,0,0,0,1
556190,"Åsberg, Kenneth",1,0,0,0,1
556191,"Åslund, Bengt Leonard",0,1,0,0,1
556192,"Åstrand, Daniel",1,0,0,0,1


In [None]:
#Sort by activity
patents = patents.sort_values('total_patents', ascending=False)
# Average total_patents for the top 1 000 companies
avg_top1000 = (
    patents               
        .head(1000)       # grab the first 1 000 rows
        ["total_patents"] 
        .mean()           # arithmetic mean
)

print(f"Average patents per company (top 1 000): {avg_top1000:,.1f}")


Average patents per company (top 1 000): 32.6


In [13]:
#rename the company names
patents = patents.rename(columns={"Company": "company", "Patents_Applied_2010": "y2010", "Patents_Applied_2011": "y2011", "Patents_Applied_2012": "y2012", "Patents_Applied_2013": "y2013" })
patents.head(10)

Unnamed: 0,company,y2010,y2011,y2012,y2013,total_patents
182904,"Halliburton Energy Services, Inc.",32,72,208,419,731
354780,Nestec S.A.,170,187,177,110,644
425900,SNECMA,114,114,122,133,483
433953,Sanofi-Aventis Deutschland GmbH,164,184,104,0,452
25584,BASF SE,153,57,90,99,399
360703,Novartis AG,88,86,95,94,363
452976,Siemens Aktiengesellschaft,88,72,93,95,348
94372,Colgate-Palmolive Company,118,84,79,40,321
119836,Dow Global Technologies LLC,51,57,78,96,282
25777,BAYER INTELLECTUAL PROPERTY GMBH,21,82,105,25,233


In [14]:
#joint table1 and stock by company_names to see how many exact matches you get
exact_matches = pd.merge(
    patents,
    stocks,
    left_on='company',          
    right_on='company_name',
    how='inner',
    suffixes=('_patstat', '_stock')   
)

print(f"Exact matches found: {len(exact_matches)}")

# show both company name columns alongside ticker info
exact_matches[['company', 'company_name', 'ticker', 'exchange']].head(100)



Exact matches found: 4672


Unnamed: 0,company,company_name,ticker,exchange
0,BASF SE,BASF SE,BASFY,PNK
1,BASF SE,BASF SE,BAS.DE,GER
2,BASF SE,BASF SE,BFA.L,LSE
3,BASF SE,BASF SE,BASA.F,FRA
4,BASF SE,BASF SE,BFFAF,PNK
...,...,...,...,...
95,"Sumitomo Electric Industries, Ltd.","Sumitomo Electric Industries, Ltd.",SMTOY,PNK
96,"Sumitomo Electric Industries, Ltd.","Sumitomo Electric Industries, Ltd.",SMTOF,PNK
97,"Sumitomo Electric Industries, Ltd.","Sumitomo Electric Industries, Ltd.",SMO.F,FRA
98,E. I. du Pont de Nemours and Company,E. I. du Pont de Nemours and Company,DD,NYQ


In [15]:
def clean_text(text):
    if pd.isna(text) or not text:
        return ""
    s = str(text).lower().strip()
    # expand & + @
    s = s.replace('&',' and ').replace('+',' plus ').replace('@',' at ')
    # strip accents
    s = unicodedata.normalize('NFD', s)
    s = "".join(ch for ch in s if unicodedata.category(ch) != 'Mn')
    # drop punctuation
    s = re.sub(r'[^\w\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s


In [16]:
suffixes = ["inc","incorporated","corp","ltd","llc","co","company","group","technologies","international"]
# sort longest first:
suffixes = sorted(suffixes, key=len, reverse=True)

def remove_suffixes(s):
    for suf in suffixes:
        s = re.sub(rf'\b{suf}\b','', s)
    return re.sub(r'\s+',' ', s).strip()


In [17]:
clean = clean_text("Foo Technologies, Inc.")
remove_suffixes(clean)  
# → 'foo'


'foo'

In [18]:
abbrev_map = {
  'ibm': 'international business machines',
  'hp':  'hewlett packard',
  'msft':'microsoft',
  'aapl':'apple',
  
}

# longest first
items = sorted(abbrev_map.items(), key=lambda x: len(x[0]), reverse=True)

def apply_abbrevs(s):
    for abbr, full in items:
        # whole-word replace
        s = re.sub(rf'\b{re.escape(abbr)}\b', full, s)
    return s


In [19]:
s = clean_text("Apple Inc. (AAPL)")
s = remove_suffixes(s)
apply_abbrevs(s)
# → 'apple'


'apple apple'