In [2]:
import numpy as np
import pandas as pd
import os

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from scipy.special import rel_entr
from scipy.stats import entropy
import scipy.stats
import math
import warnings 
warnings.filterwarnings("ignore") 

from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [3]:
# 使用GPU
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [4]:
! nvidia-smi

Sun Jun 12 19:35:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 30%   39C    P8    21W / 260W |     22MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 29%   37C    P8    25W / 260W |      8MiB / 11019MiB |      0%      Default |
|       

# MongoDB

In [5]:
from pymongo import MongoClient
import pymongo

In [6]:
mongoURI = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % ("eva", "eva_30241", "140.117.69.70:30241", "eva")

try:
    conn = pymongo.MongoClient(mongoURI)
    db = conn.eva
    # db_de = db.patent_de
    # db_us = db.patent_us
    # db_cn = db.patent_cn

    db_rm_cn_2012 = db.rm_patent_cn_2012
    db_rm_us_2012 = db.rm_patent_us_2012
    db_rm_de_2012 = db.rm_patent_de_2012

    
except errors.ConnectionFailure as err:
    print(err)

# 2012

## Company數量

In [7]:
data_assignee = db_rm_de_2012.aggregate([
    {"$unwind" : "$current_assignee"},
    {"$group" : {"_id" : "$current_assignee", "count":{"$sum":1}}},
    {"$sort": {"count":-1}},
    {"$limit":30}
])

ls_company = []
ls_count = []
for data in data_assignee:
    # print(data["_id"],data["count"])
    ls_company.append(data["_id"])
    ls_count.append(data["count"])

In [8]:
df_family_de = pd.DataFrame((zip(ls_company, ls_count)), columns = ['company', 'count'])

In [9]:
data_assignee = db_rm_cn_2012.aggregate([
    {"$unwind" : "$current_assignee"},
    {"$group" : {"_id" : "$current_assignee", "count":{"$sum":1}}},
    {"$sort": {"count":-1}},
    {"$limit":30}
])

ls_company = []
ls_count = []
for data in data_assignee:
    # print(data["_id"],data["count"])
    ls_company.append(data["_id"])
    ls_count.append(data["count"])
    
df_family_cn = pd.DataFrame((zip(ls_company, ls_count)), columns = ['company', 'count'])

In [10]:
data_assignee = db_rm_us_2012.aggregate([
    {"$unwind" : "$current_assignee"},
    {"$group" : {"_id" : "$current_assignee", "count":{"$sum":1}}},
    {"$sort": {"count":-1}},
    {"$limit":30}
])

ls_company = []
ls_count = []
for data in data_assignee:
    # print(data["_id"],data["count"])
    ls_company.append(data["_id"])
    ls_count.append(data["count"])
    
df_family_us = pd.DataFrame((zip(ls_company, ls_count)), columns = ['company', 'count'])

In [11]:
final_df = pd.concat([df_family_de,df_family_cn,df_family_us],axis=0)

In [12]:
# 取總數前15家公司
final_df.groupby("company").sum().reset_index().sort_values("count",ascending=False).head(15)

Unnamed: 0,company,count
32,Jiangsu IoT Research and Development Center,100
42,Nanjing IoT Sensor Technology Co Ltd,52
9,Cisco Technology Inc,33
8,China core Microelectronics Technology Chengdu...,25
40,NANJING AQUAPEL IOT CO Ltd,20
3,"BEIJING GOLDENET IOT TECHNOLOGY CO., LTD.",18
63,Xerox Corp,17
17,"Gaoxing Zhilian Technology Co., Ltd.",17
27,Institute of Microelectronics of CAS,13
31,Jiangsu CAS IGBT Technology Co Ltd,12


In [13]:
com_df = final_df.groupby("company").sum().reset_index().sort_values("count",ascending=False).head(19)
com_df = com_df.append({"company":"Samsung Electronics Co Ltd","count":4}, ignore_index=True)
com_df.reset_index(drop=True, inplace=True)

In [14]:
com_ls = ['Chengdu Qinchuan IoT Technology Co Ltd',
 'China Petroleum and Chemical Corp',
 'Cisco Technology Inc',
 'Gaoxing Zhilian Technology Co., Ltd.',
 'Huawei Technologies Co Ltd',
 'Institute of Microelectronics of CAS',
 'Jiangsu CAS IGBT Technology Co Ltd',
 'Jiangsu IoT Research and Development Center',
 'Kingfa Science and Technology Co Ltd',
 'Nanjing IoT Sensor Technology Co Ltd',
 'Seiko Epson Corp',
 'Shanghai Kingfa Science and Technology Co Ltd',
 'Xerox Corp',
 'ZTE Corp']

In [15]:
com_df = com_df[com_df.company.isin(com_ls)]

In [17]:
com_df["count_log"] = com_df["count"].apply(lambda x: np.log(x))
com_df.reset_index(drop=True, inplace=True)
com_df

Unnamed: 0,company,count,count_log
0,Jiangsu IoT Research and Development Center,100,4.60517
1,Nanjing IoT Sensor Technology Co Ltd,52,3.951244
2,Cisco Technology Inc,33,3.496508
3,Xerox Corp,17,2.833213
4,"Gaoxing Zhilian Technology Co., Ltd.",17,2.833213
5,Institute of Microelectronics of CAS,13,2.564949
6,Jiangsu CAS IGBT Technology Co Ltd,12,2.484907
7,Seiko Epson Corp,10,2.302585
8,ZTE Corp,10,2.302585
9,Kingfa Science and Technology Co Ltd,9,2.197225


公司名稱正規化

In [18]:
com_df["company"] = com_df.company.apply(lambda x: x.replace(" Inc","")\
                                                    .replace(" Co Ltd","")\
                                                    .replace(" Co. Ltd.","")\
                                                    .replace(" Co., Ltd.","")\
                                                    .replace(" Ltd","")\
                                                    .replace(" Corp","")\
                                                    .replace("..","")\
                                                    .replace("International Business Machines","IBM")\
                                                    .replace("Nippon Telegraph and Telephone","Nippon Telegraph & Tel")\
                                                    .replace("Alibaba Group Holding Ltd","Alibaba")\
                                                    .replace("ZTE Intelligent IoT Technology","ZTE")\
                                                    .replace("AT&T Intellectual Property I LP","AT&T")\
                                                    .replace("Microsoft Technology Licensing LLC","Microsoft")\
                                                    .replace("Telefonaktiebolaget LM Ericsson AB","Ericsson")\
                                                    .replace("Cisco Technology","Cisco Systems")\
                                                    .replace("Verizon Patent and Licensing","Verizon Communications")\
                                                    .replace("Nokia Technologies Oy","Nokia")\
                                                    .replace("China Mobile Communications Group","China Mobile")\
                                                    .replace("Nokia Solutions and Networks Oy","Nokia"))

In [19]:
com_df

Unnamed: 0,company,count,count_log
0,Jiangsu IoT Research and Development Center,100,4.60517
1,Nanjing IoT Sensor Technology,52,3.951244
2,Cisco Systems,33,3.496508
3,Xerox,17,2.833213
4,Gaoxing Zhilian Technology,17,2.833213
5,Institute of Microelectronics of CAS,13,2.564949
6,Jiangsu CAS IGBT Technology,12,2.484907
7,Seiko Epson,10,2.302585
8,ZTE,10,2.302585
9,Kingfa Science and Technology,9,2.197225


In [20]:
com_ls = list(com_df.company)

In [21]:
com_ls.remove('Samsung Electronics')

ValueError: list.remove(x): x not in list

# Data 2012

富比士2000資料

In [22]:
f_data = pd.read_csv("./Forbes/Forbes_2012.csv")

In [23]:
f_data[f_data["Industry"]=="Semiconductors"]

Unnamed: 0,Company,Industry,Country,Market Value,Sales,Profits,Assets,Rank,Forbes Webpage,Profits as % of Assets,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
25,Samsung Electronics,Semiconductors,South Korea,162.0,142.4,11.5,133.7,26,http://www.forbes.com/companies/samsung-electr...,0.086013,...,,,,,,,,,,
85,Intel,Semiconductors,United States,138.5,54.0,12.9,71.1,85,http://www.forbes.com/companies/intel/,0.181435,...,,,,,,,,,,
283,Taiwan Semiconductor,Semiconductors,Taiwan,71.1,14.1,4.4,25.3,284,http://www.forbes.com/companies/taiwan-semicon...,0.173913,...,,,,,,,,,,
359,Texas Instruments,Semiconductors,United States,37.8,13.7,2.2,20.5,360,http://www.forbes.com/companies/texas-instrume...,0.107317,...,,,,,,,,,,
583,Applied Materials,Semiconductors,United States,16.6,10.0,1.5,13.6,583,http://www.forbes.com/companies/applied-materi...,0.110294,...,,,,,,,,,,
689,ASML Holding,Semiconductors,Netherlands,20.0,7.3,1.9,9.4,690,http://www.forbes.com/companies/asml-holding/,0.202128,...,,,,,,,,,,
782,Broadcom,Semiconductors,United States,20.6,7.4,0.9,9.0,783,http://www.forbes.com/companies/broadcom/,0.1,...,,,,,,,,,,
867,Tokyo Electron,Semiconductors,Japan,10.3,8.0,0.9,9.5,868,http://www.forbes.com/companies/tokyo-electron/,0.094737,...,,,,,,,,,,
948,Hynix Semiconductor,Semiconductors,South Korea,17.8,9.4,-0.1,14.9,948,http://www.forbes.com/companies/hynix-semicond...,-0.006711,...,,,,,,,,,,
953,STMicroelectronics,Semiconductors,Switzerland,7.5,9.0,0.6,10.9,954,http://www.forbes.com/companies/stmicroelectro...,0.055046,...,,,,,,,,,,


## Forbes list  
- 同Industry：1
- 同Sector：2
- 有出現在company list：3
- Company list上有但Forbes沒有：4

In [24]:
forb_ls =[]

for i in com_ls:
    if (f_data["Company"]==i).any():
        if (f_data[f_data.Company==i]["Industry"]=="Semiconductors").any():
            forb_ls.append(1)
        # elif (f_data[f_data.Company==i]["Sector"]=="Information Technology").any():
        #     forb_ls.append(2)
        else:
            forb_ls.append(3)
    else:
        forb_ls.append(4)
    

In [25]:
forb_ls

[4, 4, 3, 3, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4]

## Company list competitor

In [26]:
competitor_ls=[3, 3, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2]

## 計算NMI

In [27]:
# (labels_true, labels_pred)
nmi = normalized_mutual_info_score(forb_ls,competitor_ls)
ari = adjusted_rand_score(forb_ls,competitor_ls)

print("nmi: ",nmi)
print("ari: ", ari)

nmi:  0.10153760376096907
ari:  -0.02631578947368421
