In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from scipy.special import rel_entr
from scipy.stats import entropy
import scipy.stats
import math
import warnings 
warnings.filterwarnings("ignore") 

from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [2]:
# Use the GPU: expose only device index "1" to CUDA-aware libraries in this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [3]:
# Shell escape: print the GPU / driver status table reported by nvidia-smi.
! nvidia-smi

Wed Jun 29 01:30:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 32%   41C    P8    21W / 260W |     22MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 30%   38C    P8    24W / 260W |      8MiB / 11019MiB |      0%      Default |
|       

# MongoDB

In [4]:
from pymongo import MongoClient
import pymongo

In [5]:
# Build the MongoDB URI. Each part can be overridden through an environment
# variable; the fallbacks are the values this notebook has always used.
# NOTE(review): the fallback credentials are still committed in the notebook —
# rotate them and drop the defaults once the environment variables are set.
from urllib.parse import quote_plus

mongoURI = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (
    # User name and password must be percent-escaped inside a MongoDB URI.
    quote_plus(os.environ.get("EVA_MONGO_USER", "eva")),
    quote_plus(os.environ.get("EVA_MONGO_PASSWORD", "eva_30241")),
    os.environ.get("EVA_MONGO_HOST", "140.117.69.70:30241"),
    "eva",
)

# MongoClient connects lazily, so creating the client and the collection
# handles below does not contact the server yet.
conn = pymongo.MongoClient(mongoURI)
db = conn.eva
# db_de = db.patent_de
# db_us = db.patent_us
# db_cn = db.patent_cn

db_rm_cn_2020 = db.rm_patent_cn_2020
db_rm_us_2020 = db.rm_patent_us_2020
db_rm_de_2020 = db.rm_patent_de_2020

try:
    # Force one round trip so that an unreachable server is reported here
    # instead of at the first query further down.
    conn.admin.command("ping")
except pymongo.errors.ConnectionFailure as err:
    # The original referenced a bare `errors` name that was never imported,
    # which turned any failure into a NameError.
    print(err)

# 2020

## Company數量

In [6]:
# Top 30 current assignees of the 2020 DE collection, ranked by how many
# (unwound) documents each one appears in.
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_de_2020.aggregate(top_assignee_pipeline)

# Drain the cursor once, then split the rows into two parallel lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

In [7]:
# One row per assignee: company name and its document count (DE collection).
df_family_de = pd.DataFrame({"company": ls_company, "count": ls_count})

In [8]:
# Top 30 current assignees of the 2020 CN collection, ranked by how many
# (unwound) documents each one appears in.
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_cn_2020.aggregate(top_assignee_pipeline)

# Drain the cursor once, then split the rows into two parallel lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

df_family_cn = pd.DataFrame({"company": ls_company, "count": ls_count})

In [9]:
# Top 30 current assignees of the 2020 US collection, ranked by how many
# (unwound) documents each one appears in.
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_us_2020.aggregate(top_assignee_pipeline)

# Drain the cursor once, then split the rows into two parallel lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

df_family_us = pd.DataFrame({"company": ls_company, "count": ls_count})

In [10]:
# final_df = pd.concat([df_family_de,df_family_cn,df_family_us],axis=0)

In [11]:
# Take the 15 companies with the largest total count.
(
    df_family_de
    .groupby("company")
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .head(15)
)

Unnamed: 0,company,count
12,Intel Corp,84
17,Samsung Electronics Co Ltd,28
16,Robert Bosch GmbH,10
0,Apple Inc,5
14,LG Electronics Inc,5
9,Hyundai Mobis Co Ltd,5
27,Vega Grieshaber KG,5
8,Hewlett Packard Enterprise Development LP,4
25,Taiwan Semiconductor Manufacturing Co TSMC Ltd,4
26,Toyota Motor Corp,4


In [12]:
# Same top-15 selection, kept as a frame with a fresh 0..n-1 index.
com_df = (
    df_family_de
    .groupby("company")
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .head(15)
    .reset_index(drop=True)
)

In [13]:
# Natural logarithm of each company's count, computed element by element.
com_df["count_log"] = com_df["count"].map(np.log)
com_df

Unnamed: 0,company,count,count_log
0,Intel Corp,84,4.430817
1,Samsung Electronics Co Ltd,28,3.332205
2,Robert Bosch GmbH,10,2.302585
3,Apple Inc,5,1.609438
4,LG Electronics Inc,5,1.609438
5,Hyundai Mobis Co Ltd,5,1.609438
6,Vega Grieshaber KG,5,1.609438
7,Hewlett Packard Enterprise Development LP,4,1.386294
8,Taiwan Semiconductor Manufacturing Co TSMC Ltd,4,1.386294
9,Toyota Motor Corp,4,1.386294


公司名稱正規化

In [14]:
# Ordered (substring, replacement) rules. They are applied one after another,
# so the legal-form suffixes are stripped before the alias rules run and every
# alias must be written in its suffix-free form.
# NOTE(review): these are plain substring replacements, so " Inc" / " Corp" /
# " Ltd" would also match inside longer words such as " Corporation" — confirm
# no assignee name is spelled that way.
_COMPANY_NAME_RULES = (
    (" Inc", ""),
    (" Co Ltd", ""),
    (" Co. Ltd.", ""),
    (" Co., Ltd.", ""),
    (" Ltd", ""),
    (" Corp", ""),
    ("..", ""),
    ("International Business Machines", "IBM"),
    ("Nippon Telegraph and Telephone", "Nippon Telegraph & Tel"),
    # Was "Alibaba Group Holding Ltd", which could never match because " Ltd"
    # has already been removed by the time this rule is reached.
    ("Alibaba Group Holding", "Alibaba"),
    ("ZTE Intelligent IoT Technology", "ZTE"),
    ("AT&T Intellectual Property I LP", "AT&T"),
    ("Microsoft Technology Licensing LLC", "Microsoft"),
    ("Telefonaktiebolaget LM Ericsson AB", "Ericsson"),
    ("Cisco Technology", "Cisco Systems"),
    ("Verizon Patent and Licensing", "Verizon Communications"),
    ("Tencent Technology Shenzhen", "Tencent Holdings"),
    ("China Mobile Communications Group", "China Mobile"),
    ("Alipay Hangzhou Information Technology", "Alibaba Group"),
    ("Beijing Xiaomi Mobile Software", "Xiaomi"),
    ("Nokia Technologies Oy", "Nokia"),
    ("Hewlett Packard Enterprise Development LP", "Hewlett Packard Enterprise"),
    ("Taiwan Semiconductor Manufacturing Co TSMC", "Taiwan Semiconductor"),
    ("Hyundai Motor Co", "Hyundai Motor"),
)


def _normalize_company_name(name):
    """Return `name` with every rule in `_COMPANY_NAME_RULES` applied in order.

    Args:
        name: assignee name as a string.

    Returns:
        The name after all substring replacements.
    """
    for old, new in _COMPANY_NAME_RULES:
        name = name.replace(old, new)
    return name


com_df["company"] = com_df["company"].map(_normalize_company_name)

In [15]:
# Display the frame after the company names have been normalized.
com_df

Unnamed: 0,company,count,count_log
0,Intel,84,4.430817
1,Samsung Electronics,28,3.332205
2,Robert Bosch GmbH,10,2.302585
3,Apple,5,1.609438
4,LG Electronics,5,1.609438
5,Hyundai Mobis,5,1.609438
6,Vega Grieshaber KG,5,1.609438
7,Hewlett Packard Enterprise,4,1.386294
8,Taiwan Semiconductor,4,1.386294
9,Toyota Motor,4,1.386294


In [16]:
# Company names as a plain Python list, in the frame's row order.
com_ls = com_df["company"].tolist()

In [17]:
# Drop 'Samsung Electronics' from the list in place.
# NOTE(review): presumably Samsung is the reference company the others are compared against — confirm.
# Guarded so that re-running this cell (or a ranking without Samsung) does not
# raise ValueError.
if 'Samsung Electronics' in com_ls:
    com_ls.remove('Samsung Electronics')

# Data 2020

富比士2000資料

In [18]:
# Load the Forbes 2020 list from its CSV export.
f_data = pd.read_csv(filepath_or_buffer="../../Forbes/Forbes_2020.csv")

In [19]:
# Forbes rows whose Industry is exactly "Semiconductors".
f_data.loc[f_data["Industry"].eq("Semiconductors")]

Unnamed: 0,Rank,Company,Country/Territory,Sales,Profits,Assets,Market Value,Sector,Industry
15,16,Samsung Electronics,South Korea,$197.6 B,$18.4 B,$304.9 B,$278.7 B,Information Technology,Semiconductors
37,38,Intel,United States,$75.7 B,$22.7 B,$147.7 B,$254 B,Information Technology,Semiconductors
107,108,Taiwan Semiconductor,Taiwan,$37.8 B,$13 B,$77.5 B,$265.5 B,Information Technology,Semiconductors
191,192,Broadcom,United States,$22.7 B,$2.6 B,$81 B,$108.6 B,Information Technology,Semiconductors
257,258,Qualcomm,United States,$24.7 B,$4 B,$31.9 B,$88.5 B,Information Technology,Semiconductors
276,277,Micron Technology,United States,$19.6 B,$2.3 B,$49.6 B,$53.3 B,Information Technology,Semiconductors
295,296,SK Hynix,South Korea,$23.2 B,$1.3 B,$55.2 B,$47 B,Information Technology,Semiconductors
360,361,ASML Holding,Netherlands,$13.4 B,$2.9 B,$25.5 B,$124.5 B,Information Technology,Semiconductors
415,416,Texas Instruments,United States,$14.1 B,$4.9 B,$17.3 B,$106.5 B,Information Technology,Semiconductors
435,436,Applied Materials,United States,$15 B,$2.8 B,$19.8 B,$45.6 B,Information Technology,Semiconductors


## Forbes list  
- 同Industry：1
- 同Sector：2
- 有出現在Forbes list（但Industry與Sector皆不同）：3
- Company list上有但Forbes沒有：4

In [20]:
def _forbes_label(company):
    """Return the Forbes label (1-4) for one company name.

    1: a matching Forbes row has Industry "Semiconductors"
    2: otherwise, a matching row has Sector "Information Technology"
    3: the company is in the Forbes list but matches neither of the above
    4: the company name does not appear in the Forbes list
    """
    matches = f_data.loc[f_data["Company"] == company]
    if matches.empty:
        return 4
    if (matches["Industry"] == "Semiconductors").any():
        return 1
    if (matches["Sector"] == "Information Technology").any():
        return 2
    return 3


forb_ls = [_forbes_label(company) for company in com_ls]
    

In [21]:
# Forbes labels, one per entry of com_ls and in the same order.
forb_ls

[1, 4, 2, 3, 3, 4, 2, 1, 3, 3, 4, 4, 4, 4]

## Company list competitor

In [22]:
# Hand-entered competitor labels.
# NOTE(review): assumed to line up position-by-position with com_ls, so any
# change in the company ranking silently misaligns them — confirm after re-running.
competitor_ls = [
    2, 1, 1, 1, 1, 1, 1,
    2, 1, 3, 3, 4, 4, 4,
]

## 計算NMI

In [23]:
# Agreement between the two labelings: Forbes labels are passed as labels_true,
# competitor labels as labels_pred.
nmi = normalized_mutual_info_score(labels_true=forb_ls, labels_pred=competitor_ls)
ari = adjusted_rand_score(labels_true=forb_ls, labels_pred=competitor_ls)

for metric_label, metric_value in (("nmi: ", nmi), ("ari: ", ari)):
    print(metric_label, metric_value)

nmi:  0.5088439598692969
ari:  0.13545816733067728


## 公司名稱和類別

In [24]:
# One row per company: its name, competitor label and Forbes label.
competitor_rows = list(zip(com_ls, competitor_ls, forb_ls))
competitor_df = pd.DataFrame(
    competitor_rows,
    columns=['company', '2020_competitor', '2020_forb'],
)

In [25]:
# Display each company alongside its competitor label and its Forbes label.
competitor_df

Unnamed: 0,company,2020_competitor,2020_forb
0,Intel,2,1
1,Robert Bosch GmbH,1,4
2,Apple,1,2
3,LG Electronics,1,3
4,Hyundai Mobis,1,3
5,Vega Grieshaber KG,1,4
6,Hewlett Packard Enterprise,1,2
7,Taiwan Semiconductor,2,1
8,Toyota Motor,1,3
9,Hyundai Motor,3,3


In [26]:
# Write the label table to CSV without the row index.
competitor_df.to_csv(path_or_buf="../competitor_df/cpc_de_2020.csv", index=False)