In [80]:
import numpy as np
import pandas as pd
import os

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from scipy.special import rel_entr
from scipy.stats import entropy
import scipy.stats
import math
import warnings 
warnings.filterwarnings("ignore") 

from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [81]:
# Use the GPU: select the device by setting CUDA_VISIBLE_DEVICES to "1".
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [82]:
# Shell escape: run the nvidia-smi command and show its report in the cell output.
! nvidia-smi

Sun Jun 12 17:41:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 30%   39C    P8    21W / 260W |     22MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 29%   37C    P8    25W / 260W |      8MiB / 11019MiB |      0%      Default |
|       

# MongoDB

In [83]:
from pymongo import MongoClient
import pymongo

In [84]:
# Connect to the "eva" MongoDB database and bind the three 2020 patent collections
# (CN / US / DE) used by the rest of the notebook.
#
# Connection settings can be overridden through environment variables; the previous
# literals remain as fallbacks so the notebook keeps running unchanged.
# NOTE(review): the fallback password is committed in this notebook. Rotate it and
# remove the fallback once EVA_MONGO_PASSWORD is provisioned everywhere.
from urllib.parse import quote_plus

mongo_user = os.environ.get("EVA_MONGO_USER", "eva")
mongo_password = os.environ.get("EVA_MONGO_PASSWORD", "eva_30241")
mongo_host = os.environ.get("EVA_MONGO_HOST", "140.117.69.70:30241")

# User name and password are percent-escaped so that reserved characters
# (':', '@', '/', '%') in an overridden value cannot corrupt the URI.
mongoURI = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (
    quote_plus(mongo_user),
    quote_plus(mongo_password),
    mongo_host,
    "eva",
)

try:
    conn = pymongo.MongoClient(mongoURI)
    # MongoClient connects lazily, so constructing it never reports an unreachable
    # server. A cheap "ping" forces server selection and surfaces the failure here.
    conn.admin.command("ping")
    db = conn.eva
    # db_de = db.patent_de
    # db_us = db.patent_us
    # db_cn = db.patent_cn

    db_rm_cn_2020 = db.rm_patent_cn_2020
    db_rm_us_2020 = db.rm_patent_us_2020
    db_rm_de_2020 = db.rm_patent_de_2020

# The original `except errors.ConnectionFailure` referenced an undefined name and
# raised NameError instead of handling the failure.
except pymongo.errors.ConnectionFailure as err:
    print(err)
    # Later cells need the collection handles; stop here with the real error rather
    # than failing further down with a NameError on db_rm_*.
    raise

# 2020

## Company數量

In [85]:
# Top 30 values of `current_assignee` in the 2020 DE collection:
# $unwind the field, count documents per value, sort by count descending, keep 30.
data_assignee = db_rm_de_2020.aggregate(
    [
        {"$unwind": "$current_assignee"},
        {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 30},
    ]
)

# Read the cursor once, then split the rows into the two parallel lists
# that the next cell turns into a DataFrame.
assignee_rows = [(doc["_id"], doc["count"]) for doc in data_assignee]
ls_company = [company for company, _ in assignee_rows]
ls_count = [count for _, count in assignee_rows]

In [86]:
# Two-column table (company, count) built from the parallel lists of the DE aggregation.
df_family_de = pd.DataFrame({"company": ls_company, "count": ls_count})

In [87]:
# Top 30 values of `current_assignee` in the 2020 CN collection:
# $unwind the field, count documents per value, sort by count descending, keep 30.
data_assignee = db_rm_cn_2020.aggregate(
    [
        {"$unwind": "$current_assignee"},
        {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 30},
    ]
)

# Read the cursor once; keep the two parallel lists and build the table from the rows.
assignee_rows = [(doc["_id"], doc["count"]) for doc in data_assignee]
ls_company = [company for company, _ in assignee_rows]
ls_count = [count for _, count in assignee_rows]

df_family_cn = pd.DataFrame(assignee_rows, columns=["company", "count"])

In [88]:
# Top 30 values of `current_assignee` in the 2020 US collection:
# $unwind the field, count documents per value, sort by count descending, keep 30.
data_assignee = db_rm_us_2020.aggregate(
    [
        {"$unwind": "$current_assignee"},
        {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 30},
    ]
)

# Read the cursor once; keep the two parallel lists and build the table from the rows.
assignee_rows = [(doc["_id"], doc["count"]) for doc in data_assignee]
ls_company = [company for company, _ in assignee_rows]
ls_count = [count for _, count in assignee_rows]

df_family_us = pd.DataFrame(assignee_rows, columns=["company", "count"])

In [89]:
# Stack the DE, CN and US top-assignee tables row-wise; each frame keeps its own row index.
final_df = pd.concat(
    [
        df_family_de,
        df_family_cn,
        df_family_us,
    ]
)

In [90]:
# Take the 15 companies with the largest total count across the stacked tables.
(
    final_df
    .groupby("company")
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .head(15)
)

Unnamed: 0,company,count
53,Qualcomm Inc,1907
57,Samsung Electronics Co Ltd,796
40,Intel Corp,558
36,Huawei Technologies Co Ltd,530
43,LG Electronics Inc,354
41,International Business Machines Corp,225
6,Apple Inc,195
60,Shanghai Langbo Communication Technology Co Ltd,182
46,Micron Technology Inc,154
1,AT&T Intellectual Property I LP,147


In [91]:
# Same top-15 selection as displayed above, stored with a fresh 0..n-1 row index.
com_df = (
    final_df
    .groupby("company")
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .head(15)
    .reset_index(drop=True)
)

In [92]:
# Add the natural logarithm of each company's count as a new column, then display the table.
com_df["count_log"] = np.log(com_df["count"])
com_df

Unnamed: 0,company,count,count_log
0,Qualcomm Inc,1907,7.553287
1,Samsung Electronics Co Ltd,796,6.679599
2,Intel Corp,558,6.324359
3,Huawei Technologies Co Ltd,530,6.272877
4,LG Electronics Inc,354,5.869297
5,International Business Machines Corp,225,5.4161
6,Apple Inc,195,5.273
7,Shanghai Langbo Communication Technology Co Ltd,182,5.204007
8,Micron Technology Inc,154,5.036953
9,AT&T Intellectual Property I LP,147,4.990433


公司名稱正規化

In [93]:
# Ordered (old, new) substring replacements used to normalize company names.
# Order matters: the legal-form suffixes are stripped first, so every alias below
# must be written in its already-stripped form.
_COMPANY_NAME_REPLACEMENTS = (
    # Legal-form suffixes and leftover punctuation.
    (" Inc", ""),
    (" Co Ltd", ""),
    (" Co. Ltd.", ""),
    (" Co., Ltd.", ""),
    (" Ltd", ""),
    (" Corp", ""),
    ("..", ""),
    # Aliases to the short names used for matching later in the notebook.
    ("International Business Machines", "IBM"),
    ("Nippon Telegraph and Telephone", "Nippon Telegraph & Tel"),
    # Previously keyed on "Alibaba Group Holding Ltd", which could never match
    # because " Ltd" has already been removed by the time this rule runs.
    ("Alibaba Group Holding", "Alibaba"),
    ("ZTE Intelligent IoT Technology", "ZTE"),
    ("AT&T Intellectual Property I LP", "AT&T"),
    ("Microsoft Technology Licensing LLC", "Microsoft"),
    ("Telefonaktiebolaget LM Ericsson AB", "Ericsson"),
    ("Cisco Technology", "Cisco Systems"),
    ("Verizon Patent and Licensing", "Verizon Communications"),
)


def _normalize_company_name(name):
    """Return `name` with every replacement in _COMPANY_NAME_REPLACEMENTS applied in order.

    Parameters
    ----------
    name : str
        Raw assignee / company name.

    Returns
    -------
    str
        The name after all substring replacements.
    """
    for old, new in _COMPANY_NAME_REPLACEMENTS:
        name = name.replace(old, new)
    return name


com_df["company"] = com_df["company"].apply(_normalize_company_name)

In [94]:
# Display the company table to check the result of the name normalization.
com_df

Unnamed: 0,company,count,count_log
0,Qualcomm,1907,7.553287
1,Samsung Electronics,796,6.679599
2,Intel,558,6.324359
3,Huawei Technologies,530,6.272877
4,LG Electronics,354,5.869297
5,IBM,225,5.4161
6,Apple,195,5.273
7,Shanghai Langbo Communication Technology,182,5.204007
8,Micron Technology,154,5.036953
9,AT&T,147,4.990433


In [95]:
# Plain Python list of the company names, in table order.
com_ls = com_df["company"].tolist()

In [96]:
# Drop 'Samsung Electronics' from the company list.
# The list is rebuilt instead of calling list.remove so the cell is idempotent:
# re-running it after the name is already gone no longer raises ValueError.
com_ls = [name for name in com_ls if name != 'Samsung Electronics']

# Data 2020

富比士2000資料

In [97]:
# Load the Forbes 2020 list from a CSV path relative to the current working directory.
f_data = pd.read_csv("./Forbes/Forbes_2020.csv")

In [98]:
# Show the Forbes rows whose Industry is exactly "Semiconductors".
f_data.loc[f_data["Industry"].eq("Semiconductors")]

Unnamed: 0,Rank,Company,Country/Territory,Sales,Profits,Assets,Market Value,Sector,Industry
15,16,Samsung Electronics,South Korea,$197.6 B,$18.4 B,$304.9 B,$278.7 B,Information Technology,Semiconductors
37,38,Intel,United States,$75.7 B,$22.7 B,$147.7 B,$254 B,Information Technology,Semiconductors
107,108,Taiwan Semiconductor,Taiwan,$37.8 B,$13 B,$77.5 B,$265.5 B,Information Technology,Semiconductors
191,192,Broadcom,United States,$22.7 B,$2.6 B,$81 B,$108.6 B,Information Technology,Semiconductors
257,258,Qualcomm,United States,$24.7 B,$4 B,$31.9 B,$88.5 B,Information Technology,Semiconductors
276,277,Micron Technology,United States,$19.6 B,$2.3 B,$49.6 B,$53.3 B,Information Technology,Semiconductors
295,296,SK Hynix,South Korea,$23.2 B,$1.3 B,$55.2 B,$47 B,Information Technology,Semiconductors
360,361,ASML Holding,Netherlands,$13.4 B,$2.9 B,$25.5 B,$124.5 B,Information Technology,Semiconductors
415,416,Texas Instruments,United States,$14.1 B,$4.9 B,$17.3 B,$106.5 B,Information Technology,Semiconductors
435,436,Applied Materials,United States,$15 B,$2.8 B,$19.8 B,$45.6 B,Information Technology,Semiconductors


## Forbes list  
- 同Industry（Semiconductors）：1
- 不同Industry但同Sector（Information Technology）：2
- 有出現在Forbes，但Industry與Sector皆不同：3
- Company list上有但Forbes沒有：4

In [99]:
def _forbes_label(company):
    """Return the Forbes-based label (1-4) for one company name.

    1: a Forbes row with this exact Company name has Industry "Semiconductors"
    2: otherwise, such a row has Sector "Information Technology"
    3: the company is in the Forbes table but matches neither of the above
    4: no Forbes row has this exact Company name
    """
    matches = f_data[f_data["Company"] == company]
    if matches.empty:
        return 4
    if matches["Industry"].eq("Semiconductors").any():
        return 1
    if matches["Sector"].eq("Information Technology").any():
        return 2
    return 3


# One label per company, in the same order as com_ls.
forb_ls = [_forbes_label(company) for company in com_ls]
    

In [100]:
# Display the Forbes-based labels computed for the companies in com_ls.
forb_ls

[1, 1, 4, 3, 2, 2, 4, 1, 3, 2, 3, 2, 2, 4]

## Company list competitor

In [101]:
# Hand-entered competitor labels, later compared against forb_ls.
# NOTE(review): presumably one label per company in com_ls, in the same order,
# and the meaning of the codes 1 / 2 / 4 is not documented here. TODO confirm.
competitor_ls = [
    1, 1, 1, 1, 1, 1, 2,
    1, 4, 4, 4, 4, 4, 4,
]

## 計算NMI

In [102]:
# Both metrics take (labels_true, labels_pred): the Forbes-derived labels are passed
# as labels_true and the hand-entered competitor labels as labels_pred.
label_args = dict(labels_true=forb_ls, labels_pred=competitor_ls)
nmi = normalized_mutual_info_score(**label_args)
ari = adjusted_rand_score(**label_args)

for caption, score in (("nmi: ", nmi), ("ari: ", ari)):
    print(caption, score)

nmi:  0.2535498742575949
ari:  0.024195765740995325
