In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from scipy.special import rel_entr
from scipy.stats import entropy
import scipy.stats
import math
import warnings 
warnings.filterwarnings("ignore") 

from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [2]:
# Use the GPU: expose only the device with index 1 to CUDA-aware libraries.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [3]:
# Run the `nvidia-smi` shell command to print the current status of the GPUs on this machine.
! nvidia-smi

Wed Jun 29 01:31:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 32%   41C    P8    21W / 260W |     22MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 30%   38C    P8    25W / 260W |      8MiB / 11019MiB |      0%      Default |
|       

# MongoDB

In [4]:
from pymongo import MongoClient
import pymongo

In [5]:
# Connect to the "eva" MongoDB database and take handles to the 2017 patent collections.
#
# User, password and host may be overridden through environment variables; the
# fallbacks reproduce the values this notebook used before.
# NOTE(review): the fallback password is committed in plain text — rotate it and
# drop the defaults once the environment variables are provisioned.
from urllib.parse import quote_plus

mongoURI = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (
    # Credentials inside a MongoDB URI must be percent-escaped.
    quote_plus(os.environ.get("EVA_MONGO_USER", "eva")),
    quote_plus(os.environ.get("EVA_MONGO_PASSWORD", "eva_30241")),
    os.environ.get("EVA_MONGO_HOST", "140.117.69.70:30241"),
    "eva",
)

try:
    conn = pymongo.MongoClient(mongoURI)
    # MongoClient connects lazily: without a round trip an unreachable server
    # would only surface later, inside the first query cell.
    conn.admin.command("ping")
except pymongo.errors.ConnectionFailure as err:
    # Re-raise after reporting: continuing would leave the collection handles
    # below undefined and make every later cell fail with a NameError.
    print(err)
    raise

db = conn.eva
# db_de = db.patent_de
# db_us = db.patent_us
# db_cn = db.patent_cn

db_rm_cn_2017 = db.rm_patent_cn_2017
db_rm_us_2017 = db.rm_patent_us_2017
db_rm_de_2017 = db.rm_patent_de_2017

# 2017

## Company數量

In [6]:
# Count records per current assignee in the 2017 DE collection and keep the
# 30 assignees with the highest counts (largest first).
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_de_2017.aggregate(top_assignee_pipeline)

# Materialize the cursor once, then split it into the two parallel lists.
assignee_docs = list(data_assignee)
ls_company = [doc["_id"] for doc in assignee_docs]
ls_count = [doc["count"] for doc in assignee_docs]

In [7]:
# Tabulate the (company, count) pairs collected for the DE collection.
df_family_de = pd.DataFrame(
    data=list(zip(ls_company, ls_count)),
    columns=["company", "count"],
)

In [8]:
# Count records per current assignee in the 2017 CN collection and keep the
# 30 assignees with the highest counts (largest first).
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_cn_2017.aggregate(top_assignee_pipeline)

# Materialize the cursor once, then split it into the two parallel lists.
assignee_docs = list(data_assignee)
ls_company = [doc["_id"] for doc in assignee_docs]
ls_count = [doc["count"] for doc in assignee_docs]

df_family_cn = pd.DataFrame(
    data=list(zip(ls_company, ls_count)),
    columns=["company", "count"],
)

In [9]:
# Count records per current assignee in the 2017 US collection and keep the
# 30 assignees with the highest counts (largest first).
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_us_2017.aggregate(top_assignee_pipeline)

# Materialize the cursor once, then split it into the two parallel lists.
assignee_docs = list(data_assignee)
ls_company = [doc["_id"] for doc in assignee_docs]
ls_count = [doc["count"] for doc in assignee_docs]

df_family_us = pd.DataFrame(
    data=list(zip(ls_company, ls_count)),
    columns=["company", "count"],
)

In [10]:
# final_df = pd.concat([df_family_de,df_family_cn,df_family_us],axis=0)

In [11]:
# Take the 15 companies with the highest total count.
(
    df_family_de
    .groupby("company", as_index=False)
    .sum()
    .sort_values(by="count", ascending=False)
    .head(15)
)

Unnamed: 0,company,count
15,Intel Corp,36
16,Intel IP Corp,25
1,Apple Inc,17
20,Robert Bosch GmbH,11
26,Sony Corp,5
22,Samsung Electronics Co Ltd,5
23,Siemens AG,4
7,Fraunhofer Gesellschaft zur Forderung der Ange...,4
14,Innogy Innovation GmbH,3
18,Osram GmbH,3


In [12]:
# Same top-15 selection as displayed above, stored with a fresh 0..n-1 index.
com_df = (
    df_family_de
    .groupby("company", as_index=False)
    .sum()
    .sort_values(by="count", ascending=False)
    .head(15)
    .reset_index(drop=True)
)

In [13]:
# Add the natural logarithm of each company's count as a new column, then show the table.
com_df["count_log"] = np.log(com_df["count"])
com_df

Unnamed: 0,company,count,count_log
0,Intel Corp,36,3.583519
1,Intel IP Corp,25,3.218876
2,Apple Inc,17,2.833213
3,Robert Bosch GmbH,11,2.397895
4,Sony Corp,5,1.609438
5,Samsung Electronics Co Ltd,5,1.609438
6,Siemens AG,4,1.386294
7,Fraunhofer Gesellschaft zur Forderung der Ange...,4,1.386294
8,Innogy Innovation GmbH,3,1.098612
9,Osram GmbH,3,1.098612


公司名稱正規化

In [14]:
# Ordered (substring, replacement) rules used to normalize company names.
# The rules are plain substring replacements applied top to bottom: the
# legal-form suffixes are stripped first, then specific names are rewritten.
# Because " Ltd" is already gone by the time the specific rules run, those
# rules must be written WITHOUT that suffix (the former rule
# "Alibaba Group Holding Ltd" could never match for exactly this reason).
# NOTE(review): the suffix rules are not anchored to the end of the name, so
# e.g. " Corp" also matches inside "... Corporation" — confirm this is acceptable.
_COMPANY_NAME_RULES = [
    (" Inc", ""),
    (" Co Ltd", ""),
    (" Co. Ltd.", ""),
    (" Co., Ltd.", ""),
    (" Ltd", ""),
    (" Corp", ""),
    ("..", ""),
    ("International Business Machines", "IBM"),
    ("Nippon Telegraph and Telephone", "Nippon Telegraph & Tel"),
    ("Alibaba Group Holding", "Alibaba"),
    ("ZTE Intelligent IoT Technology", "ZTE"),
    ("AT&T Intellectual Property I LP", "AT&T"),
    ("Microsoft Technology Licensing LLC", "Microsoft"),
    ("Telefonaktiebolaget LM Ericsson AB", "Ericsson"),
    ("Cisco Technology", "Cisco Systems"),
    ("Verizon Patent and Licensing", "Verizon Communications"),
    ("Tencent Technology Shenzhen", "Tencent Holdings"),
    ("China Mobile Communications Group", "China Mobile"),
    ("Alipay Hangzhou Information Technology", "Alibaba Group"),
    ("Beijing Xiaomi Mobile Software", "Xiaomi"),
    ("Nokia Technologies Oy", "Nokia"),
    ("Hewlett Packard Enterprise Development LP", "Hewlett Packard Enterprise"),
    ("Taiwan Semiconductor Manufacturing Co TSMC", "Taiwan Semiconductor"),
    ("Hyundai Motor Co", "Hyundai Motor"),
    ("Bayerische Motoren Werke AG", "BMW Group"),
    ("Infineon Technologies AG", "Infineon Technologies"),
]


def _normalize_company_name(name):
    """Return `name` after applying every rule in `_COMPANY_NAME_RULES` in order.

    Parameters
    ----------
    name : str
        Raw company / assignee name.

    Returns
    -------
    str
        The name with each rule's substring replaced by its replacement.
    """
    for pattern, replacement in _COMPANY_NAME_RULES:
        name = name.replace(pattern, replacement)
    return name


com_df["company"] = com_df["company"].apply(_normalize_company_name)

In [15]:
# Display com_df again to inspect the company names after normalization.
com_df

Unnamed: 0,company,count,count_log
0,Intel,36,3.583519
1,Intel IP,25,3.218876
2,Apple,17,2.833213
3,Robert Bosch GmbH,11,2.397895
4,Sony,5,1.609438
5,Samsung Electronics,5,1.609438
6,Siemens AG,4,1.386294
7,Fraunhofer Gesellschaft zur Forderung der Ange...,4,1.386294
8,Innogy Innovation GmbH,3,1.098612
9,Osram GmbH,3,1.098612


In [16]:
# Company names of com_df as a plain Python list, in table order.
com_ls = com_df["company"].tolist()

In [17]:
# Drop 'Samsung Electronics' from the company list.
# Guarded so the cell can be re-run (or run when the name is absent) without
# raising ValueError; like list.remove, only the first occurrence is removed.
# NOTE(review): the reason for excluding this company is not stated here — confirm.
if 'Samsung Electronics' in com_ls:
    com_ls.remove('Samsung Electronics')

# Data 2017

富比士2000資料

In [18]:
# Load the Forbes 2017 table from a CSV file; the path is relative to the
# notebook's working directory.
f_data = pd.read_csv("../../Forbes/Forbes_2017.csv")

In [19]:
# Show the Forbes rows whose Industry is exactly "Semiconductors".
f_data.loc[f_data["Industry"].eq("Semiconductors")]

Unnamed: 0,Company,Market Value,Revenue,Profits,Assets,Rank,Sector,Industry,Continent,Country,Headquarters,State,CEO,Forbes Webpage,Profits as % of Assets,Profits as % of Revenue
14,Samsung Electronics,254.303,173.995,19.3207,217.068,15,Information Technology,Semiconductors,Asia,South Korea,South Korea,,Oh-Hyun Kwon,http://www.forbes.com/companies/samsung-electr...,0.089008,0.111042
53,Intel,170.35,59.387,10.316,113.327,54,Information Technology,Semiconductors,North America,United States,California,California,Brian M. Krzanich,http://www.forbes.com/companies/intel/,0.091029,0.173708
126,Taiwan Semiconductor,161.734,29.392,10.364,58.533,127,Information Technology,Semiconductors,Asia,Taiwan,Taiwan,,Te Yin Liu,http://www.forbes.com/companies/taiwan-semicon...,0.177063,0.352613
169,Qualcomm,83.187,23.778,4.889,52.366,170,Information Technology,Semiconductors,North America,United States,California,California,Steven M. Mollenkopf,http://www.forbes.com/companies/qualcomm/,0.093362,0.20561
348,SK Hynix,31.958,14.823,2.5459,26.673,349,Information Technology,Semiconductors,Asia,South Korea,South Korea,,Sung-Wook Park,http://www.forbes.com/companies/sk-hynix/,0.095449,0.171753
386,Texas Instruments,80.501,13.37,3.417,16.431,387,Information Technology,Semiconductors,North America,United States,Texas,Texas,Richard K. Templeton,http://www.forbes.com/companies/texas-instrume...,0.207961,0.255572
481,Micron Technology,31.552,14.733,0.689,32.355,482,Information Technology,Semiconductors,North America,United States,Idaho,Idaho,Sanjay Mehrotra,http://www.forbes.com/companies/micron-technol...,0.021295,0.046766
498,Applied Materials,42.038,11.846,2.138,15.244,499,Information Technology,Semiconductors,North America,United States,California,California,Gary E. Dickerson,http://www.forbes.com/companies/applied-materi...,0.140252,0.180483
530,ASML Holding,56.8,7.516,1.7223,19.742,531,Information Technology,Semiconductors,Europe,Netherlands,Netherlands,,Peter T. F. M. Wennink,http://www.forbes.com/companies/asml-holding/,0.08724,0.229151
591,Broadcom,87.597,15.608,-1.877,49.617,592,Information Technology,Semiconductors,Asia,Singapore,Singapore,,Hock E. Tan,http://www.forbes.com/companies/broadcom/,-0.03783,-0.120259


## Forbes list  
- 同Industry（Forbes的Industry為Semiconductors）：1
- 同Sector（Forbes的Sector為Information Technology，但Industry不同）：2
- 有出現在Forbes名單，但Industry與Sector皆不同：3
- Company list上有但Forbes沒有：4

In [20]:
# Label every company in com_ls by how it appears in the Forbes table:
#   1 = listed with Industry "Semiconductors"
#   2 = listed with Sector "Information Technology" (and not label 1)
#   3 = listed, but matching neither of the above
#   4 = no row with that exact company name
forb_ls = []

for company_name in com_ls:
    forbes_rows = f_data[f_data["Company"] == company_name]
    if forbes_rows.empty:
        label = 4
    elif (forbes_rows["Industry"] == "Semiconductors").any():
        label = 1
    elif (forbes_rows["Sector"] == "Information Technology").any():
        label = 2
    else:
        label = 3
    forb_ls.append(label)
    

In [21]:
# Display the Forbes-based label assigned to each company in com_ls.
forb_ls

[1, 4, 2, 4, 3, 4, 4, 4, 4, 4, 1, 4, 2, 4]

## Company list competitor

In [22]:
# Competitor label for each company, listed in the same order as com_ls
# (the lists are paired positionally when competitor_df is built).
# NOTE(review): these labels are hard-coded; presumably assigned by hand from
# another analysis — confirm their source and what each value means.
competitor_ls=[4, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

## 計算NMI

In [23]:
# Compare the two labelings: the Forbes-based labels are passed as labels_true
# and the competitor labels as labels_pred.
agreement_scores = {
    "nmi": normalized_mutual_info_score(labels_true=forb_ls, labels_pred=competitor_ls),
    "ari": adjusted_rand_score(labels_true=forb_ls, labels_pred=competitor_ls),
}
nmi = agreement_scores["nmi"]
ari = agreement_scores["ari"]

for score_name, score_value in agreement_scores.items():
    print(score_name + ": ", score_value)

nmi:  0.2774543584655694
ari:  0.19543973941368079


## 公司名稱和類別

In [24]:
# One row per company: its name, its competitor label and its Forbes-based label.
competitor_rows = list(zip(com_ls, competitor_ls, forb_ls))
competitor_df = pd.DataFrame(
    data=competitor_rows,
    columns=["company", "2017_competitor", "2017_forb"],
)

In [25]:
# Display the combined table of company names and both label columns.
competitor_df

Unnamed: 0,company,2017_competitor,2017_forb
0,Intel,4,1
1,Intel IP,4,4
2,Apple,3,2
3,Robert Bosch GmbH,1,4
4,Sony,1,3
5,Siemens AG,1,4
6,Fraunhofer Gesellschaft zur Forderung der Ange...,1,4
7,Innogy Innovation GmbH,1,4
8,Osram GmbH,1,4
9,Messer Group GmbH,1,4


In [26]:
# Write the table to CSV without the index column; the path is relative to the
# notebook's working directory.
competitor_df.to_csv("../competitor_df/cpc_de_2017.csv", index=False)