In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from scipy.special import rel_entr
from scipy.stats import entropy
import scipy.stats
import math
import warnings 
warnings.filterwarnings("ignore") 

from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [2]:
# Use the GPU: expose only device index 1 to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
! nvidia-smi

Mon Jun 13 00:20:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 26%   34C    P8    21W / 260W |     22MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 25%   33C    P8    24W / 260W |      8MiB / 11019MiB |      0%      Default |
|       

# MongoDB

In [4]:
from pymongo import MongoClient
import pymongo

In [5]:
# Connection string for the "eva" database (SCRAM-SHA-1 authentication).
# NOTE(review): user name, password and host are hard-coded in the notebook;
# consider loading them from environment variables or a secrets store.
mongoURI = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % ("eva", "eva_30241", "140.117.69.70:30241", "eva")

try:
    conn = pymongo.MongoClient(mongoURI)
    # MongoClient connects lazily, so issue a cheap ping to surface an
    # unreachable server here instead of in a later query cell.
    conn.admin.command("ping")
    db = conn.eva
    # db_de = db.patent_de
    # db_us = db.patent_us
    # db_cn = db.patent_cn

    # Collection handles for the 2010 "rm_patent_*" collections (CN, US, DE).
    db_rm_cn_2010 = db.rm_patent_cn_2010
    db_rm_us_2010 = db.rm_patent_us_2010
    db_rm_de_2010 = db.rm_patent_de_2010

except pymongo.errors.ConnectionFailure as err:
    # Fully-qualified through the `pymongo` import: a bare `errors` name is
    # not imported here and would raise NameError while handling the failure.
    print(err)

# 2010

## Company數量

In [6]:
# Top 30 current assignees in the DE 2010 collection, ranked by how many
# documents list them (one count per unwound `current_assignee` entry).
data_assignee = db_rm_de_2010.aggregate([
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    # `_id` is unique after $group; using it as a secondary key makes the
    # order of tied counts, and therefore the $limit cut-off, deterministic.
    {"$sort": {"count": -1, "_id": 1}},
    {"$limit": 30},
])

# Materialise the cursor once, then split it into the two parallel lists
# that the next cell turns into a DataFrame.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

In [7]:
# Tabulate the DE assignee names next to their document counts.
df_family_de = pd.DataFrame({"company": ls_company, "count": ls_count})

In [8]:
# Top 30 current assignees in the CN 2010 collection, ranked by how many
# documents list them (one count per unwound `current_assignee` entry).
data_assignee = db_rm_cn_2010.aggregate([
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    # `_id` is unique after $group; using it as a secondary key makes the
    # order of tied counts, and therefore the $limit cut-off, deterministic.
    {"$sort": {"count": -1, "_id": 1}},
    {"$limit": 30},
])

# Materialise the cursor once, then split it into parallel name/count lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

df_family_cn = pd.DataFrame({"company": ls_company, "count": ls_count})

In [9]:
# Top 30 current assignees in the US 2010 collection, ranked by how many
# documents list them (one count per unwound `current_assignee` entry).
data_assignee = db_rm_us_2010.aggregate([
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    # `_id` is unique after $group; using it as a secondary key makes the
    # order of tied counts, and therefore the $limit cut-off, deterministic.
    {"$sort": {"count": -1, "_id": 1}},
    {"$limit": 30},
])

# Materialise the cursor once, then split it into parallel name/count lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

df_family_us = pd.DataFrame({"company": ls_company, "count": ls_count})

In [10]:
# Stack the DE, CN and US top-assignee tables row-wise into one frame.
final_df = pd.concat(
    objs=(df_family_de, df_family_cn, df_family_us),
    axis="index",
)

In [11]:
# Show the 15 companies with the highest total count across the three tables.
company_totals = final_df.groupby("company").sum().reset_index()
company_totals.sort_values("count", ascending=False).head(15)

Unnamed: 0,company,count
43,ZTE Corp,13
44,ZTE Intelligent IoT Technology Co Ltd,11
40,Xerox Corp,9
11,Chengdu Qinchuan IoT Technology Co Ltd,7
10,Chengdu Monolithic Power Systems Co Ltd,5
22,Institute of Electrical Engineering of CAS,4
15,Datang Mobile Communications Equipment Co Ltd,4
45,Zhejiang University ZJU,3
17,Fujifilm Business Innovation Corp,3
16,Dezhou University,3


In [12]:
# Top 19 companies by total count across the three tables.
top_companies = (
    final_df.groupby("company")
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .head(19)
)

# Manually added row for Samsung.
# NOTE(review): the count of 4 is hard-coded; its source is not visible here.
samsung_row = pd.DataFrame([{"company": "Samsung Electronics Co Ltd", "count": 4}])

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same rows with a fresh 0..n-1 index.
com_df = pd.concat([top_companies, samsung_row], ignore_index=True)

In [13]:
# Hard-coded list of company names; used below to filter com_df via isin().
com_ls = ['Beijing University of Technology',
 'Chengdu Monolithic Power Systems Co Ltd',
 'Datang Mobile Communications Equipment Co Ltd',
 'Fujifilm Business Innovation Corp',
 'Institute of Electrical Engineering of CAS',
 'Kunming University of Science and Technology',
 'University of Science and Technology Beijing USTB',
 'University of Shanghai for Science and Technology',
 'Western Superconducting Technologies Co Ltd',
 'Wuhan University WHU',
 'Xerox Corp',
 'Xiamen Xindeco IOT Technology Ltd.',
 'ZTE Corp',
 'ZTE Intelligent IoT Technology Co Ltd']

In [14]:
# Keep only the companies named in com_ls. The explicit copy makes the result
# an independent frame, so later column assignments do not write to a slice.
com_df = com_df[com_df["company"].isin(com_ls)].copy()

In [15]:
# Add the natural log of each count. The vectorised np.log replaces a row-wise
# apply, and assign() returns a new frame instead of writing a column into the
# filtered frame in place.
com_df = com_df.assign(count_log=np.log(com_df["count"])).reset_index(drop=True)
com_df

Unnamed: 0,company,count,count_log
0,ZTE Corp,13,2.564949
1,ZTE Intelligent IoT Technology Co Ltd,11,2.397895
2,Xerox Corp,9,2.197225
3,Chengdu Monolithic Power Systems Co Ltd,5,1.609438
4,Institute of Electrical Engineering of CAS,4,1.386294
5,Datang Mobile Communications Equipment Co Ltd,4,1.386294
6,Fujifilm Business Innovation Corp,3,1.098612
7,University of Science and Technology Beijing USTB,3,1.098612
8,University of Shanghai for Science and Technology,3,1.098612
9,Beijing University of Technology,3,1.098612


公司名稱正規化

In [16]:
# Ordered (old, new) substring replacements used to normalise company names.
# Order matters: the legal-form suffixes are stripped first, so every alias
# below must be written in its suffix-stripped form.
_COMPANY_REPLACEMENTS = [
    (" Inc", ""),
    (" Co Ltd", ""),
    (" Co. Ltd.", ""),
    (" Co., Ltd.", ""),
    (" Ltd", ""),
    (" Corp", ""),
    # NOTE(review): only a double dot is removed; a single trailing "." left
    # behind by stripping " Ltd" from "... Ltd." is kept — confirm intent.
    ("..", ""),
    ("International Business Machines", "IBM"),
    ("Nippon Telegraph and Telephone", "Nippon Telegraph & Tel"),
    # Previously "Alibaba Group Holding Ltd", which could not match because
    # " Ltd" has already been removed by the time this rule runs.
    ("Alibaba Group Holding", "Alibaba"),
    ("ZTE Intelligent IoT Technology", "ZTE"),
    ("AT&T Intellectual Property I LP", "AT&T"),
    ("Microsoft Technology Licensing LLC", "Microsoft"),
    ("Telefonaktiebolaget LM Ericsson AB", "Ericsson"),
    ("Cisco Technology", "Cisco Systems"),
    ("Verizon Patent and Licensing", "Verizon Communications"),
    ("Nokia Technologies Oy", "Nokia"),
    ("China Mobile Communications Group", "China Mobile"),
    ("Nokia Solutions and Networks Oy", "Nokia"),
    ("Fujifilm Business Innovation", "Fujifilm Holdings"),
]


def _normalize_company(name):
    """Return `name` after applying every rule in _COMPANY_REPLACEMENTS in order.

    Each rule is a plain substring replacement (str.replace), so a rule can
    also match in the middle of a name.
    """
    for old, new in _COMPANY_REPLACEMENTS:
        name = name.replace(old, new)
    return name


com_df["company"] = com_df["company"].apply(_normalize_company)

In [17]:
com_df

Unnamed: 0,company,count,count_log
0,ZTE,13,2.564949
1,ZTE,11,2.397895
2,Xerox,9,2.197225
3,Chengdu Monolithic Power Systems,5,1.609438
4,Institute of Electrical Engineering of CAS,4,1.386294
5,Datang Mobile Communications Equipment,4,1.386294
6,Fujifilm Holdings,3,1.098612
7,University of Science and Technology Beijing USTB,3,1.098612
8,University of Shanghai for Science and Technology,3,1.098612
9,Beijing University of Technology,3,1.098612


In [18]:
# Rebind com_ls to the normalised company names, in com_df row order.
com_ls = com_df["company"].tolist()

In [20]:
# com_ls.remove('Samsung Electronics')

# Data 2010

富比士2000資料

In [21]:
# Load the Forbes 2010 table; the path is relative to the working directory.
f_data = pd.read_csv("./Forbes/Forbes_2010.csv")

In [22]:
# Display the Forbes rows whose Industry is exactly "Semiconductors".
f_data.loc[f_data["Industry"].eq("Semiconductors")]

Unnamed: 0,Company,Industry,Country,Market Value,Sales,Profits,Assets,Rank,Profits as % of Assets,Profits as % of Sales,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
54,Samsung Electronics,Semiconductors,South Korea,94.48,97.28,4.43,83.3,55,0.053181,0.045538651,,,,,,,,,
100,Intel,Semiconductors,United States,115.29,35.13,4.37,53.1,100,0.082298,0.124395104,,,,,,,,,
331,Taiwan Semiconductor,Semiconductors,Taiwan,48.22,8.65,2.7,18.05,332,0.149584,0.312138728,,,,,,,,,
416,Texas Instruments,Semiconductors,United States,30.59,10.43,1.47,12.12,417,0.121287,0.140939597,,,,,,,,,
1032,STMicroelectronics,Semiconductors,Switzerland,7.73,8.69,-1.16,13.34,1033,-0.086957,-0.133486766,,,,,,,,,
1042,Advanced Micro,Semiconductors,United States,5.52,5.4,0.38,9.08,1043,0.04185,0.07037037,,,,,,,,,
1067,Applied Materials,Semiconductors,United States,16.79,5.53,-0.09,10.0,1068,-0.009,-0.016274864,,,,,,,,,
1077,MediaTek,Semiconductors,Taiwan,17.64,2.75,0.58,3.01,1078,0.192691,0.210909091,,,,,,,,,
1088,Hynix Semiconductor,Semiconductors,South Korea,10.68,5.47,-3.79,12.8,1089,-0.296094,-0.692870201,,,,,,,,,
1216,Micron Technology,Semiconductors,United States,8.31,5.14,-0.93,11.73,1216,-0.079284,-0.180933852,,,,,,,,,


## Forbes list  
- 同Industry（Forbes 的 Industry 為 Semiconductors）：1
- 同Sector：2（forb_ls 的程式中此判斷已註解停用，不會產生此標籤）
- 有出現在Forbes list（但Industry不同）：3
- Company list上有但Forbes沒有：4

In [23]:
def _forbes_label(company):
    """Return the Forbes-based label for one company name.

    4 -> no Forbes row has this exact Company value
    1 -> at least one matching row has Industry "Semiconductors"
    3 -> the company is listed, but no matching row is "Semiconductors"

    Label 2 (same sector) is never returned: the Sector check is disabled.
    """
    industries = f_data.loc[f_data["Company"] == company, "Industry"]
    if industries.empty:
        return 4
    return 1 if (industries == "Semiconductors").any() else 3


# One label per entry of com_ls, in the same order.
forb_ls = [_forbes_label(name) for name in com_ls]
    

In [24]:
forb_ls

[3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4]

## Company list competitor

In [25]:
# Hard-coded competitor labels (values 1-4).
# NOTE(review): presumably one label per entry of com_ls, in the same order — confirm.
competitor_ls=[3, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2]

## 計算NMI與ARI

In [27]:
# Score the agreement between the Forbes-derived labels and the competitor
# labels; arguments are passed as (labels_true, labels_pred).
nmi, ari = (
    metric(forb_ls, competitor_ls)
    for metric in (normalized_mutual_info_score, adjusted_rand_score)
)

for caption, value in (("nmi: ", nmi), ("ari: ", ari)):
    print(caption, value)

nmi:  0.4598281430832654
ari:  0.37508879943168366
