In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from scipy.special import rel_entr
from scipy.stats import entropy
import scipy.stats
import math
import warnings 
warnings.filterwarnings("ignore") 

from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [2]:
# Use the GPU: expose only device index 1 to CUDA-based libraries in this process.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [3]:
! nvidia-smi

Tue Jun 28 16:48:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 27%   35C    P8    21W / 260W |     22MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 25%   33C    P8    24W / 260W |      8MiB / 11019MiB |      0%      Default |
|       

# MongoDB

In [4]:
from pymongo import MongoClient
import pymongo

In [5]:
# Build the MongoDB URI. User name and password may be supplied through the
# EVA_MONGO_USER / EVA_MONGO_PASSWORD environment variables.
# NOTE(review): the fallbacks below are the values that were hard-coded in this
# cell; they are kept only so the notebook still runs unchanged. Move them out
# of the notebook (and rotate the password) before sharing it.
from urllib.parse import quote_plus

mongo_user = os.environ.get("EVA_MONGO_USER", "eva")
mongo_password = os.environ.get("EVA_MONGO_PASSWORD", "eva_30241")

# quote_plus percent-escapes characters such as ':', '/' and '@' that would
# otherwise corrupt the user:password part of the URI.
mongoURI = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (
    quote_plus(mongo_user),
    quote_plus(mongo_password),
    "140.117.69.70:30241",
    "eva",
)

try:
    conn = pymongo.MongoClient(mongoURI)
    db = conn.eva
    # db_de = db.patent_de
    # db_us = db.patent_us
    # db_cn = db.patent_cn

    # 2012 collections, one per patent office (CN / US / DE).
    db_rm_cn_2012 = db.rm_patent_cn_2012
    db_rm_us_2012 = db.rm_patent_us_2012
    db_rm_de_2012 = db.rm_patent_de_2012

# The exception class lives in pymongo.errors; the bare name `errors` used
# previously was never imported and would itself raise NameError here.
# NOTE(review): MongoClient connects lazily, so a connection problem may only
# surface on the first query rather than in this cell.
except pymongo.errors.ConnectionFailure as err:
    print(err)

# 2012

## Company數量

In [6]:
# Aggregation: one row per current assignee with its document count, ordered by
# count (descending) and cut off after the first 30 assignees.
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_de_2012.aggregate(top_assignee_pipeline)

# Read the cursor once, then split each row into the two parallel lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

In [7]:
# Two-column frame (company, count) for the DE collection, built from the parallel lists.
df_family_de = pd.DataFrame({"company": ls_company, "count": ls_count})

In [8]:
# Aggregation: one row per current assignee with its document count, ordered by
# count (descending) and cut off after the first 30 assignees.
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_cn_2012.aggregate(top_assignee_pipeline)

# Read the cursor once, then split each row into the two parallel lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

# Two-column frame (company, count) for the CN collection.
df_family_cn = pd.DataFrame({"company": ls_company, "count": ls_count})

In [9]:
# Aggregation: one row per current assignee with its document count, ordered by
# count (descending) and cut off after the first 30 assignees.
top_assignee_pipeline = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_us_2012.aggregate(top_assignee_pipeline)

# Read the cursor once, then split each row into the two parallel lists.
assignee_rows = list(data_assignee)
ls_company = [row["_id"] for row in assignee_rows]
ls_count = [row["count"] for row in assignee_rows]

# Two-column frame (company, count) for the US collection.
df_family_us = pd.DataFrame({"company": ls_company, "count": ls_count})

In [10]:
# final_df = pd.concat([df_family_de,df_family_cn,df_family_us],axis=0)

In [11]:
# Total the counts per company and show the 15 companies with the highest totals.
(
    df_family_us
    .groupby("company", as_index=False)
    .sum()
    .sort_values(by="count", ascending=False)
    .head(15)
)

Unnamed: 0,company,count
2,Cisco Technology Inc,33
28,Xerox Corp,17
20,Qualcomm Inc,6
3,Electronics and Telecommunications Research In...,5
6,FutureWei Technologies Inc,5
18,Nokia Solutions and Networks Oy,5
15,LG Electronics Inc,4
12,International Business Machines Corp,4
22,Samsung Electronics Co Ltd,4
21,Roamware Inc,4


In [12]:
# Same top-15 selection as the preview, stored with a fresh 0..n-1 index.
com_df = (
    df_family_us
    .groupby("company", as_index=False)
    .sum()
    .sort_values(by="count", ascending=False)
    .head(15)
    .reset_index(drop=True)
)

In [13]:
# Natural log of the per-company count. np.log is applied to the whole column
# at once instead of calling a Python lambda once per row.
com_df["count_log"] = np.log(com_df["count"])
com_df

Unnamed: 0,company,count,count_log
0,Cisco Technology Inc,33,3.496508
1,Xerox Corp,17,2.833213
2,Qualcomm Inc,6,1.791759
3,Electronics and Telecommunications Research In...,5,1.609438
4,FutureWei Technologies Inc,5,1.609438
5,Nokia Solutions and Networks Oy,5,1.609438
6,LG Electronics Inc,4,1.386294
7,International Business Machines Corp,4,1.386294
8,Samsung Electronics Co Ltd,4,1.386294
9,Roamware Inc,4,1.386294


公司名稱正規化

In [14]:
# Fragments deleted from every company name, applied in this order (the longer
# "Co ... Ltd" forms come before the bare " Ltd"). Matching is plain substring
# replacement, exactly as in the original chained .replace() calls.
COMPANY_NAME_FRAGMENTS = (
    " Inc",
    " Co Ltd",
    " Co. Ltd.",
    " Co., Ltd.",
    " Ltd",
    " Corp",
    "..",
)

# Assignee name -> replacement name, applied in insertion order AFTER the
# fragments above have been removed. Keys must therefore be spelled without
# those fragments.
# NOTE(review): the replacement names presumably follow the spelling of the
# Forbes company list the notebook compares against later — confirm.
COMPANY_NAME_ALIASES = {
    "International Business Machines": "IBM",
    "Nippon Telegraph and Telephone": "Nippon Telegraph & Tel",
    # Was keyed as "Alibaba Group Holding Ltd", which could never match because
    # " Ltd" has already been stripped by the time aliases are applied.
    "Alibaba Group Holding": "Alibaba",
    "ZTE Intelligent IoT Technology": "ZTE",
    "AT&T Intellectual Property I LP": "AT&T",
    "Microsoft Technology Licensing LLC": "Microsoft",
    "Telefonaktiebolaget LM Ericsson AB": "Ericsson",
    "Cisco Technology": "Cisco Systems",
    "Verizon Patent and Licensing": "Verizon Communications",
    "Tencent Technology Shenzhen": "Tencent Holdings",
    "China Mobile Communications Group": "China Mobile",
    "Alipay Hangzhou Information Technology": "Alibaba Group",
    "Beijing Xiaomi Mobile Software": "Xiaomi",
    "Nokia Technologies Oy": "Nokia",
    "Amazon Technologies": "Amazon.com",
    "Fujifilm Business Innovation": "Fujifilm Holdings",
}


def normalize_company_name(name):
    """Return `name` with the listed fragments removed and known aliases applied.

    Example: "Cisco Technology Inc" -> "Cisco Technology" -> "Cisco Systems".

    Parameters
    ----------
    name : str
        Company name as stored in the `company` column.

    Returns
    -------
    str
        The normalised name; names that match no rule are returned unchanged.
    """
    for fragment in COMPANY_NAME_FRAGMENTS:
        name = name.replace(fragment, "")
    for alias, replacement in COMPANY_NAME_ALIASES.items():
        name = name.replace(alias, replacement)
    return name


com_df["company"] = com_df["company"].apply(normalize_company_name)

In [15]:
com_df

Unnamed: 0,company,count,count_log
0,Cisco Systems,33,3.496508
1,Xerox,17,2.833213
2,Qualcomm,6,1.791759
3,Electronics and Telecommunications Research In...,5,1.609438
4,FutureWei Technologies,5,1.609438
5,Nokia Solutions and Networks Oy,5,1.609438
6,LG Electronics,4,1.386294
7,IBM,4,1.386294
8,Samsung Electronics,4,1.386294
9,Roamware,4,1.386294


In [16]:
# Normalised company names as a plain Python list, in com_df row order.
com_ls = com_df["company"].tolist()

In [17]:
# Drop 'Samsung Electronics' from the company list (first occurrence only).
# NOTE(review): the reason for excluding it is not stated here — presumably it is
# the focal company the others are compared against; confirm.
# list.remove() raises ValueError when the name is absent, so this cell fails if
# it is re-run without first rebuilding com_ls.
com_ls.remove('Samsung Electronics')

# Data 2012

富比士2000資料

In [18]:
# Forbes 2000 data for 2012, read from a path relative to the notebook's working directory.
forbes_csv_path = "../../patent/code/依年分/Forbes/Forbes_2012.csv"
f_data = pd.read_csv(forbes_csv_path)

In [19]:
# Show the Forbes rows whose Industry is exactly "Semiconductors".
f_data.loc[f_data["Industry"].eq("Semiconductors")]

Unnamed: 0,Company,Industry,Country,Market Value,Sales,Profits,Assets,Rank,Forbes Webpage,Profits as % of Assets,...,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
25,Samsung Electronics,Semiconductors,South Korea,162.0,142.4,11.5,133.7,26,http://www.forbes.com/companies/samsung-electr...,0.086013,...,,,,,,,,,,
85,Intel,Semiconductors,United States,138.5,54.0,12.9,71.1,85,http://www.forbes.com/companies/intel/,0.181435,...,,,,,,,,,,
283,Taiwan Semiconductor,Semiconductors,Taiwan,71.1,14.1,4.4,25.3,284,http://www.forbes.com/companies/taiwan-semicon...,0.173913,...,,,,,,,,,,
359,Texas Instruments,Semiconductors,United States,37.8,13.7,2.2,20.5,360,http://www.forbes.com/companies/texas-instrume...,0.107317,...,,,,,,,,,,
583,Applied Materials,Semiconductors,United States,16.6,10.0,1.5,13.6,583,http://www.forbes.com/companies/applied-materi...,0.110294,...,,,,,,,,,,
689,ASML Holding,Semiconductors,Netherlands,20.0,7.3,1.9,9.4,690,http://www.forbes.com/companies/asml-holding/,0.202128,...,,,,,,,,,,
782,Broadcom,Semiconductors,United States,20.6,7.4,0.9,9.0,783,http://www.forbes.com/companies/broadcom/,0.1,...,,,,,,,,,,
867,Tokyo Electron,Semiconductors,Japan,10.3,8.0,0.9,9.5,868,http://www.forbes.com/companies/tokyo-electron/,0.094737,...,,,,,,,,,,
948,Hynix Semiconductor,Semiconductors,South Korea,17.8,9.4,-0.1,14.9,948,http://www.forbes.com/companies/hynix-semicond...,-0.006711,...,,,,,,,,,,
953,STMicroelectronics,Semiconductors,Switzerland,7.5,9.0,0.6,10.9,954,http://www.forbes.com/companies/stmicroelectro...,0.055046,...,,,,,,,,,,


## Forbes list  
- 同Industry（Semiconductors）：1
- 同Sector：2（目前未使用，程式中的Sector判斷已註解掉）
- 有出現在Forbes但Industry不同：3
- Company list上有但Forbes沒有：4

In [20]:
# Label every company in com_ls by how it appears in the Forbes data:
#   1 - found in Forbes with Industry == "Semiconductors"
#   3 - found in Forbes with any other Industry
#   4 - not found in Forbes at all
# Label 2 (same sector) is never assigned: the sector check is disabled.
forb_ls = []

for company_name in com_ls:
    forbes_rows = f_data[f_data["Company"] == company_name]
    if forbes_rows.empty:
        label = 4
    elif (forbes_rows["Industry"] == "Semiconductors").any():
        label = 1
    else:
        label = 3
    forb_ls.append(label)
    

In [21]:
forb_ls

[3, 3, 3, 4, 4, 4, 3, 3, 4, 4, 3, 4, 3, 4]

## Company list competitor

In [22]:
# Competitor-group label for each entry of com_ls, in the same order:
# the first company is 4, the second is 3 and the remaining twelve are 1.
# NOTE(review): these values are entered by hand; the column name they are
# stored under ("2012_competitor_lda") suggests an LDA run — confirm the source.
competitor_ls = [4, 3] + [1] * 12

## 計算NMI與ARI

In [23]:
# Agreement between the two labelings: the Forbes-derived labels are passed as
# labels_true and the competitor labels as labels_pred.
ari = adjusted_rand_score(labels_true=forb_ls, labels_pred=competitor_ls)
nmi = normalized_mutual_info_score(labels_true=forb_ls, labels_pred=competitor_ls)

print("nmi: ", nmi)
print("ari: ", ari)

nmi:  0.18461773685187968
ari:  0.02287581699346405


## 公司名稱和類別

In [24]:
# One row per company with its competitor label and its Forbes-derived label.
competitor_df = pd.DataFrame(
    {
        "company": com_ls,
        "2012_competitor_lda": competitor_ls,
        "2012_forb": forb_ls,
    }
)

In [25]:
competitor_df

Unnamed: 0,company,2012_competitor_lda,2012_forb
0,Cisco Systems,4,3
1,Xerox,3,3
2,Qualcomm,1,3
3,Electronics and Telecommunications Research In...,1,4
4,FutureWei Technologies,1,4
5,Nokia Solutions and Networks Oy,1,4
6,LG Electronics,1,3
7,IBM,1,3
8,Roamware,1,4
9,WSOU Investments LLC,1,4


In [26]:
# Write the label table to CSV without the index column.
competitor_df.to_csv(path_or_buf="../competitor_df/lda_us_2012.csv", index=False)