In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from scipy.special import rel_entr
from scipy.stats import entropy
import scipy.stats
import math
import warnings 
warnings.filterwarnings("ignore") 

from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score

In [2]:
# Select the GPU: expose only device index 1 to CUDA in this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [3]:
# Shell escape: print the NVIDIA driver's view of the GPUs on this machine.
! nvidia-smi

Wed Jun 29 01:33:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 32%   41C    P8    21W / 260W |     22MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:02:00.0 Off |                  N/A |
| 30%   38C    P8    24W / 260W |      8MiB / 11019MiB |      0%      Default |
|       

# MongoDB

In [4]:
from pymongo import MongoClient
import pymongo

In [5]:
# Build the MongoDB connection URI.  User, password and host can be overridden
# through EVA_MONGO_* environment variables so that credentials do not have to
# live in the notebook.
# NOTE(review): the fallback values are the credentials that used to be
# hard-coded here; rotate them and drop the defaults once every environment
# sets the variables.
mongoURI = "mongodb://%s:%s@%s/%s?authMechanism=SCRAM-SHA-1" % (
    os.environ.get("EVA_MONGO_USER", "eva"),
    os.environ.get("EVA_MONGO_PASSWORD", "eva_30241"),
    os.environ.get("EVA_MONGO_HOST", "140.117.69.70:30241"),
    "eva",
)

try:
    conn = pymongo.MongoClient(mongoURI)
    db = conn.eva
    # db_de = db.patent_de
    # db_us = db.patent_us
    # db_cn = db.patent_cn

    # Handles to the three 2015 collections (CN / US / DE) queried below.
    db_rm_cn_2015 = db.rm_patent_cn_2015
    db_rm_us_2015 = db.rm_patent_us_2015
    db_rm_de_2015 = db.rm_patent_de_2015

# The exception class lives in pymongo.errors; a bare `errors` name is not
# defined anywhere in this notebook and would itself raise NameError.
except pymongo.errors.ConnectionFailure as err:
    print(err)
    # Later cells need the collection handles, so do not continue silently.
    raise

# 2015

## Company數量

In [6]:
# Aggregation stages: emit one document per current_assignee entry, count the
# documents per assignee, order by that count (largest first), keep 30.
top_assignee_stages = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_de_2015.aggregate(top_assignee_stages)

# Split the result documents into two parallel lists (name, count).
ls_company, ls_count = [], []
for data in data_assignee:
    ls_company += [data["_id"]]
    ls_count += [data["count"]]

In [7]:
# Pair each assignee name with its count and tabulate the pairs.
family_rows = zip(ls_company, ls_count)
df_family_de = pd.DataFrame(family_rows, columns=['company', 'count'])

In [8]:
# Aggregation stages: emit one document per current_assignee entry, count the
# documents per assignee, order by that count (largest first), keep 30.
top_assignee_stages = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_cn_2015.aggregate(top_assignee_stages)

# Split the result documents into two parallel lists (name, count).
ls_company, ls_count = [], []
for data in data_assignee:
    ls_company += [data["_id"]]
    ls_count += [data["count"]]

# Tabulate the (company, count) pairs for the CN collection.
family_rows = zip(ls_company, ls_count)
df_family_cn = pd.DataFrame(family_rows, columns=['company', 'count'])

In [9]:
# Aggregation stages: emit one document per current_assignee entry, count the
# documents per assignee, order by that count (largest first), keep 30.
top_assignee_stages = [
    {"$unwind": "$current_assignee"},
    {"$group": {"_id": "$current_assignee", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 30},
]
data_assignee = db_rm_us_2015.aggregate(top_assignee_stages)

# Split the result documents into two parallel lists (name, count).
ls_company, ls_count = [], []
for data in data_assignee:
    ls_company += [data["_id"]]
    ls_count += [data["count"]]

# Tabulate the (company, count) pairs for the US collection.
family_rows = zip(ls_company, ls_count)
df_family_us = pd.DataFrame(family_rows, columns=['company', 'count'])

In [10]:
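# Disabled: stacking the DE, CN and US frames into one table.
# The following cells work on df_family_cn only.
# final_df = pd.concat([df_family_de,df_family_cn,df_family_us],axis=0)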

In [11]:
# Take the 15 companies with the largest total count.
(
    df_family_cn
    .groupby("company")
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .head(15)
)

Unnamed: 0,company,count
0,Chengdu Qinchuan IoT Technology Co Ltd,82
19,SUZHOU XUANHE IOT TECHNOLOGY Co Ltd,81
10,Jiangsu IoT Research and Development Center,61
13,Nanjing Hansense Iot Technology Co Ltd,26
14,Nanjing IoT Sensor Technology Co Ltd,25
4,GUANGDONG ABEJ IOT INTELLIGENT TECHNOLOGY Co Ltd,24
7,Huawei Technologies Co Ltd,19
20,Samsung Electronics Co Ltd,17
17,SHENZHEN QIANHAI LINGJU INTERNET OF THINGS SCI...,16
22,Shenzhen Qianhai Livall IoT Technology Co Ltd,16


In [12]:
# Same top-15 selection as displayed above, stored with a fresh 0..n-1 index.
com_df = (
    df_family_cn
    .groupby("company")
    .sum()
    .reset_index()
    .sort_values("count", ascending=False)
    .head(15)
    .reset_index(drop=True)
)

In [13]:
# Natural log of the count, computed on the whole column at once instead of
# calling a Python lambda for every row.
com_df["count_log"] = np.log(com_df["count"])
com_df

Unnamed: 0,company,count,count_log
0,Chengdu Qinchuan IoT Technology Co Ltd,82,4.406719
1,SUZHOU XUANHE IOT TECHNOLOGY Co Ltd,81,4.394449
2,Jiangsu IoT Research and Development Center,61,4.110874
3,Nanjing Hansense Iot Technology Co Ltd,26,3.258097
4,Nanjing IoT Sensor Technology Co Ltd,25,3.218876
5,GUANGDONG ABEJ IOT INTELLIGENT TECHNOLOGY Co Ltd,24,3.178054
6,Huawei Technologies Co Ltd,19,2.944439
7,Samsung Electronics Co Ltd,17,2.833213
8,SHENZHEN QIANHAI LINGJU INTERNET OF THINGS SCI...,16,2.772589
9,Shenzhen Qianhai Livall IoT Technology Co Ltd,16,2.772589


公司名稱正規化

In [14]:
# Legal-form fragments removed from every name, in this order.  The longer
# " Co ..." variants come before the bare " Ltd" so they are removed whole.
# NOTE(review): this is plain substring replacement, so a fragment is also
# removed when it occurs inside a longer word.
_COMPANY_SUFFIXES = (
    " Inc",
    " Co Ltd",
    " Co. Ltd.",
    " Co., Ltd.",
    " Ltd",
    " Corp",
    "..",
)

# Long name -> short name, applied AFTER the fragments above are removed.
# Each key therefore has to be spelled the way the name looks once its suffix
# is gone: the Alibaba key used to end in " Ltd" and could never match.
_COMPANY_ALIASES = (
    ("International Business Machines", "IBM"),
    ("Nippon Telegraph and Telephone", "Nippon Telegraph & Tel"),
    ("Alibaba Group Holding", "Alibaba"),
    ("ZTE Intelligent IoT Technology", "ZTE"),
    ("AT&T Intellectual Property I LP", "AT&T"),
    ("Microsoft Technology Licensing LLC", "Microsoft"),
    ("Telefonaktiebolaget LM Ericsson AB", "Ericsson"),
    ("Cisco Technology", "Cisco Systems"),
    ("Verizon Patent and Licensing", "Verizon Communications"),
    ("Tencent Technology Shenzhen", "Tencent Holdings"),
    ("China Mobile Communications Group", "China Mobile"),
    ("Alipay Hangzhou Information Technology", "Alibaba Group"),
    ("Beijing Xiaomi Mobile Software", "Xiaomi"),
)


def normalize_company_name(name):
    """Return ``name`` with legal-form fragments removed and aliases applied.

    Every entry of ``_COMPANY_SUFFIXES`` is replaced by the empty string, then
    every ``(long, short)`` pair of ``_COMPANY_ALIASES`` is substituted, both
    in table order, using ``str.replace``.
    """
    for suffix in _COMPANY_SUFFIXES:
        name = name.replace(suffix, "")
    for long_name, short_name in _COMPANY_ALIASES:
        name = name.replace(long_name, short_name)
    return name


com_df["company"] = com_df.company.apply(normalize_company_name)

In [15]:
# Display the frame with the rewritten company names.
com_df

Unnamed: 0,company,count,count_log
0,Chengdu Qinchuan IoT Technology,82,4.406719
1,SUZHOU XUANHE IOT TECHNOLOGY,81,4.394449
2,Jiangsu IoT Research and Development Center,61,4.110874
3,Nanjing Hansense Iot Technology,26,3.258097
4,Nanjing IoT Sensor Technology,25,3.218876
5,GUANGDONG ABEJ IOT INTELLIGENT TECHNOLOGY,24,3.178054
6,Huawei Technologies,19,2.944439
7,Samsung Electronics,17,2.833213
8,SHENZHEN QIANHAI LINGJU INTERNET OF THINGS SCI...,16,2.772589
9,Shenzhen Qianhai Livall IoT Technology,16,2.772589


In [16]:
# Plain Python list of the company names, in frame order.
com_ls = com_df["company"].tolist()

In [17]:
# Drop Samsung Electronics from the company list.  The membership test keeps
# the cell safe to re-run: a bare list.remove() raises ValueError once the
# entry is already gone.
# NOTE(review): the reason for excluding this company is not recorded here.
if 'Samsung Electronics' in com_ls:
    com_ls.remove('Samsung Electronics')

# Data 2015

富比士2000資料

In [18]:
# Load the 2015 Forbes table; the path is relative to the working directory.
f_data = pd.read_csv("../../Forbes/Forbes_2015.csv")

In [19]:
# Rows of the Forbes table whose Industry is "Semiconductors".
is_semiconductor = f_data["Industry"] == "Semiconductors"
f_data.loc[is_semiconductor]

Unnamed: 0,Company,Market Value,Revenue,Profits,Assets,Rank,Sector,Industry,Continent,Country,Headquarters,State,CEO,Forbes Webpage,Profits as % of Assets,Profits as % of Revenue
17,Samsung Electronics,199.356,195.891,21.9278,209.637,18,Information Technology,Semiconductors,Asia,South Korea,South Korea,,Oh-Hyun Kwon,http://www.forbes.com/companies/samsung-electr...,0.104599,0.111939
66,Intel,147.197,55.87,11.704,91.956,67,Information Technology,Semiconductors,North America,United States,California,California,Brian Krzanich,http://www.forbes.com/companies/intel/,0.127278,0.209486
155,Qualcomm,111.774,26.964,8.063,48.447,156,Information Technology,Semiconductors,North America,United States,California,California,Steven Mollenkopf,http://www.forbes.com/companies/qualcomm/,0.166429,0.299028
157,Taiwan Semiconductor,123.323,25.173,8.7086,47.311,158,Information Technology,Semiconductors,Asia,Taiwan,Taiwan,,Te Liu,http://www.forbes.com/companies/taiwan-semicon...,0.184071,0.34595
353,SK Hynix,29.994,16.269,3.9856,24.458,354,Information Technology,Semiconductors,Asia,South Korea,South Korea,,Seong-Wook Park,http://www.forbes.com/companies/sk-hynix/,0.162957,0.244981
354,Micron Technology,29.551,16.948,3.893,23.818,355,Information Technology,Semiconductors,North America,United States,Idaho,Idaho,D Durcan,http://www.forbes.com/companies/micron-technol...,0.163448,0.229703
417,Texas Instruments,59.488,13.045,2.778,17.722,418,Information Technology,Semiconductors,North America,United States,Texas,Texas,Richard Templeton,http://www.forbes.com/companies/texas-instrume...,0.156754,0.212955
592,ASML Holding,44.83,7.768,1.5897,15.812,593,Information Technology,Semiconductors,Europe,Netherlands,Netherlands,,Peter Wennink,http://www.forbes.com/companies/asml-holding/,0.100538,0.204647
681,Applied Materials,27.228,9.241,1.167,13.073,682,Information Technology,Semiconductors,North America,United States,California,California,Gary Dickerson,http://www.forbes.com/companies/applied-materi...,0.089268,0.126285
787,Mediatek,20.947,7.031,1.5311,11.111,788,Information Technology,Semiconductors,Asia,Taiwan,Taiwan,,Ming Tsai,http://www.forbes.com/companies/mediatek/,0.1378,0.217764


## Forbes list  
- 同Industry：1
- 同Sector：2
- 有出現在Forbes list，但Industry與Sector皆不同：3
- Company list上有但Forbes沒有：4

In [20]:
# Label every company in com_ls from its exact-name match in the Forbes table:
#   1 = a matching row has Industry "Semiconductors"
#   2 = otherwise, a matching row has Sector "Information Technology"
#   3 = matched, but neither of the above
#   4 = no row with that company name
forb_ls = []

for i in com_ls:
    forbes_rows = f_data[f_data["Company"] == i]
    if forbes_rows.empty:
        forb_label = 4
    elif (forbes_rows["Industry"] == "Semiconductors").any():
        forb_label = 1
    elif (forbes_rows["Sector"] == "Information Technology").any():
        forb_label = 2
    else:
        forb_label = 3
    forb_ls.append(forb_label)
    

In [21]:
# Forbes-based labels, one per entry of com_ls.
forb_ls

[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

## Company list competitor

In [22]:
# Competitor-based label for each company, written out literally; it is paired
# position-by-position with com_ls and forb_ls further down.
# NOTE(review): presumably assigned by hand with the 1-4 scheme -- confirm the source.
competitor_ls=[4, 3, 4, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2]

## 計算NMI與ARI

In [23]:
# Both scores take (labels_true, labels_pred): the Forbes-based labels are
# passed first, the competitor-based labels second.
nmi, ari = (
    normalized_mutual_info_score(forb_ls, competitor_ls),
    adjusted_rand_score(forb_ls, competitor_ls),
)

print("nmi: ", nmi)
print("ari: ", ari)

nmi:  0.0
ari:  0.0


## 公司名稱和類別

In [24]:
# One row per company: name, competitor-based label, Forbes-based label.
competitor_columns = ['company', '2015_competitor', '2015_forb']
competitor_rows = zip(com_ls, competitor_ls, forb_ls)
competitor_df = pd.DataFrame(competitor_rows, columns=competitor_columns)

In [25]:
# Display the company / label table.
competitor_df

Unnamed: 0,company,2015_competitor,2015_forb
0,Chengdu Qinchuan IoT Technology,4,4
1,SUZHOU XUANHE IOT TECHNOLOGY,3,4
2,Jiangsu IoT Research and Development Center,4,4
3,Nanjing Hansense Iot Technology,1,4
4,Nanjing IoT Sensor Technology,1,4
5,GUANGDONG ABEJ IOT INTELLIGENT TECHNOLOGY,1,4
6,Huawei Technologies,2,4
7,SHENZHEN QIANHAI LINGJU INTERNET OF THINGS SCI...,1,4
8,Shenzhen Qianhai Livall IoT Technology,1,4
9,Tianjin Hui Zhiwulian Science And Technology,1,4


In [26]:
# Write the label table to CSV without the index column.
competitor_csv_path = "../competitor_df/cpc_cn_2015.csv"
competitor_df.to_csv(competitor_csv_path, index=False)