In [25]:
!pip install gliner==0.1.12

Collecting urllib3<2.0.0,>=1.0.0 (from flair==0.13.1->gliner==0.1.12)
  Using cached urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
Using cached urllib3-1.26.20-py2.py3-none-any.whl (144 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.5.0
    Uninstalling urllib3-2.5.0:
      Successfully uninstalled urllib3-2.5.0
Successfully installed urllib3-1.26.20


In [6]:
!pip install --upgrade urllib3

Collecting urllib3
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Using cached urllib3-2.5.0-py3-none-any.whl (129 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.26.20
    Uninstalling urllib3-1.26.20:
      Successfully uninstalled urllib3-1.26.20
Successfully installed urllib3-2.5.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flair 0.13.1 requires urllib3<2.0.0,>=1.0.0, but you have urllib3 2.5.0 which is incompatible.


In [26]:
from gliner import GLiNER
from transformers import get_cosine_schedule_with_warmup

import pandas as pd
import torch
import os

In [27]:
model = GLiNER.from_pretrained("numind/NuZero_token")



## 1. Dimensions

In [28]:
# CSV 파일 읽기
df_2023 = pd.read_csv('Data/wearable_devices_processed_2023.csv', usecols=['cleaned_abstract'])

In [29]:
# 중복 토큰 제거
def remove_duplicates(text):
    words = text.split()
    unique_words = []
    seen = set()
    for word in words:
        if word not in seen:
            unique_words.append(word)
            seen.add(word)
    return ' '.join(unique_words)

In [30]:
# Apply the function to the cleaned_abstract column
df_2023['remove_duplicates_abstract'] = df_2023['cleaned_abstract'].apply(remove_duplicates)

# 초록 토큰 수 계산
df_2023['remove_duplicates_token'] = df_2023['remove_duplicates_abstract'].apply(lambda x: len(x.split()))

# 결과 출력
df_2023.head()

Unnamed: 0,cleaned_abstract,remove_duplicates_abstract,remove_duplicates_token
0,one embodiment directed system enabling two us...,one embodiment directed system enabling two us...,41
1,one embodiment directed system enabling two us...,one embodiment directed system enabling two us...,41
2,system method managing temperature wearable de...,system method managing temperature wearable de...,25
3,provided herein digital care circle platform e...,provided herein digital care circle platform e...,42
4,method device wired charging communication wea...,method device wired charging communication wea...,27


In [31]:
# 모든 행을 하나의 문자열로 결합
combined_text = ' '.join(df_2023['remove_duplicates_abstract'])

# 공백으로 토큰화하여 중복 제거
tokens = combined_text.split()
unique_tokens = set(tokens)  # set을 사용하여 중복을 제거
unique_tokens_list = list(unique_tokens)  # DataFrame에 저장하기 전에 리스트로 변환

# 중복이 제거된 토큰들을 공백으로 다시 연결
final_text = ' '.join(unique_tokens)

# 한 행당 384개의 토큰으로 저장(NuNER max token)
chunk_size = 384
rows = [unique_tokens_list[i:i + chunk_size] for i in range(0, len(unique_tokens_list), chunk_size)]

# DataFrame 생성
unique_df = pd.DataFrame({"tokens": rows})

# DataFrame 결과 확인
unique_df['tokens'].head()

0    [arena, prelearning, sea, ssb, rmsi, education...
1    [pfl, paneltouch, reauthenticating, pixellevel...
2    [straight, import, inktoner, warranty, draft, ...
3    [predecoded, timevarying, since, reestablishin...
4    [trailer, fall, hump, psf, keratinoytes, amorp...
Name: tokens, dtype: object

## 2. Values

In [32]:
# CSV 파일 읽기
file_path_2023_topic_8 = 'Data/wearable_devices_2023_lda_topic_8_with_labels.csv'
file_path_2023_topic_12 = 'Data/wearable_devices_2023_lda_topic_12_with_labels.csv'
file_path_2023_topic_16 = 'Data/wearable_devices_2023_lda_topic_16_with_labels.csv'

data_2023_topic_8 = pd.read_csv(file_path_2023_topic_8)
data_2023_topic_12 = pd.read_csv(file_path_2023_topic_12)
data_2023_topic_16 = pd.read_csv(file_path_2023_topic_16)

# 'label' 열만 선택하여 dimension 추출
label_2023_topic_8 = pd.DataFrame(data_2023_topic_8['label'])
label_2023_topic_12 = pd.DataFrame(data_2023_topic_12['label'])
label_2023_topic_16 = pd.DataFrame(data_2023_topic_16['label'])

# NuZero requires labels to be lower-cased
label_2023_topic_8 = label_2023_topic_8['label'].str.lower()
label_2023_topic_12 = label_2023_topic_12['label'].str.lower()
label_2023_topic_16 = label_2023_topic_16['label'].str.lower()

In [15]:
# 결과 확인
label_2023_topic_8

0           data processing
1    wireless communication
2            user interface
3              audio system
4        optical technology
5                 materials
6            network system
7                   sensors
Name: label, dtype: object

In [16]:
# 결과 확인
label_2023_topic_12

0            data processing
1     wireless communication
2               applications
3                    sensors
4             user interface
5           external devices
6             network system
7                    display
8               audio system
9         optical technology
10                 materials
11                    memory
Name: label, dtype: object

In [33]:
# 결과 확인
label_2023_topic_16

0             user interface
1      wireless signal setup
2                    circuit
3            medical devices
4               audio system
5                      power
6             network system
7                    display
8                     memory
9            data processing
10     display manufacturing
11             semiconductor
12                 emergency
13    wireless communication
14                   sensors
15          external devices
Name: label, dtype: object

### 1) Topic = 8

In [18]:
# 추출 결과를 저장할 리스트
results_2023_topic_8_entity = []

# 각 텍스트에 대해 엔티티 추출
for tokens in unique_df['tokens']:
    if isinstance(tokens, list):
        text = ' '.join(tokens)
    else:
        text = str(tokens)

    entities = model.predict_entities(text, label_2023_topic_8)

    for entity in entities:
        #print(entity["text"], "=>", entity["label"])
        results_2023_topic_8_entity.append({
            'Text': entity['text'],
            'Label': entity['label']
        })

# 결과 리스트를 데이터 프레임으로 변환
df_2023_topic_8_entity = pd.DataFrame(results_2023_topic_8_entity)

# Label별로 Text를 합치기
df_2023_topic_8_grouped = df_2023_topic_8_entity.groupby('Label')['Text'].apply(lambda x: ', '.join(x)).reset_index()

# 결과 데이터 프레임 출력
df_2023_topic_8_grouped.head()

Unnamed: 0,Label,Text
0,audio system,"earbuds, earbud, earphone, microphone, speaker..."
1,data processing,"bytestream, datastreaming, datahandling, pipel..."
2,materials,"iridium, hexylammonium, graphite, glassceramic..."
3,network system,ethernet
4,optical technology,"nearinfrared, waveguide, fiberoptic, lightscat..."


In [19]:
# 결과 저장
output_file_2023_topic_8 = 'Data/wearable_devices_2023_lda_topic_8_entities.csv'
df_2023_topic_8_grouped.to_csv(output_file_2023_topic_8, index=False)

### 2) Topic = 12

In [20]:
# 추출 결과를 저장할 리스트
results_2023_topic_12_entity = []

# 각 텍스트에 대해 엔티티 추출
for tokens in unique_df['tokens']:
    if isinstance(tokens, list):
        text = ' '.join(tokens)
    else:
        text = str(tokens)

    entities = model.predict_entities(text, label_2023_topic_12)

    for entity in entities:
        #print(entity["text"], "=>", entity["label"])
        results_2023_topic_12_entity.append({
            'Text': entity['text'],
            'Label': entity['label']
        })

# 결과 리스트를 데이터 프레임으로 변환
df_2023_topic_12_entity = pd.DataFrame(results_2023_topic_12_entity)

# Label별로 Text를 합치기
df_2023_topic_12_grouped = df_2023_topic_12_entity.groupby('Label')['Text'].apply(lambda x: ', '.join(x)).reset_index()

# 결과 데이터 프레임 출력
df_2023_topic_12_grouped.head()

Unnamed: 0,Label,Text
0,applications,watch
1,audio system,headphone
2,data processing,"bytestream, datastreaming, datahandling, beamf..."
3,display,"microdisplays, lcd, microdisplay"
4,external devices,smartwatches


In [21]:
# 결과 저장
output_file_2023_topic_12 = 'Data/wearable_devices_2023_lda_topic_12_entities.csv'
df_2023_topic_12_grouped.to_csv(output_file_2023_topic_12, index=False)

### 3) Topic = 16

In [34]:
# 추출 결과를 저장할 리스트
results_2023_topic_16_entity = []

# 각 텍스트에 대해 엔티티 추출
for tokens in unique_df['tokens']:
    if isinstance(tokens, list):
        text = ' '.join(tokens)
    else:
        text = str(tokens)

    entities = model.predict_entities(text, label_2023_topic_16)

    for entity in entities:
        #print(entity["text"], "=>", entity["label"])
        results_2023_topic_16_entity.append({
            'Text': entity['text'],
            'Label': entity['label']
        })

# 결과 리스트를 데이터 프레임으로 변환
df_2023_topic_16_entity = pd.DataFrame(results_2023_topic_16_entity)

# Label별로 Text를 합치기
df_2023_topic_16_grouped = df_2023_topic_16_entity.groupby('Label')['Text'].apply(lambda x: ', '.join(x)).reset_index()

# 결과 데이터 프레임 출력
df_2023_topic_16_grouped.head()

Unnamed: 0,Label,Text
0,audio system,headphone
1,circuit,"udi, circuit, capacitor, subcircuits, circuitry"
2,data processing,"bytestream, datastreaming, datahandling, beamf..."
3,display,lcd
4,medical devices,"inktoner, condom, cigarette, earbuds, oximeter..."


In [35]:
# 결과 저장
output_file_2023_topic_16 = 'Data/wearable_devices_2023_lda_topic_16_entities.csv'
df_2023_topic_16_grouped.to_csv(output_file_2023_topic_16, index=False)