In [1]:
import nltk
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Download the NLTK resources used by the preprocessing step below: the
# English stopword list and the WordNet data for WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pjh37\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pjh37\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
def preprocess_text(file_path, output_file):
    """Clean the ``abstract`` column of a CSV file and save the result.

    The CSV at ``file_path`` is loaded into a DataFrame and a preview of it is
    printed. If an ``abstract`` column exists, a ``cleaned_abstract`` column is
    added containing, for each row, the abstract lower-cased, stripped of
    punctuation and digits, split on whitespace, filtered of English
    stopwords and lemmatized. If the column is missing a message is printed
    and no column is added. In both cases the DataFrame is written to
    ``output_file`` (UTF-8, without the index).

    Args:
        file_path: Path of the CSV file to read.
        output_file: Path of the CSV file to write.

    Returns:
        The DataFrame that was written to ``output_file``.
    """
    # Read the CSV file.
    df = pd.read_csv(file_path)

    # Show the first few rows so the input can be checked by eye.
    print(f"Processing {file_path}")
    print(df.head())

    # Apply the preprocessing to the 'abstract' column.
    if 'abstract' in df.columns:
        # Build the stopword set and the lemmatizer once per call. Looking the
        # stopword list up again for every token (and creating a lemmatizer
        # for every row) is needlessly slow; a frozenset also gives constant
        # time membership tests instead of a linear list scan.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()

        def clean_text(text):
            """Return the normalized form of a single abstract."""
            # Non-string cells (e.g. NaN for missing values) clean to "".
            if not isinstance(text, str):
                return ""

            # Lower-case, then strip punctuation and digits.
            # NOTE(review): punctuation is removed before the stopword filter,
            # so stopword entries that contain an apostrophe can no longer
            # match their stripped form - confirm this is intended.
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            text = re.sub(r'\d+', '', text)

            # Tokenize on whitespace and drop stopwords.
            tokens = [word for word in text.split() if word not in stop_words]

            # Lemmatize the remaining tokens and re-join them.
            return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

        df['cleaned_abstract'] = df['abstract'].apply(clean_text)
    else:
        print(f"'abstract' 열이 {file_path}에 존재하지 않습니다.")

    # Save the preprocessed DataFrame as a CSV file.
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Processed data saved to {output_file}")

    # Return the preprocessed DataFrame.
    return df

In [9]:
# Preprocess the 2023 wearable-device patents CSV, write the processed copy
# next to it, and keep the returned DataFrame for later cells.
df_2023 = preprocess_text('Data/wearable_devices_patents_2023.csv', 'Data/wearable_devices_processed_2023.csv')

Processing Data/wearable_devices_patents_2023.csv
  patent_number                                              title  \
0  US12039680B2  Method of rendering using a display device \n ...   
1  US12095833B2  System and method for augmented and virtual re...   
2  US11886261B2  Temperature management in wearable devices \n ...   
3  US12254980B2  Senior living engagement and care support plat...   
4  US12212168B2  Circuits and methods for wearable device charg...   

                                            abstract  date_filed  date_granted  
0  \n     One embodiment is directed to a system ...         NaN           NaN  
1  \n     One embodiment is directed to a system ...         NaN           NaN  
2  \n     Systems and methods for managing temper...         NaN           NaN  
3  \n     Provided herein is a digital care circl...         NaN           NaN  
4  \n     Methods and devices for wired charging ...         NaN           NaN  
Processed data saved to Data/wearable_devic