In [8]:
import pandas as pd
import re
import chardet
import jieba

# Load the tab-separated dev split into a DataFrame.
df = pd.read_csv('dev.csv', sep='\t')

# Load the stopword list. chardet reports stopwords.txt as GB2312, so decode it
# explicitly instead of relying on the platform default encoding; gb18030 is a
# superset of GB2312/GBK and reads the same bytes.
with open('stopwords.txt', 'r', encoding='gb18030') as stopwords_file:
    stopwords = stopwords_file.read().splitlines()
# Set copy for O(1) membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Compiled once because they are applied to every row.
ENGLISH_LETTERS = re.compile(r'[a-zA-Z]+')
NON_WORD_OR_DIGIT = re.compile(r'[\W\d]+')


def clean_text(text):
    """Tokenize one merged string with jieba and strip unwanted content.

    Steps, in order: segment with ``jieba.cut``, drop tokens found in
    ``stopword_set``, delete runs of ASCII letters, replace every run of
    non-word characters and digits with a single space, and trim the ends.

    Args:
        text: The merged title/description string for one row.

    Returns:
        The cleaned, space-separated string; empty if nothing survives.
    """
    # join + split (rather than iterating the tokens directly) also discards
    # whitespace-only tokens produced by the segmenter.
    tokens = ' '.join(jieba.cut(text)).split()
    kept = ' '.join(token for token in tokens if token not in stopword_set)
    kept = ENGLISH_LETTERS.sub('', kept)
    return NON_WORD_OR_DIGIT.sub(' ', kept).strip()


# Merge the "title" and "desc" columns. Missing values become empty strings so
# one NaN cell does not turn the whole merged value into NaN.
df['merged'] = (
    df['title'].fillna('').astype(str) + ' ' + df['desc'].fillna('').astype(str)
)

# Tokenize and clean in a single pass over the column.
df['merged'] = df['merged'].apply(clean_text)

# Drop rows whose merged text is empty after cleaning.
df = df[df['merged'].str.len() > 0].copy()

# Print how many rows each label has.
# NOTE(review): the recorded output shows one label with a leading space;
# labels are written out unchanged here - confirm whether they should be
# stripped before training.
label_counts = df['label'].value_counts()
print(label_counts)

# Write one "<cleaned text>\t<label>" line per row, UTF-8 encoded.
with open('dev.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        '{}\t{}\n'.format(text, label)
        for text, label in zip(df['merged'], df['label'])
    )


文学         13469
童书          5996
大中专教材教辅     5395
工业技术        3289
中小学教辅       2603
艺术          2396
社会科学        2316
小说          2191
计算机与互联网     2054
管理          1851
建筑          1788
外语学习        1493
历史          1455
科学与自然       1420
法律          1256
政治/军事       1209
哲学/宗教       1012
医学           998
经济           936
励志与成功        921
考试           869
传记           761
青春文学         746
 文化          707
农业/林业        567
动漫           442
育儿/家教        390
烹饪/美食        375
国学/古籍        357
旅游/地图        354
健身与保健        348
科普读物         329
孕产/胎教        301
金融与投资        186
婚恋与两性         63
Name: label, dtype: int64


In [22]:
cd ..

C:\Users\hfu4


In [11]:
cd C:\Users\hfu4\03-bert-TextCNN

C:\Users\hfu4\03-bert-TextCNN


In [15]:
!pip install --upgrade transformers

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting transformers
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/e8/b5/ddb16f9de207e6571ab7cc5db0cc538fa2d6d91cf024565496462af4c1ce/transformers-4.29.1-py3-none-any.whl (7.1 MB)
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                              0.0/7.1 MB ? eta -:--:--
                                   

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
textclf 0.1.0 requires transformers==2.4.1, but you have transformers 4.29.1 which is incompatible.


^C


In [12]:
import chardet

def detect_encoding(file_path):
    """Guess the text encoding of a file with chardet.

    Reads the whole file as bytes, runs ``chardet.detect`` on them, prints the
    detected encoding together with its confidence, and returns the encoding.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The ``'encoding'`` field of chardet's result, so callers can pass it
        on (for example to ``open(..., encoding=...)``) instead of copying it
        from the printed output.
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    # Detection only needs the bytes, so it runs after the file is closed.
    result = chardet.detect(raw_data)
    encoding = result['encoding']
    confidence = result['confidence']
    print(f"Detected encoding: {encoding} (confidence: {confidence})")
    return encoding


# Trailing semicolon keeps the cell output to the printed line only.
detect_encoding('stopwords.txt');

Detected encoding: GB2312 (confidence: 0.99)


Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting chardet
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/74/8f/8fc49109009e8d2169d94d72e6b1f4cd45c13d147ba7d6170fb41f22b08f/chardet-5.1.0-py3-none-any.whl (199 kB)
                                              0.0/199.1 kB ? eta -:--:--
     ------                                   30.7/199.1 kB ? eta -:--:--
     ------                                   30.7/199.1 kB ? eta -:--:--
     -------                               41.0/199.1 kB 279.3 kB/s eta 0:00:01
     -------                               41.0/199.1 kB 279.3 kB/s eta 0:00:01
     -------                               41.0/199.1 kB 279.3 kB/s eta 0:00:01
     -------                               41.0/199.1 kB 279.3 kB/s eta 0:00:01
     -------                               41.0/199.1 kB 279.3 kB/s eta 0:00:01
     -----------                           61.4/199.1 kB 136.5 kB/s eta 0:00:02
     -----------                           61.4/199.1 k