In [1]:
# 从一个文件夹中，自动读取txt文件，然后计算情感二分类
import os
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import logging

def load_files_from_directory(directory_path):
    texts = []
    file_names = []
    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)
            for encoding in encodings:
                try:
                    with open(file_path, 'r', encoding=encoding) as file:
                        texts.append(file.read())
                        file_names.append(filename)
                        break
                except UnicodeDecodeError:
                    continue
            else:
                print(f"Could not decode file {filename} with any of the tried encodings.")
    return texts, file_names

# Directory containing txt files
directory_path = '/data1/dxw_data/llm/RA/park/main_description_full'

# Load data
texts, file_names = load_files_from_directory(directory_path)

# Load pre-trained sentiment analysis pipeline
model_name = '/data1/dxw_data/llm/RA/park/sentiment-roberta-large-english'
sentiment_analyzer = pipeline('sentiment-analysis', model=model_name)

# Start index
start_index = 0  # Change this to the desired start index

# Configure logging
logging.basicConfig(filename='/data1/dxw_data/llm/RA/park/error_log.txt', level=logging.DEBUG)

# Predict the sentiment of the texts from the directory
predictions = []
batch_size = 2000

for idx in tqdm(range(start_index, len(texts)), desc="Processing texts", initial=start_index, total=len(texts)):
    try:
        # Pass the text directly to the sentiment analyzer
        prediction = sentiment_analyzer(texts[idx][:512])[0]
        predictions.append(prediction)
        
        # Save results every batch_size iterations
        if (idx + 1) % batch_size == 0 or idx + 1 == len(texts):
            predicted_labels = [pred['label'].lower() for pred in predictions]
            results_df = pd.DataFrame({'file_name': file_names[start_index: start_index + len(predictions)], 'sentiment': predicted_labels})
            output_excel_path = f'/data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_{start_index}_{start_index + len(predictions)}.xlsx'
            results_df.to_excel(output_excel_path, index=False)
            print(f"Sentiment analysis results have been saved to {output_excel_path}")
            start_index += batch_size
            predictions = []
    except Exception as e:
        logging.error(f"Error processing index {idx}: {str(e)}")
        logging.error(f"Text: {texts[idx][:512]}")
        print(f"Error processing index {idx}: {str(e)}")


2024-07-01 08:08:29.233962: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-01 08:08:29.373296: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-01 08:08:29.949143: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64
2024-07-01 08:08:29.949207: W tensorflow/compiler/xla/stream_exec

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_18000_20000.xlsx


Processing texts:  30%|███       | 20609/67891 [05:54<1:57:14,  6.72it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors
Processing texts:  30%|███       | 20611/67891 [05:54<1:27:36,  8.99it/s]

Error processing index 20609: The expanded size of the tensor (594) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 594].  Tensor sizes: [1, 514]


Processing texts:  32%|███▏      | 22001/67891 [09:01<1:49:22,  6.99it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_20000_21999.xlsx


Processing texts:  35%|███▌      | 24001/67891 [13:30<1:32:31,  7.91it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_22000_24000.xlsx


Processing texts:  38%|███▊      | 26001/67891 [18:08<1:38:45,  7.07it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_24000_26000.xlsx


Processing texts:  41%|████      | 28001/67891 [22:42<2:28:05,  4.49it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_26000_28000.xlsx


Processing texts:  43%|████▎     | 29155/67891 [25:22<1:16:20,  8.46it/s]

Error processing index 29153: The expanded size of the tensor (1014) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 1014].  Tensor sizes: [1, 514]


Processing texts:  44%|████▍     | 30001/67891 [27:18<1:24:45,  7.45it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_28000_29999.xlsx


Processing texts:  47%|████▋     | 32001/67891 [31:47<1:24:53,  7.05it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_30000_32000.xlsx


Processing texts:  50%|█████     | 34001/67891 [36:22<1:20:13,  7.04it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_32000_34000.xlsx


Processing texts:  53%|█████▎    | 36001/67891 [40:59<1:30:45,  5.86it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_34000_36000.xlsx


Processing texts:  56%|█████▌    | 38001/67891 [45:36<1:10:46,  7.04it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_36000_38000.xlsx


Processing texts:  59%|█████▉    | 40001/67891 [50:14<1:13:55,  6.29it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_38000_40000.xlsx


Processing texts:  62%|██████▏   | 42001/67891 [55:08<1:15:46,  5.69it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_40000_42000.xlsx


Processing texts:  65%|██████▍   | 44001/67891 [1:00:05<1:02:49,  6.34it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_42000_44000.xlsx


Processing texts:  68%|██████▊   | 46001/67891 [1:05:01<1:03:42,  5.73it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_44000_46000.xlsx


Processing texts:  71%|███████   | 48001/67891 [1:10:00<53:21,  6.21it/s]  

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_46000_48000.xlsx


Processing texts:  74%|███████▎  | 50001/67891 [1:15:06<51:27,  5.80it/s]  

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_48000_50000.xlsx


Processing texts:  77%|███████▋  | 52001/67891 [1:20:02<36:16,  7.30it/s]  

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_50000_52000.xlsx


Processing texts:  80%|███████▉  | 54001/67891 [1:24:55<37:32,  6.17it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_52000_54000.xlsx


Processing texts:  82%|████████▏ | 56001/67891 [1:29:47<27:16,  7.26it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_54000_56000.xlsx


Processing texts:  85%|████████▌ | 58001/67891 [1:34:40<24:57,  6.60it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_56000_58000.xlsx


Processing texts:  88%|████████▊ | 60001/67891 [1:39:35<21:34,  6.10it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_58000_60000.xlsx


Processing texts:  91%|█████████▏| 62001/67891 [1:44:30<14:30,  6.77it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_60000_62000.xlsx


Processing texts:  94%|█████████▍| 64001/67891 [1:49:23<10:23,  6.24it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_62000_64000.xlsx


Processing texts:  97%|█████████▋| 66001/67891 [1:54:16<04:42,  6.68it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_64000_66000.xlsx


Processing texts:  97%|█████████▋| 66170/67891 [1:54:40<03:09,  9.10it/s]

Error processing index 66168: The expanded size of the tensor (718) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 718].  Tensor sizes: [1, 514]


Processing texts: 100%|██████████| 67891/67891 [1:58:52<00:00,  6.99it/s]

Sentiment analysis results have been saved to /data1/dxw_data/llm/RA/park/output/sentiment_analysis_results_66000_67890.xlsx



