# In [None]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Gather the full path of every .txt file in the project-description folder.
path = "F:/Paper_goal/MKT_research/RA/PKU-park/dropbox/Crowdfunding/main_description_new_rerun_20110920_20130920/main_description_full"
file_list = [
    os.path.join(path, entry)
    for entry in os.listdir(path)
    if entry.endswith('.txt')
]

data = []

# Read the files behind a progress bar, producing one record per file.
for file in tqdm(file_list, desc="读取文件"):
    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        raw = f.read()
    # Join the lines with spaces so each document becomes one single-line string.
    text = ' '.join(raw.split('\n'))
    data.append(dict(Tweet=text, FileName=os.path.basename(file)))

# One row per file: the flattened text ('Tweet') and its base name ('FileName').
study3 = pd.DataFrame(data)

# Load the Brysbaert concreteness ratings (tab-separated) and keep only the
# rows whose mean rating 'Conc.M' is numeric.
# NOTE(review): read_csv's default NA parsing turns literal entries such as
# "null", "nan" or "NA" into NaN; if the 'Word' column contains such words
# they are lost here — confirm against the data file.
brysbaert = pd.read_csv(
    'F:/Paper_goal/MKT_research/RA/PKU-park/dropbox/LIWC/R_Functions/brysbaert.txt',
    delimiter='\t',
)
# Values that cannot be parsed as numbers become NaN and are filtered out below.
brysbaert['Conc.M'] = pd.to_numeric(brysbaert['Conc.M'], errors='coerce')
brysbaert = brysbaert[brysbaert['Conc.M'].notna()]

# Compute the mean Brysbaert concreteness score (BCI) of each text.
def brysbaert_calculator(texts, keep):
    """Return the mean Brysbaert concreteness rating of each text.

    Each text is lower-cased and split on whitespace. Every token that is in
    ``keep`` and has a rating in the module-level ``brysbaert`` table
    contributes its ``Conc.M`` value once per occurrence; the total is divided
    by the number of contributing tokens.

    Args:
        texts: Iterable of strings to score.
        keep: Iterable of words that are allowed to contribute to the score.

    Returns:
        A list with one score per text, in input order. A text without any
        rated token scores 0.0.
    """
    # Build the lookup once per call: set/dict membership is O(1), whereas
    # testing against the raw ``keep`` list and re-scanning the whole table
    # for every text is linear in the lexicon size.
    allowed = set(keep)
    # If a word is listed more than once in the table, the last rating wins.
    rating_by_word = {
        word: rating
        for word, rating in zip(brysbaert['Word'], brysbaert['Conc.M'])
        if word in allowed
    }

    scores = []
    for text in tqdm(texts, desc="计算BCI值"):
        # Look up every occurrence so the numerator and the denominator count
        # the same tokens. Summing over the distinct matched words while
        # dividing by the number of occurrences deflates the mean of any text
        # that repeats a word.
        ratings = [
            rating_by_word[token]
            for token in text.lower().split()
            if token in rating_by_word
        ]
        scores.append(sum(ratings) / len(ratings) if ratings else 0.0)
    return scores

# Score every document in study3, allowing all words of the ratings table.
study3['bryscore'] = brysbaert_calculator(
    texts=study3['Tweet'],
    keep=list(brysbaert['Word']),
)
# Reverse the score as 6 - score, so that higher means less concrete
# (presumably the ratings are on a 1-5 scale — TODO confirm).
study3['bryscorer'] = study3['bryscore'].rsub(6)

# Standardize the reversed score to zero mean and unit variance.
scaler = StandardScaler()
reversed_scores = study3[['bryscorer']]
study3['zbryscorer'] = scaler.fit(reversed_scores).transform(reversed_scores)

# Collect the reversed score of every file into a mapping keyed by file name
# and write it to a JSON file.
results = defaultdict(dict)
for file_name, reversed_score in zip(study3['FileName'], study3['bryscorer']):
    results[file_name] = {'bryscorer': reversed_score}

# ensure_ascii=False writes non-ASCII file names verbatim, so the file is
# opened as UTF-8 explicitly. With the platform default encoding such names
# could raise UnicodeEncodeError or produce a file that is not valid UTF-8 JSON.
with open(
    "F:/Paper_goal/MKT_research/RA/PKU-park/dropbox/LIWC/Study 3/results_all.json",
    'w',
    encoding='utf-8',
) as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4)
