In [22]:
from transformers import MarianMTModel, MarianTokenizer

# Load the pretrained Chinese-to-English Marian checkpoint and its matching tokenizer.
# `tokenizer` and `model` are module-level globals that translate_text() reads.
# NOTE(review): from_pretrained presumably downloads the checkpoint on first use — confirm
# caching/offline requirements before relying on Restart & Run All.
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def _contains_cjk(text):
    """Return True if ``text`` contains at least one CJK ideograph."""
    # CJK Unified Ideographs (U+4E00..U+9FFF) plus Extension A (U+3400..U+4DBF).
    return any(
        "\u4e00" <= ch <= "\u9fff" or "\u3400" <= ch <= "\u4dbf"
        for ch in text
    )


def translate_text(text):
    """Translate Chinese text to English with the module-level Marian model.

    Values that cannot benefit from zh->en translation are returned unchanged:
    non-string values, and strings without any CJK ideograph (empty strings,
    numbers, dates such as "2003.09-2011.08", text that is already Latin-only).
    Sending those to the model wastes a generation call and can produce
    arbitrary output.

    Args:
        text: The value to translate. Any type is accepted.

    Returns:
        The decoded English translation (str) when ``text`` is a string
        containing CJK ideographs; otherwise ``text`` itself, untouched.
    """
    # Pass through anything that is not Chinese text. This also covers the
    # purely numeric strings that the earlier digit-only check handled.
    if not isinstance(text, str) or not _contains_cjk(text):
        return text
    # truncation=True with max_length=512 means the tokenizer cuts longer
    # inputs at 512 tokens, so very long text is only partially translated.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    translated = model.generate(**inputs)
    # A single input string yields a single sequence; decode it without
    # special tokens.
    return tokenizer.decode(translated[0], skip_special_tokens=True)



In [25]:
# Spot-check the translator on a short field name ("职务" is one of the keys in the dataset's question dicts).
translate_text("职务")

'Functions'

In [4]:
from rouge_score import rouge_scorer

def compute_rouge(reference_text, generated_text):
    """
    Score a generated text against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference_text (str): The ground-truth text (e.g., the dataset "answer").
        generated_text (str): The candidate text being evaluated (e.g., model output).

    Returns:
        dict: Maps "ROUGE-1", "ROUGE-2" and "ROUGE-L" each to a dict holding
        "precision", "recall" and "f1".
    """
    # Scorer metric name -> label used in the returned report (order is preserved).
    metric_labels = {"rouge1": "ROUGE-1", "rouge2": "ROUGE-2", "rougeL": "ROUGE-L"}

    # Stemming is enabled, as before.
    scorer = rouge_scorer.RougeScorer(list(metric_labels), use_stemmer=True)
    raw_scores = scorer.score(reference_text, generated_text)

    # Reshape each score into a plain dict; the scorer's "fmeasure" is reported as "f1".
    return {
        label: {
            "precision": raw_scores[metric].precision,
            "recall": raw_scores[metric].recall,
            "f1": raw_scores[metric].fmeasure,
        }
        for metric, label in metric_labels.items()
    }

In [5]:
# Reference text: an English rendering of a resume-scoring answer.
# NOTE(review): the provenance of this reference translation is not recorded here — TODO document it.
ref_text = "Technical capabilities: 6 points, explanation: Candidates participated in the development and improvement of software projects, showing certain technical capabilities, but may need to be further improved in solving complex technical problems independently. Project management capabilities: 5 points, explanation: assist in the completion of the technical environment construction of the project, but have limited experience in project planning and resource management. Technical support capabilities: 7 points, explanation: Responsible for technical research and technical support, showing good problem solving and customer service capabilities. Compliance with technical specifications: 8 points, explanation: performance in ensuring compliance with technical specifications in project development, showing good professional qualities. Technology platform construction ability: 6 points, explanation: assist in building technology platform according to needs, but may require more practice and learning in innovation and optimization of existing platforms."
# Candidate text: the Marian translation of the Chinese answer (it appears to be the dataset's row 0 "answer" — confirm).
gen_text = translate_text("技术能力：6分, 解释：候选人参与了软件项目的开发和改进，显示出一定的技术能力，但在独立解决复杂技术问题方面可能需要进一步提升。项目管理能力：5分, 解释：协助完成项目的技术环境构建，但在项目规划和资源管理方面的经验有限。技术支持能力：7分, 解释：负责技术调研和技术支持，表现出良好的问题解决和客户服务能力。遵守技术规范：8分, 解释：在保障项目开发中的技术规范遵守方面表现出色，显示出良好的专业素养。技术平台构建能力：6分, 解释：根据需求协助构筑技术平台，但在创新和优化现有平台方面可能需要更多的实践和学习。")

# Bare last expression so the notebook displays the ROUGE report for the pair.
compute_rouge(ref_text, gen_text)

{'ROUGE-1': {'precision': 0.7913669064748201,
  'recall': 0.8396946564885496,
  'f1': 0.8148148148148148},
 'ROUGE-2': {'precision': 0.5507246376811594,
  'recall': 0.5846153846153846,
  'f1': 0.5671641791044777},
 'ROUGE-L': {'precision': 0.7553956834532374,
  'recall': 0.8015267175572519,
  'f1': 0.7777777777777777}}

In [6]:
import pandas as pd

# Read the resume-scoring dataset from an hf:// dataset URL; lines=True parses it as JSON Lines (one record per line).
# NOTE(review): hf:// URLs presumably require the huggingface_hub package to be installed — confirm.
df = pd.read_json("hf://datasets/pandalla/datatager_llm_resume_scoring/llm_resume_scoring.json", lines=True)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,question,answer
0,"{'工作时间': '2003.09-2011.08', '工作内容': '1、参与软件项目的...","技术能力：6分, 解释：候选人参与了软件项目的开发和改进，显示出一定的技术能力，但在独立解决..."
1,"{'工作时间': '2006.04-2013.04', '工作内容': '1、紫米品牌各省区...","销售能力：8分, 解释：候选人成功开发和维护了多个区域代理和直供客户，显示出较强的销售技巧和..."
2,"{'工作时间': '1990.10-2016.02', '工作内容': '1、负责线下校区地...","营销策划与执行能力：8分, 解释：候选人能够有效地策划和执行线下活动，显示出较强的市场推广能..."
3,"{'工作时间': '2005.06-2015.10', '工作内容': '1、带领销售团队达...","销售能力：6分, 解释：候选人能够带领团队达成销售业绩指标，但在销售策略和市场拓展方面可能需..."
4,"{'工作时间': '2000/01-2019/09', '工作内容': '1、通过多渠道添加...","沟通能力：8分, 解释：候选人能够有效地通过电话和面对面沟通与客户建立联系，表现出较强的沟通..."
...,...,...
1560,"{'工作时间': '2009.02-2017.10', '工作内容': '1、参与制订公司营...","市场分析与策略制定能力：8分, 解释：候选人能够有效地分析市场环境并制定相应的营销策略，但在..."
1561,"{'工作时间': '2009/01-2016/02', '工作内容': '1、负责公司官网及...","内容创作与编辑能力：9分, 解释：候选人展现出高水平的文案撰写和内容编辑能力，能够有效提升品..."
1562,"{'工作时间': '1995年11月-2012年04月', '工作内容': '1、接受公司在...",适应能力（接受区域和工作调整）: 评分依据：调整的灵活性和适应新环境的速度。分数：8/10 ...
1563,"{'工作时间': '1993/04-2013/06', '工作内容': '1.专注于为用户提...",技术能力（嵌入式软件驱动开发）: 此岗位要求深厚的技术背景和编程能力。评分依据：技术问题的解...


In [7]:
# Inspect the "question" value stored under index label 0 to see the record structure.
df['question'][0]

{'工作时间': '2003.09-2011.08',
 '工作内容': '1、参与软件项目的开发或者改进；2、负责相关的技术调研及技术支持；3、协助完成项目开发及管理需要的技术环境构建；4、保障项目开发中的技术规范遵守；5、根据需求协助构筑相关的技术平台；',
 '职务': '普工',
 '工作单位': '厦门塔斯曼生物工程有限公司'}

In [None]:
def translate_c1_record(record: dict) -> dict:
    """Translate every key and value of a record dict with translate_text.

    The previous version had an ``else:`` branch with no body, which made the
    cell fail to parse. Its counter ``i`` was never incremented, so the
    translate-everything branch was the only one that could ever run; this
    version keeps exactly that behavior.

    Args:
        record: Mapping whose keys and values are passed to translate_text.
            translate_text decides which values are actually translated.

    Returns:
        A new dict with translated keys mapped to translated values, in the
        original iteration order. ``record`` itself is not modified.
    """
    # If two source keys translate to the same string, the later entry
    # overwrites the earlier one.
    return {
        translate_text(key): translate_text(value)
        for key, value in record.items()
    }

# Translate the "question" value at index label 0 as an end-to-end check of the record translator.
translate_c1_record(df['question'][0])

{'Working hours': '2003.09-2011.08',
 'Work content': 'To participate in the development or improvement of software projects; to be responsible for relevant technical research and technical support; to assist in the completion of the construction of the technological environment for project development and management needs; to guarantee technical compliance in project development; and to assist in the construction of relevant technology platforms, as required;',
 'Functions': 'Popular.',
 '_Other Organiser': 'Xiamen Tasman Biological Engineering Co. Ltd.'}