In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the instruction-tuned Qwen checkpoint and its tokenizer once. `model` and
# `tokenizer` are module-level names that run_qwen_prompt() reads later on.
# NOTE(review): torch_dtype="auto" / device_map="auto" delegate dtype and device
# placement to transformers (device_map presumably relies on `accelerate`) — confirm
# against the installed versions.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
# Sample prompt passed to run_qwen_prompt() as a smoke test of the model wrapper.
prompt = "Give me a short introduction to large language model."

In [3]:
def run_qwen_prompt(prompt):
    """Run one user prompt through the loaded Qwen chat model and return the reply.

    Relies on the module-level ``model`` and ``tokenizer`` objects.

    Args:
        prompt: Text placed in the "user" turn of the conversation.

    Returns:
        The decoded completion for the single prompt, with special tokens
        stripped and the echoed prompt tokens removed.
    """
    system_turn = {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."}
    user_turn = {"role": "user", "content": prompt}

    # Render the conversation as plain text (not token ids) ending with the
    # marker that asks the model to start its answer.
    chat_text = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

    encoded = tokenizer([chat_text], return_tensors="pt").to(model.device)
    full_sequences = model.generate(**encoded, max_new_tokens=512)

    # Each generated sequence starts with the prompt tokens; keep only what
    # comes after them.
    completions = []
    for prompt_ids, sequence in zip(encoded.input_ids, full_sequences):
        completions.append(sequence[len(prompt_ids):])

    decoded = tokenizer.batch_decode(completions, skip_special_tokens=True)
    return decoded[0]

In [12]:
# Smoke test: the returned string is displayed as the cell output.
run_qwen_prompt(prompt)

'A large language model is an artificial intelligence (AI) system that can generate human-like text in response to prompts or questions. These models are designed to understand and process natural language input, allowing them to answer questions, provide information, generate creative writing, or perform other tasks requiring human-like communication.\n\nThe term "large" refers to the size of the model\'s parameters—typically measured in billions or trillions of parameters—and the computational resources required to train such models. Large language models use advanced algorithms and techniques, including neural networks, to analyze vast amounts of data and learn patterns, enabling them to make accurate predictions and generate coherent responses based on their training data.\n\nThese models have been applied across various domains, from language translation and chatbots to virtual assistants and content creation tools. They continue to evolve rapidly as researchers develop new method

In [4]:
import csv

class DataFile:
    """Sequential reader over a CSV file whose first line is a header.

    When the header contains *every* column listed in ``make_int`` the file is
    treated as a "players" file and the values in those columns are rewritten
    as integer strings (e.g. ``"154.0"`` -> ``"154"``) as rows are read.

    Attributes:
        file_path: Path the reader was opened on.
        header: List of column names taken from the first CSV line.
        make_int: Column names whose values are normalised to integer strings.
        players: True when all ``make_int`` columns are present in the header.
        change_inds: Header indices of the ``make_int`` columns (empty when
            ``players`` is False).
    """

    def __init__(self, file_path):
        """Open ``file_path`` and read its header line.

        Raises:
            OSError: If the file cannot be opened.
            StopIteration: If the file is empty (no header line).
        """
        self.file_path = file_path
        # Defined before open() so __del__ stays safe if opening fails.
        self.file = None
        self.make_int = ['overall_rating', 'volleys', 'free_kick_accuracy', 'weight']
        self._open()

        # The original test was `all([m in header] for m in ...)`, which
        # iterates over one-element lists (always truthy) and so was always True.
        if all(m in self.header for m in self.make_int):
            self.players = True
            self.change_inds = [i for i, name in enumerate(self.header) if name in self.make_int]
        else:
            self.players = False
            self.change_inds = []

    def _open(self):
        """(Re)open the file and consume the header line."""
        # newline='' is what the csv module expects so that newlines embedded
        # in quoted fields are handled correctly.
        self.file = open(self.file_path, mode='r', newline='')
        self.csv_reader = csv.reader(self.file)
        self.header = next(self.csv_reader)

    def get_next_row(self):
        """Return the next data row as a list of strings, or None at end of file."""
        row = next(self.csv_reader, None)
        if row is None:
            return None

        if self.players:
            for i in self.change_inds:
                try:
                    row[i] = str(int(float(row[i])))
                except (ValueError, OverflowError, IndexError):
                    # Dirty files can hold empty, garbled or missing cells in
                    # these columns; keep the raw value so it can still be
                    # compared and reported instead of aborting the read.
                    continue
        return row

    def get_header(self):
        """Return the list of column names."""
        return self.header

    def reset(self):
        """Rewind to the first data row by closing and reopening the file."""
        if self.file is not None:
            self.file.close()
        self._open()

    def __del__(self):
        # getattr guards against a partially constructed instance.
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

class Dataset:
    """A dirty CSV and its clean counterpart, read in lockstep.

    Each call to ``get_next_row`` advances both files by one row and reports
    which columns differ between the two versions of that row.
    """

    def __init__(self, dirty_file_path, clean_file_path) -> None:
        """Open both files; the dirty one is opened first."""
        self.dirty = DataFile(dirty_file_path)
        self.clean = DataFile(clean_file_path)

    def get_header(self):
        """Return the column names of the dirty file."""
        return self.dirty.get_header()

    def get_next_row(self):
        """Advance both files and compare the rows cell by cell.

        Returns:
            ``(dirty_row, clean_row, error_headers)`` where ``error_headers``
            lists the dirty-file column names whose values differ, or ``None``
            once either file has no more rows.
        """
        # Both readers are advanced before the exhaustion check.
        dirty_row = self.dirty.get_next_row()
        clean_row = self.clean.get_next_row()
        if any(candidate is None for candidate in (dirty_row, clean_row)):
            return None

        column_names = self.dirty.get_header()
        error_headers = []
        for name, dirty_cell, clean_cell in zip(column_names, dirty_row, clean_row):
            if dirty_cell != clean_cell:
                error_headers.append(name)

        return (dirty_row, clean_row, error_headers)

    def reset(self):
        """Rewind both files to their first data row."""
        for reader in (self.dirty, self.clean):
            reader.reset()

In [8]:
# Pair the dirty players file with its clean reference copy; paths are relative to
# the notebook's working directory.
ms = Dataset(dirty_file_path='new_datasets/players_missing.csv', clean_file_path='new_datasets/players_clean.csv')

#header = ms.get_header()

#row = ms.get_next_row()

In [None]:
class Detector:
    """Asks a language model which columns of a table row contain errors.

    A prompt is built from the header and the row, sent through ``run_fn``,
    and the column names that appear in the model's free-text answer are
    returned as the detected error columns.

    Args:
        run_fn: Callable taking a prompt string and returning the model's
            response string.
        model_type: Model identifier; if it contains ``'qwen'`` the Qwen split
            phrases are applied to the response.
        detection_type: ``'metadata'`` adds per-dataset column descriptions to
            the prompt; any other value (default ``'column'``) sends only the
            header and the row.
    """

    def __init__(self, run_fn, model_type='qwen2.5-72B-instruct', detection_type="column"):
        self.run_fn = run_fn
        self.model_type = model_type
        self.detection_type = detection_type

    def _generate_prompt(self, row, header, dataset_type):
        """Build the detection prompt for one row.

        ``row`` and ``header`` are sequences of strings; both are rendered
        pipe-separated.
        """
        formated_header = '|'.join(header)
        formatted_row = '|'.join(row)

        prompt = f"I have tabular data with the following columns {formated_header}. "
        if self.detection_type == 'metadata':
            prompt += self._get_metadata(dataset_type)
        prompt += "Detect if there are any errors, typos, or missing entries in the following row: "
        prompt += f"{formatted_row}. Just report columns with errors."

        return prompt

    def _get_metadata(self, dataset):
        """Return the metadata text for ``dataset``, or '' when none is defined.

        Returning an empty string (rather than falling through to ``None``)
        keeps ``_generate_prompt`` from raising ``TypeError`` on an unknown
        dataset type.
        """
        if dataset == 'players':
            return self._get_players_metadata()
        return ""

    def _get_players_metadata(self):
        """Return the column descriptions and constraints for the players data.

        NOTE(review): the sentences are concatenated without separating
        whitespace, exactly as before; the text is left unchanged so that
        prompts stay identical to earlier runs.
        """
        sentences = (
            "The overall_rating column is an integer between 0 and 100.",
            "The preferred_foot column is either right or left.",
            "The attacking_work_rate column is one of the following: low, medium, high.",
            "The volleys column is an integer between 0 and 100.",
            "The free_kick_accuracy column is an integer between 0 and 100.",
            "The player_name column is a value like Aaron Hunt or Stephane M'Bia or Sung-Yeung Ki or Suso or Tulio de Melo or Victor Hugo Montano",
            "The height column is a float value between 120.0 and 250.0.",
            "The weight column is an integer value between 100 and 300.",
            "The country column is one of the following values: England, France, Germany, Italy, Spain.",
            "The league column is one of the following values: England Premier League, France Ligue 1, Germany 1. Bundesliga, Italy Serie A, kstraklasa,  Spain LIGA BBVA.",
            "If the league column is 'England Premier League' then the country must be 'England'.",
            "If the league column is 'France Ligue 1' then the country must be 'France'.",
            "If the league column is 'Germany 1. Bundesliga' then the country must be 'Germany'.",
            "If the league column is 'Italy Serie A' then the country must be 'Italy'.",
            "If the league column is 'Spain LIGA BBVA' then the country must be 'Spain'.",
            "Only discuss columns with errors. Do NOT mention correct columns. ",
        )
        return "".join(sentences)

    def _get_qwen_split_phrases(self):
        """Return phrases used to cut Qwen responses; currently none."""
        return []

    def get_columns_mentioned_in_response(self, res, header):
        """Return the lower-cased header names that appear in ``res``.

        A column counts as mentioned when its lower-cased name occurs wrapped
        in single quotes, double quotes or asterisks, surrounded by spaces, or
        at the start of a line followed by a space. ``row_id`` is never
        reported. The response itself is matched as-is (not lower-cased).
        """
        header = [col.lower() for col in header]

        cols = [
            col for col in header
            if f"'{col}'" in res
            or f"\"{col}\"" in res
            or f" {col} " in res
            or f"*{col}*" in res
            or f"\n{col} " in res
        ]

        if 'row_id' in cols:
            cols.remove('row_id')

        return cols

    def help_get_response(self, row, header, dataset_type="players", printres=True):
        """Query the model about ``row`` and return the columns it flags.

        Args:
            row: Cell values of the row, as strings.
            header: Column names, as strings.
            dataset_type: Selects the metadata text in ``'metadata'`` mode.
            printres: When true, print the raw response and the parsed columns.

        Returns:
            List of lower-cased column names found in the response.
        """
        prompt = self._generate_prompt(row, header, dataset_type)

        if 'qwen' in self.model_type:
            split_phrases = self._get_qwen_split_phrases()
        else:
            split_phrases = []

        res = self.run_fn(prompt)
        if printres:
            print(res)

        # A split phrase is either a plain string (keep the text before it) or
        # a (phrase, index) tuple choosing which side of the split to keep.
        for phrase in split_phrases:
            split_side = 0
            if isinstance(phrase, tuple):
                phrase, split_side = phrase[0], phrase[1]

            if phrase in res:
                res = res.split(phrase)[split_side]

        errors = self.get_columns_mentioned_in_response(res, header)
        if printres:
            print(errors)
        return errors

In [32]:
# Rewind both CSV readers to their first data row before the next experiment.
ms.reset()

In [11]:
# Column-only detector: the prompt contains just the header and the row.
detector_col = Detector(run_qwen_prompt, model_type='qwen2.5-1.5B-Instruct', detection_type='column')

#errorsc = detector_col.help_get_response(ms.get_next_row()[0], ms.get_header(), dataset_type="players")

#detector_md = Detector(run_qwen_prompt, model_type='qwen2.5-1.5B-Instruct', detection_type='metadata')
#errorsm = detector_md.help_get_response(ms.get_next_row()[0], ms.get_header(), dataset_type="players")

In [10]:
# Metadata detector: the prompt additionally carries the per-column descriptions.
detector_md = Detector(run_qwen_prompt, model_type='qwen2.5-1.5B-Instruct', detection_type='metadata')
#row = ms.get_next_row()
#errorsm = detector_md.help_get_response(row[0], ms.get_header(), dataset_type="players")

[]

In [None]:
import os
class Evaluate:
    """Scores a detector's reported error columns against a dirty/clean dataset."""

    def __init__(self, dataset) -> None:
        """Store the dataset and cache its header.

        ``dataset`` must provide ``get_header()``, ``reset()`` and
        ``get_next_row()`` returning ``(dirty_row, clean_row, error_columns)``
        or ``None`` when exhausted.
        """
        self.dataset = dataset
        self.header = dataset.get_header()

    # precision: correct error detections / total number of error detections
    # recall: correct error detections / total number of errors
    # f1-score: 2 * (precision * recall) / (precision + recall)

    def eval(self, detector, limit=10000, mod=1, dataset_type="players"):
        """Run ``detector`` over the dataset and compute precision/recall/F1.

        Args:
            detector: Object exposing
                ``help_get_response(row, header, dataset_type=..., printres=...)``
                that returns the list of column names it considers erroneous.
            limit: Maximum number of rows to evaluate.
            mod: The detector prints its response only for rows whose index is
                a multiple of ``mod``.
            dataset_type: Forwarded to the detector to select dataset-specific
                metadata.

        Returns:
            ``(num_errors, num_reported_errors, num_correct_reported_errors,
            scores)`` where ``scores`` is a dict holding the same three counts
            plus ``recall``, ``precision`` and ``f1`` (each 0 when its
            denominator is 0).
        """
        self.dataset.reset()

        # Only referenced by the disabled special case further down.
        is_fn = 'fn.csv' in self.dataset.dirty.file_path

        num_reported_errors = 0
        num_correct_reported_errors = 0
        num_errors = 0

        i = 0

        while i < limit:

            r = self.dataset.get_next_row()
            if r is None:
                break

            dirty_row, clean_row, error_columns = r

            num_errors += len(error_columns)

            printres = (i % mod == 0)
            # Passed by keyword: the third positional parameter of
            # help_get_response is dataset_type, so a positional printres
            # would be taken as the dataset type and never reach printres.
            reported_error_columns = detector.help_get_response(
                dirty_row, self.header, dataset_type=dataset_type, printres=printres
            )

            # if is_fn and 'relationship' in error_columns:
            #     if 'sex' in reported_error_columns:
            #         num_correct_reported_errors += 1
            #         num_errors += 1
            #     if 'relationship' in reported_error_columns:
            #         num_correct_reported_errors += 1
            # else:
            for col in reported_error_columns:
                if col in error_columns:
                    # correct error found!
                    num_correct_reported_errors += 1

            num_reported_errors += len(reported_error_columns)

            # Running totals every fifth row.
            if i % 5 == 0:
                print(f"\t\t\t{i}")
                print(f"\t\t\t\t{num_errors, num_reported_errors, num_correct_reported_errors}")

            i += 1

        precision = 0
        if num_reported_errors != 0:
            precision = num_correct_reported_errors / num_reported_errors

        recall = 0
        if num_errors != 0:
            recall = num_correct_reported_errors / num_errors

        f1 = 0
        if precision + recall != 0:
            f1 = 2 * (precision * recall) / (precision + recall)

        scores = {'recall':recall, 'precision':precision, 'f1':f1, 'num_errors': num_errors, 'num_reported_errors': num_reported_errors, 'num_correct_reported_errors':num_correct_reported_errors}

        return (num_errors, num_reported_errors, num_correct_reported_errors, scores)
        

In [18]:
# Score the column-only detector on the players dataset; `output` holds
# (num_errors, num_reported_errors, num_correct_reported_errors, scores).
runner = Evaluate(ms)
output = runner.eval(detector_col)

There is an error in the "preferred_foot" column of the provided row. The correct value should be either "left" or "right", but it has been entered as "right".
['preferred_foot']
			0
				(0, 1, 0)
The error in the provided row is:

- **preferred_foot**: The value "right" should be corrected to either "left" or "both", as it's not specified which foot he prefers.

Here’s the corrected version of the row:
```
89|70|right|medium|33|25|Aaron Hughes|182.88|154|England|England Premier League
```
['preferred_foot']


KeyboardInterrupt: 