In [1]:
import pandas as pd
import spacy
import unidecode
import re
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.pt.stop_words import STOP_WORDS
from typing import Dict, Any,List
import warnings


nlp = spacy.load("pt_core_news_lg")



warnings.filterwarnings("ignore")

warnings.filterwarnings("once", message="This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical")


warnings.filterwarnings("ignore", category=UserWarning, module="transformers.*")


warnings.filterwarnings("ignore", message="Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel")

warnings.filterwarnings("ignore", category=UserWarning, message="Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel")

#  palavras-chave SQL

# Carrega as palavras-chave para SQL
# Mapeamento de consulta SQL para palavras-chave
# Mapeamento de consulta SQL para tipos PySpark

sql_keywords: List[str] = [
    'SELECT',
    'FROM',
    'WHERE',
    'GROUP BY',
    'HAVING',
    
]

sql_to_pyspark_mapping: Dict[str, str] = {
    'select': 'df.select',
    'update': 'df.withColumn',
    'merge': 'deltaTablePeople.alias(\'people\').merge(dfUpdates.alias(\'updates\'), "people.id = updates.id")',
    
}


keywords_mapping: Dict[str, Dict[str, str]] = {
    'select': {
        'nome': 'firstname',
        'idade': 'lastname',
        'produto': 'product',
        'preco': 'price',
        
    },
    'update': {
  
    },
    'merge': {
        
    },
    
}

def preprocess_text(text: str) -> str:
    text = text.lower()
    text = unidecode.unidecode(text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = ' '.join(text.split())
    doc = nlp(text)
    filtered_tokens = [token.lemma_ for token in doc if token.is_alpha and token.lemma_ not in STOP_WORDS]
    processed_text = " ".join(filtered_tokens)
    return processed_text

def create_embeddings(texts: List[str]) -> np.ndarray:
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertModel.from_pretrained('bert-base-multilingual-cased')
    
    embeddings = []
    
    for text in texts:
        input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
        with torch.no_grad():
            output = model(input_ids)
        
        embeddings.append(output.last_hidden_state.mean(dim=1).numpy().flatten()) 
    
    return np.array(embeddings)

def classify_product(query_text: str, keywords_mapping: Dict[str, Dict[str, str]]) -> str:
    query_text = preprocess_text(query_text)
    for query_type, keywords in keywords_mapping.items():
        for keyword, category in keywords.items():
            if keyword in query_text:
                return category
    return 'not_identified'

def convert_sql_to_pyspark(sql_query: str) -> str:

    match = re.match(r'^\s*(\w+)\s+', sql_query)
    if match:
        sql_type = match.group(1).lower()
        if sql_type in sql_to_pyspark_mapping:
            pyspark_query = sql_to_pyspark_mapping[sql_type]
            
          
            fields_match = re.search(r'\bselect\b(.*?)\bfrom\b', sql_query, re.DOTALL | re.IGNORECASE)
            if fields_match:
                fields = fields_match.group(1).strip()
                fields = re.sub(r'(\w+)\s+AS\s+(\w+)', r'"\1", "\2"', fields)
                pyspark_query += f"({fields})"
            
            return pyspark_query
    return 'not_identified'

def main() -> None:
    treino = pd.read_csv(r"C:\mlflowjobs\treino.csv", sep=";")
    avaliacao = pd.read_csv(r"C:\mlflowjobs\avaliacao.csv", sep=";")
    
    treino['titulo_processed'] = treino['querysql'].apply(preprocess_text)
    avaliacao['titulo_processed'] = avaliacao['sql'].apply(preprocess_text)
    
    treino_embeddings = create_embeddings(treino['titulo_processed'])
    avaliacao_embeddings = create_embeddings(avaliacao['titulo_processed'])
    
    avaliacao['tipo'] = ''
    avaliacao['pyspark_query'] = ''
    
    for i in range(len(avaliacao)):
        query_text = avaliacao.loc[i, 'sql']
        tipo_consulta = classify_product(query_text, keywords_mapping)
        pyspark_query = convert_sql_to_pyspark(query_text)
        
        if tipo_consulta:
            avaliacao.at[i, 'tipo'] = tipo_consulta
        else:
            avaliacao.at[i, 'tipo'] = 'not_identified'
          
        if pyspark_query:
            avaliacao.at[i, 'pyspark_query'] = pyspark_query
        else:
            avaliacao.at[i, 'pyspark_query'] = 'not_identified'
          
    print(avaliacao[['sql', 'tipo', 'pyspark_query']])

if __name__ == "__main__":
    main()


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predi

                                                 sql            tipo  \
0                      select nome, peso from pessoa       firstname   
1  SELECT Name, ProductNumber, ListPrice AS Price...  not_identified   
2  SELECT p.Name AS ProductName, \nNonDiscountSal...  not_identified   

                                       pyspark_query  
0                              df.select(nome, peso)  
1  df.select(Name, ProductNumber, "ListPrice", "P...  
2  df.select(p."Name", "ProductName", \nNonDiscou...  
