In [2]:
# Note: need to install psycopg2 from source if using in production environment
# https://www.psycopg.org/docs/install.html
# %pip install sqlglot sqlvalidator sqlalchemy psycopg2-binary sqlfluff mysql-connector-python pyodbc google-cloud-bigquery
# %pip install pyflakes pylint parso flake8 mypy ruff
# %pip install docker

In [65]:
from importlib import reload
import sql_parsers
reload(sql_parsers)

import python_parsers
reload(python_parsers)

<module 'python_parsers' from '/mnt/foundation-shared/nina_xu_gretel_ai/navigator-helpers/experiments/code_validation/python_parsers.py'>

In [66]:
import docker
import pandas as pd
import time

from functools import partial

from python_parsers import *
from sql_parsers import *

In [2]:
# Single place to repoint the notebook when the datasets live somewhere else.
DATA_DIR = '/mnt/foundation-shared/nina_xu_gretel_ai/datasets'

# SQL prompts: the multi-dialect file plus the GoogleSQL file, stacked and
# re-indexed so that every row has a unique integer label.
sql_queries = pd.read_csv(f'{DATA_DIR}/sql_queries_w_dialect_1000.csv')
sql_queries_googlesql = pd.read_csv(f'{DATA_DIR}/sql_queries_googlesql_200.csv')
sql_queries = pd.concat([sql_queries, sql_queries_googlesql], ignore_index=True)
# NOTE: variable name is spelled `python_typscript_codes` (sic); kept so that
# any other cell referring to it keeps working.
python_typscript_codes = pd.read_csv(f'{DATA_DIR}/python_typescript_codes.csv')
python_codes = pd.read_json(f'{DATA_DIR}/text_to_python_v1.json')

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 100)

# Give each row a unique db name because otherwise BigQuery struggles with the same db name
sql_queries['id_tmp'] = sql_queries.index
# Vectorized equivalent of formatting f"db_{id_tmp}" row by row.
sql_queries['db_name'] = 'db_' + sql_queries['id_tmp'].astype(str)

# Basic cleaning. At least BigQuery errors out if there are newlines like 'CREATE\nTABLE'
# `.str.replace(..., regex=False)` does a literal replacement and leaves missing
# values untouched instead of raising on them.
for text_column in ['SQL Query', 'Context']:
    sql_queries[text_column] = sql_queries[text_column].str.replace('\n', ' ', regex=False)

# SQL Code Validation

In [36]:
# Only the last expression of a cell is rendered automatically; a bare
# `sql_queries.head(1)` in the middle of the cell is evaluated and thrown away.
# `display` (injected into the namespace by the IPython kernel) renders it explicitly.
display(sql_queries.head(1))
print(sql_queries.Dialect.value_counts())

sql_queries.tail(1)

Dialect
SQL Server            230
PostgreSQL            220
SQLite                209
GoogleSQL             209
MySQL                 196
Oracle SQL             42
OracleSQL              42
Oracle                 37
Oracle SQL Dialect      1
Name: count, dtype: int64


Unnamed: 0,ID,Natural Language Prompt,Context,SQL Query,Domain,Topic,Dialect,Complexity
1185,745d3e2c-8f57-4a3a-a7a5-d1f225a575de,What is the average performance score by\ndepartment for the past year?,"CREATE TABLE performance_reviews ( review_id\nSTRING NOT NULL, employee_id STRING NOT NULL,\ndepartment STRING NOT NULL, review_date DATE\nNOT NULL, performance_score INTEGER NOT NULL\n); CREATE TABLE employees ( employee_id\nSTRING NOT NULL, first_name STRING NOT NULL,\nlast_name STRING NOT NULL, hire_date DATE NOT\nNULL, department STRING NOT NULL );","SELECT department, AVG(performance_score) as\naverage_score FROM performance_reviews WHERE\nreview_date BETWEEN DATE_SUB(CURRENT_DATE(),\nINTERVAL 1 YEAR) AND CURRENT_DATE() GROUP BY\ndepartment;",Human Resources,Performance Management,GoogleSQL,4


In [47]:
# Number of rows per value of the `Complexity` column.
sql_queries.Complexity.value_counts()

Complexity
3    689
2    447
4     35
1     15
Name: count, dtype: int64

In [97]:
"""
Have a PostgreSQL database running in a Docker container. In command line, run the following commands:
# Grant access to non-root users so that the python client will work
> sudo groupadd docker
> sudo usermod -aG docker $USER
> newgrp docker

> docker pull postgres
> docker run --name my-postgres \
  -e POSTGRES_USER=myuser \
  -e POSTGRES_PASSWORD=mypassword \
  -e POSTGRES_DB=mydatabase \
  -p 5433:5432 \
  -d postgres

"""
# Docker SDK client built from the environment; the later database cells reuse it.
client = docker.from_env()

# List all running containers
# NOTE(review): `containers` is not read again in this cell.
containers = client.containers.list(all=False)
# Get the postgres container
postgres_container = client.containers.get('my-postgres')
# Get the container's gateway; note that it's not the "IPAddress" field
postgres_container_gateway = postgres_container.attrs['NetworkSettings']['Gateway']
print(postgres_container_gateway)

# Connection settings handed to the PostgreSQL validator as `db_creds`.
# NOTE(review): the setup command above creates POSTGRES_DB=mydatabase, while
# "dbname" here is the container name — confirm which one the validator expects.
postgres_db_creds = {
        "host": postgres_container_gateway,
        "port": 5433, # the default port is 5432, but that was already in use for me
        "user": "myuser",
        "password": "mypassword",
        "dbname": "my-postgres",
    }

172.17.0.1


In [98]:
"""
Have a MySQL database running in a Docker container. In command line, run the following commands:
> docker pull mysql
> docker run --name my-mysql \
  -e MYSQL_ROOT_PASSWORD=myrootpassword \
  -d mysql
"""

# Look up the MySQL container by the name used in the `docker run` command above.
# Relies on `client` created in the PostgreSQL cell.
mysql_container = client.containers.get('my-mysql')
# The run command publishes no port, so the container's own address is used as host.
mysql_container_ip = mysql_container.attrs['NetworkSettings']['IPAddress']
print(mysql_container_ip)

# Connection settings handed to the MySQL validator as `db_creds`;
# user/password match MYSQL_ROOT_PASSWORD from the setup command.
mysql_db_creds = {
    "host": mysql_container_ip,
    "port": 3306, # default port for mysql
    "user": "root",
    "password": "myrootpassword",
}

172.17.0.3


In [99]:
"""
Have a Microsoft SQL Server database running in a Docker container. In command line, run the following commands:
$ docker pull mcr.microsoft.com/mssql/server
$ docker run --name my-sqlserver \
  -e 'ACCEPT_EULA=Y' -e 'MSSQL_SA_PASSWORD=myRoot(!)Password' \
  -p 1433:1433 \
  -d mcr.microsoft.com/mssql/server

$ sudo apt install unixodbc-dev

Install the SQL Server command-line tool (sqlcmd) inside the container:
$ docker exec -it --user root my-sqlserver bash
# apt-get update
# apt-get install -y mssql-tools unixodbc-dev
"""
          
# Look up the SQL Server container by the name used in the `docker run` command above.
# Relies on `client` created in the PostgreSQL cell.
sqlserver_container = client.containers.get('my-sqlserver')
# Host is the container's own address (the NetworkSettings "IPAddress" field).
sqlserver_container_ip = sqlserver_container.attrs['NetworkSettings']['IPAddress']
print(sqlserver_container_ip)

# Connection settings handed to the SQL Server validator as `db_creds`;
# the password matches MSSQL_SA_PASSWORD from the setup command.
sqlserver_db_creds = {
    "host": sqlserver_container_ip,
    "port": 1433, # default port for sql server,
    "user": "sa",
    "password": "myRoot(!)Password",
}

172.17.0.4


In [100]:
"""
Have a BigQuery emulator running in a Docker container. The official BigQuery image requires authentication 
to Google Cloud and would actually interact with BigQuery. In command line, run the following commands:

$ docker pull ghcr.io/goccy/bigquery-emulator:latest
$ docker run -it -p 9050:9050 ghcr.io/goccy/bigquery-emulator:latest --project=test-project

Note: if running the same SQL queries again, kill the container and start a fresh one because 
the deleting dataset functionality was not working as expected.
"""

# Settings for the BigQuery emulator started above (port and project match the
# `docker run` flags).
# NOTE: the name is spelled `biquery_db_creds` (sic); check_query_against_schema
# reads it under exactly this spelling, so rename both places together.
biquery_db_creds = {
     "port": 9050,
     "project": "test-project",
}

In [101]:
#  Apply different SQL validators to the SQL queries
def is_valid_query_and_schema(row, func):
    """Validate a row's query and schema independently with one validator.

    Args:
        row: Mapping-like record providing the 'SQL Query' and 'Context' entries.
        func: Callable taking a SQL string and returning an indexable result
            whose item 0 is the validity flag and item 1 the error detail.

    Returns:
        Tuple ``(is_valid_sql, is_valid_schema, is_valid_query, error_messages)``
        where ``is_valid_sql`` requires both parts to be valid and
        ``error_messages`` concatenates the schema error, then the query error
        (empty string when both are valid).
    """
    # The query is validated before the schema, matching the call order callers get today.
    query_outcome = func(row['SQL Query'])
    schema_outcome = func(row['Context'])
    schema_ok, query_ok = schema_outcome[0], query_outcome[0]

    # The error detail (item 1) is only read for the part that failed.
    problems = []
    if not schema_ok:
        problems.append(f"***Schema error: {schema_outcome[1]}")
    if not query_ok:
        problems.append(f"***Query error: {query_outcome[1]}")

    return schema_ok and query_ok, schema_ok, query_ok, ''.join(problems)

def is_valid_query_and_schema_with_sqlfluff(row):
    """Validate a row's query and schema with sqlfluff, using the row's dialect.

    The row's 'Dialect' label is translated to a sqlfluff dialect name: any
    label containing 'Oracle' maps to 'oracle', the known labels map to their
    sqlfluff counterpart, and everything else falls back to 'ansi'.

    Args:
        row: Mapping-like record providing 'Dialect', 'SQL Query' and 'Context'.

    Returns:
        Tuple ``(is_valid_sql, is_valid_schema, is_valid_query, error_messages)``
        with the same layout as ``is_valid_query_and_schema``.
    """
    declared_dialect = row['Dialect']
    if 'Oracle' in declared_dialect:
        # Covers every Oracle spelling in the data, including plain 'Oracle',
        # so no separate 'Oracle' entry is needed in the lookup below.
        dialect = 'oracle'
    else:
        dialect = {
            'SQLite': 'sqlite',
            'PostgreSQL': 'postgres',
            'MySQL': 'mysql',
            'SQL Server': 'tsql',
            'GoogleSQL': 'bigquery',
        }.get(declared_dialect, 'ansi')

    # Query first, then schema — same order as the generic helper.
    query_outcome = SimpleSqlValidator.is_valid_sql_with_sqlfluff(row['SQL Query'], dialect)
    schema_outcome = SimpleSqlValidator.is_valid_sql_with_sqlfluff(row['Context'], dialect)
    schema_ok, query_ok = schema_outcome[0], query_outcome[0]

    problems = []
    if not schema_ok:
        problems.append(f"***Schema error: {schema_outcome[1]}")
    if not query_ok:
        problems.append(f"***Query error: {query_outcome[1]}")

    return schema_ok and query_ok, schema_ok, query_ok, ''.join(problems)

def check_query_and_schema_separately(sql_queries, method):
    """Run one static validator over every row and store the verdicts as columns.

    Args:
        sql_queries: DataFrame with the columns the chosen validator reads
            ('SQL Query', 'Context', and 'Dialect' for sqlfluff). It is
            modified in place.
        method: One of 'sqlglot', 'sqlquery' or 'sqlfluff'; any other value
            raises KeyError.

    Returns:
        The same DataFrame, with ``is_valid_sql_with_<method>``,
        ``is_valid_schema_with_<method>``, ``is_valid_query_with_<method>`` and
        ``error_msgs_<method>`` added or overwritten. The elapsed wall-clock
        time is printed.
    """
    started_at = time.time()
    validators_by_method = {
        'sqlglot': partial(is_valid_query_and_schema, func=SimpleSqlValidator.is_valid_sql_with_sqlglot),
        'sqlquery': partial(is_valid_query_and_schema, func=SimpleSqlValidator.is_valid_sql_with_sqlquery),
        'sqlfluff': is_valid_query_and_schema_with_sqlfluff,
    }
    row_validator = validators_by_method[method]

    # One 4-item list per row: [is_valid_sql, is_valid_schema, is_valid_query, errors].
    outcomes = sql_queries.apply(row_validator, axis=1).apply(list)

    # Column name prefixes, in the same order as the items of each outcome.
    column_prefixes = ['is_valid_sql_with_', 'is_valid_schema_with_', 'is_valid_query_with_', 'error_msgs_']
    for position, prefix in enumerate(column_prefixes):
        # `position=position` freezes the index for this iteration's lambda.
        sql_queries[f'{prefix}{method}'] = outcomes.apply(lambda parts, position=position: parts[position])

    elapsed_time = time.time() - started_at
    print(f"{method} check executed in {elapsed_time:.2f} seconds")

    return sql_queries


def check_query_against_schema(row, dialect):
    """Check a row's query against its schema with the validator for ``dialect``.

    Rows whose 'Dialect' differs from ``dialect`` are not validated and get
    ``None`` in both output fields.

    Args:
        row: Record (a DataFrame row when used through ``DataFrame.apply``)
            providing 'Dialect', 'SQL Query' and 'Context', plus 'Topic'
            (PostgreSQL / MySQL / SQL Server) or 'db_name' (GoogleSQL) for the
            backend being checked. It is modified in place.
        dialect: One of 'SQLite', 'PostgreSQL', 'MySQL', 'SQL Server',
            'GoogleSQL'. A matching row with any other dialect raises KeyError.

    Returns:
        The same ``row`` with ``is_valid_<dialect_name>`` and
        ``error_msg_<dialect_name>`` set, where ``dialect_name`` is ``dialect``
        lower-cased with spaces removed. The values are items 0 and 1 of what
        the validator's ``is_valid_sql`` returns.
    """
    # Each backend is described by a zero-argument callable so that its
    # validator class, connection settings and container handle are looked up
    # only when a row actually needs that backend. Previously all of them were
    # evaluated for every row, so e.g. a SQLite-only run failed with NameError
    # unless the MySQL and SQL Server containers had been set up as well.
    backends = {
        'SQLite': lambda: (SqliteValidator, {}),
        'PostgreSQL': lambda: (PostgresqlValidator, {
            'domain': row['Topic'],
            'db_creds': postgres_db_creds,
        }),
        'MySQL': lambda: (MysqlValidator, {
            'domain': row['Topic'],
            'db_creds': mysql_db_creds,
            'mysql_container': mysql_container,
        }),
        'SQL Server': lambda: (SqlserverValidator, {
            'domain': row['Topic'],
            'db_creds': sqlserver_db_creds,
            'sqlserver_container': sqlserver_container,
        }),
        # GoogleSQL uses the per-row unique 'db_name' rather than 'Topic'.
        'GoogleSQL': lambda: (GooglesqlValidator, {
            'domain': row['db_name'],
            'db_creds': biquery_db_creds,
        }),
    }

    dialect_name = dialect.lower().replace(' ', '')

    if row['Dialect'] == dialect:
        validator_class, backend_kwargs = backends[dialect]()
        result = validator_class.is_valid_sql(
            row['SQL Query'], row['Context'], **backend_kwargs
        )
    else:
        # Row belongs to another dialect: leave both output fields empty.
        result = None, None

    row[f'is_valid_{dialect_name}'] = result[0]
    row[f'error_msg_{dialect_name}'] = result[1]

    return row

def apply_check_query_against_schema(sql_queries, dialect):
    """Apply ``check_query_against_schema`` to every row and report the duration.

    Args:
        sql_queries: DataFrame of queries to validate.
        dialect: Dialect label forwarded to ``check_query_against_schema``.

    Returns:
        The DataFrame produced by the row-wise apply. The elapsed wall-clock
        time is printed.
    """
    started_at = time.time()
    checked = sql_queries.apply(check_query_against_schema, axis=1, dialect=dialect)
    elapsed_time = time.time() - started_at
    print(f"{dialect} check executed in {elapsed_time:.2f} seconds")
    return checked


In [117]:
# Enable the checks to (re)compute by uncommenting them; each adds its own
# result columns to `sql_queries`.
sql_queries = check_query_and_schema_separately(sql_queries, 'sqlfluff')
# sql_queries = check_query_and_schema_separately(sql_queries, 'sqlglot')
# sql_queries = check_query_and_schema_separately(sql_queries, 'sqlquery')

# sql_queries = apply_check_query_against_schema(sql_queries, 'SQLite')
# sql_queries = apply_check_query_against_schema(sql_queries, 'PostgreSQL')
# sql_queries = apply_check_query_against_schema(sql_queries, 'MySQL')
# sql_queries = apply_check_query_against_schema(sql_queries, 'SQL Server')
# sql_queries = apply_check_query_against_schema(sql_queries, 'GoogleSQL')

# Summarize every check whose result column is present. Columns of disabled
# checks only exist if they were computed earlier in the session, so they are
# reported as missing instead of raising AttributeError on a fresh kernel.
validity_columns = [
    'is_valid_sql_with_sqlglot',
    'is_valid_sql_with_sqlquery',
    'is_valid_sql_with_sqlfluff',
    'is_valid_sqlite',
    'is_valid_postgresql',
    'is_valid_mysql',
    'is_valid_sqlserver',
    'is_valid_googlesql',
]
for validity_column in validity_columns:
    if validity_column in sql_queries.columns:
        print(sql_queries[validity_column].value_counts(normalize=True))
    else:
        print(f"{validity_column}: not computed (enable the matching check above)")


sqlfluff check executed in 250.23 seconds
is_valid_sql_with_sqlglot
True     0.970489
False    0.029511
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True     0.994098
False    0.005902
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.956155
False    0.043845
Name: proportion, dtype: float64
is_valid_sqlite
True     0.980861
False    0.019139
Name: proportion, dtype: float64
is_valid_postgresql
True     0.877273
False    0.122727
Name: proportion, dtype: float64
is_valid_mysql
True     0.938776
False    0.061224
Name: proportion, dtype: float64
is_valid_sqlserver
True     0.83913
False    0.16087
Name: proportion, dtype: float64
is_valid_googlesql
True     0.679426
False    0.320574
Name: proportion, dtype: float64


In [105]:
def get_googlesql_error_categories(error_msg):
    """Map a GoogleSQL error message to the first known category it mentions.

    Args:
        error_msg: Error text, or a missing value (None, NaN, empty string).

    Returns:
        The first entry of the category list that occurs in ``error_msg``
        (case-insensitive substring match, checked in list order), or None
        when the message is missing, is not a string, or matches no category.
    """
    # Missing messages may arrive as None or as float NaN (e.g. after a CSV
    # round trip); NaN is truthy, so an emptiness check alone would let it
    # through to `.lower()` and raise AttributeError.
    if not isinstance(error_msg, str) or not error_msg:
        return None
    googlesql_error_categories = ['Type not found', 'Syntax error', 'Foreign keys are not supported', 'does not support']
    lowered_msg = error_msg.lower()
    for category in googlesql_error_categories:
        if category.lower() in lowered_msg:
            return category
    # No known category matched.
    return None


# Label every GoogleSQL error message with its category (None when unmatched or absent).
sql_queries['googlesql_error_category'] = sql_queries['error_msg_googlesql'].apply(get_googlesql_error_categories)

# Failed GoogleSQL rows that none of the categories explain.
failed_googlesql = sql_queries['is_valid_googlesql'].eq(False)
uncategorized = sql_queries['googlesql_error_category'].isna()
remaining = sql_queries.loc[failed_googlesql & uncategorized, ['SQL Query', 'Context', 'error_msg_googlesql']]

print(sql_queries['googlesql_error_category'].value_counts())
print(remaining.count())
# remaining.head()

googlesql_error_category
Type not found                    46
Foreign keys are not supported     9
does not support                   5
Syntax error                       2
Name: count, dtype: int64
SQL Query              5
Context                5
error_msg_googlesql    5
dtype: int64


In [103]:
# Write the frame, including the validation columns added so far, to CSV
# without the index column.
sql_queries.to_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/sqlqueries_1200_validated_092524.csv', index=False)

In [106]:
# Dialects that have a live-engine validator; reused by the cells below.
dialects = ['SQLite', 'PostgreSQL', 'MySQL', 'SQL Server', 'GoogleSQL']
for dialect in dialects:
    print(f"\n***{dialect}***")
    # Filter once per dialect, then report each static validator's pass rate.
    dialect_rows = sql_queries[sql_queries['Dialect'] == dialect]
    for validity_column in ('is_valid_sql_with_sqlglot', 'is_valid_sql_with_sqlquery', 'is_valid_sql_with_sqlfluff'):
        print(dialect_rows[validity_column].value_counts(normalize=True))


***SQLite***
is_valid_sql_with_sqlglot
True     0.985646
False    0.014354
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True    1.0
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.985646
False    0.014354
Name: proportion, dtype: float64

***PostgreSQL***
is_valid_sql_with_sqlglot
True     0.986364
False    0.013636
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True     0.990909
False    0.009091
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.918182
False    0.081818
Name: proportion, dtype: float64

***MySQL***
is_valid_sql_with_sqlglot
True     0.994898
False    0.005102
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True     0.989796
False    0.010204
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.969388
False    0.030612
Name: proportion, dtype: float64

***SQL Server***
is_valid_sql_with_sqlglot
True     0.886957
False    0.113043
Name: proportion, dtype: float64
is_va

In [16]:
methods = ['sqlglot', 'sqlquery', 'sqlfluff']
for dialect in dialects:
    print(f"\n***{dialect}***")
    dialect_rows = sql_queries[sql_queries['Dialect'] == dialect]
    for method in methods:
        print(f"***{method}***")
        # Overall verdict first, then the schema-only and query-only verdicts.
        for scope in ('sql', 'schema', 'query'):
            print(dialect_rows[f'is_valid_{scope}_with_{method}'].value_counts(normalize=True))


***SQLite***
***sqlglot***
is_valid_sql_with_sqlglot
True     0.990431
False    0.009569
Name: proportion, dtype: float64
is_valid_schema_with_sqlglot
True     0.995215
False    0.004785
Name: proportion, dtype: float64
is_valid_query_with_sqlglot
True     0.995215
False    0.004785
Name: proportion, dtype: float64
***sqlquery***
is_valid_sql_with_sqlquery
True    1.0
Name: proportion, dtype: float64
is_valid_schema_with_sqlquery
True    1.0
Name: proportion, dtype: float64
is_valid_query_with_sqlquery
True    1.0
Name: proportion, dtype: float64
***sqlfluff***
is_valid_sql_with_sqlfluff
True     0.990431
False    0.009569
Name: proportion, dtype: float64
is_valid_schema_with_sqlfluff
True     0.995215
False    0.004785
Name: proportion, dtype: float64
is_valid_query_with_sqlfluff
True     0.995215
False    0.004785
Name: proportion, dtype: float64
***PostgreSQL***
***sqlglot***
is_valid_sql_with_sqlglot
True     0.986425
False    0.013575
Name: proportion, dtype: float64
is_valid_sche

In [38]:
# Check if the query is valid with both sqlglot and sqlfluff
# SQLQuery is proven to be useless so not counting it in the aggregate
# A row passes the aggregate check only if both sqlglot and sqlfluff accept it.
# sqlquery is left out of the aggregate on purpose (see the note above this cell's code).
aggregate_inputs = ['is_valid_sql_with_sqlglot', 'is_valid_sql_with_sqlfluff']
sql_queries['is_valid_sql_aggregate'] = sql_queries[aggregate_inputs].all(axis=1)
for dialect in dialects:
    print(f"***{dialect}***")
    in_dialect = sql_queries['Dialect'] == dialect
    print(sql_queries.loc[in_dialect, 'is_valid_sql_aggregate'].value_counts(normalize=True))

***SQLite***
is_valid_sql_aggregate
True     0.990431
False    0.009569
Name: proportion, dtype: float64
***PostgreSQL***
is_valid_sql_aggregate
True     0.914027
False    0.085973
Name: proportion, dtype: float64
***MySQL***
is_valid_sql_aggregate
True     0.969543
False    0.030457
Name: proportion, dtype: float64
***SQL Server***
is_valid_sql_aggregate
True     0.882609
False    0.117391
Name: proportion, dtype: float64
***GoogleSQL***
is_valid_sql_aggregate
True     0.823529
False    0.176471
Name: proportion, dtype: float64


In [118]:
# What are the differences between checking against schema and validating the query separately from schema?
for dialect in dialects:
    print(f"\n***{dialect}***")
    # Column suffix used by the live-engine checks, e.g. 'SQL Server' -> 'sqlserver'.
    dialect_name = dialect.lower().replace(' ', '')
    df = sql_queries.loc[sql_queries['Dialect'].eq(dialect)]
    # Rows: live-engine verdict; columns: sqlfluff verdict.
    agreement = pd.crosstab(
        index=df[f'is_valid_{dialect_name}'],
        columns=df['is_valid_sql_with_sqlfluff'],
    )
    print(agreement)


***SQLite***
is_valid_sql_with_sqlfluff  False  True 
is_valid_sqlite                         
False                           2      2
True                            1    204

***PostgreSQL***
is_valid_sql_with_sqlfluff  False  True 
is_valid_postgresql                     
False                          18      9
True                            0    193

***MySQL***
is_valid_sql_with_sqlfluff  False  True 
is_valid_mysql                          
False                           6      6
True                            0    184

***SQL Server***
is_valid_sql_with_sqlfluff  False  True 
is_valid_sqlserver                      
False                           1     36
True                            0    193

***GoogleSQL***
is_valid_sql_with_sqlfluff  False  True 
is_valid_googlesql                      
False                          14     53
True                            9    133


In [115]:
dialect = dialects[4]
dialect_name = dialect.lower().replace(' ', '')
print(dialect)
# Rows of this dialect that sqlfluff rejects although the live engine accepts them.
sqlfluff_rejected = sql_queries['is_valid_sql_with_sqlfluff'].eq(False)
engine_accepted = sql_queries[f'is_valid_{dialect_name}'].eq(True)
df = sql_queries[sql_queries['Dialect'].eq(dialect) & (sqlfluff_rejected & engine_accepted)]
inspected_columns = ['SQL Query', 'Context', f'error_msg_{dialect_name}', 'error_msgs_sqlfluff', 'error_msgs_sqlquery', 'error_msgs_sqlglot']
df[inspected_columns].head()

GoogleSQL


Unnamed: 0,SQL Query,Context,error_msg_googlesql,error_msgs_sqlfluff,error_msgs_sqlquery,error_msgs_sqlglot
1033,"SELECT PolicyID, PolicyName, Description FROM PlatformPolicies WHERE LastUpdated > '2022-01-01';","CREATE TABLE PlatformPolicies ( PolicyID STRING NOT NULL, PolicyName STRING NOT NULL, Description STRING, LastUpdated DATE, PRIMARY KEY (PolicyID) );",,"***Schema error: PRS: Line 1, Position 31: Found unparsable section: '( PolicyID STRING NOT NULL, PolicyNa...'",,
1044,SELECT * FROM Security_Breaches WHERE EXTRACT(YEAR FROM reported_date) = EXTRACT(YEAR FROM CURRENT_DATE()) - 1;,"CREATE TABLE Security_Breaches ( breach_id STRING NOT NULL, description STRING, reported_date DATE, affected_customers INT64, PRIMARY KEY(breach_id) ); CREATE TABLE Telecommunications_Companies ( company_id STRING NOT NULL, company_name STRING, headquarters STRING, PRIMARY KEY(company_id) );",,"***Schema error: PRS: Line 1, Position 32: Found unparsable section: '( breach_id STRING NOT NULL, des...'",,
1074,"SELECT plant_name, capacity_mw FROM RenewableEnergyPlants;","CREATE TABLE RenewableEnergyPlants ( plant_id STRING NOT NULL, plant_name STRING, capacity_mw FLOAT, PRIMARY KEY(plant_id) );",,"***Schema error: PRS: Line 1, Position 36: Found unparsable section: '( plant_id STRING NOT NULL, plan...'",,
1094,SELECT MAX(close) AS highest_closing_price FROM stocks WHERE ticker = 'AAPL' AND EXTRACT(YEAR FROM date) = 2022;,"CREATE TABLE stocks ( ticker STRING NOT NULL, date DATE NOT NULL, open FLOAT64, high FLOAT64, low FLOAT64, close FLOAT64, volume INT64, PRIMARY KEY (ticker, date) ); CREATE TABLE companies ( ticker STRING NOT NULL, name STRING NOT NULL, sector STRING, industry STRING, PRIMARY KEY (ticker) );",,"***Schema error: PRS: Line 1, Position 21: Found unparsable section: '( ticker STRING NOT NULL, date DATE ...'",,
1097,SELECT COUNT(*) FROM BMI_View WHERE BMI > 25;,"CREATE TABLE Patients ( patient_id STRING, name STRING, age INT64, height FLOAT64, weight FLOAT64, PRIMARY KEY(patient_id) ); CREATE VIEW BMI_View AS SELECT patient_id, weight / (height * height) AS BMI FROM Patients;",,"***Schema error: PRS: Line 1, Position 23: Found unparsable section: '( patient_id STRING, name STRING, ag...'",,


In [49]:
# Print the schema text stored under index label 5 (`.loc` selects by label).
# NOTE(review): the recorded output lists two rows labelled 5, so it presumably
# comes from a run where the index was not unique — confirm before relying on it.
print(sql_queries['Context'].loc[5])

5    CREATE TABLE Properties (PropertyID INT PRIMARY KEY, Address NVARCHAR(255), OwnerID INT); CREATE TABLE Rentals (RentalID INT PRIMARY KEY, PropertyID INT FOREIGN KEY REFERENCES Properties(PropertyID), TenantID INT, RentAmount DECIMAL(10,2), RentDate DATE); CREATE TABLE Owners (OwnerID INT PRIMARY KEY, OwnerName NVARCHAR(255));
5                                                                                                                                                                                                                       CREATE TABLE Machines (\n  machine_id SERIAL PRIMARY KEY,\n  machine_name VARCHAR(255),\n  last_active_date DATE\n);
Name: Context, dtype: object


# Python Code Validation

In [74]:
# Registry of the Python checkers to run: method name -> checker function.
# check_python_code_with_method looks methods up here, and the driver cell
# iterates over these keys; comment entries in or out to choose what runs.
# Several names share one function (the ruff variants) and are told apart by
# the keyword arguments passed at call time.
python_check_methods = {
    # 'compile': is_valid_python_with_complie,
    # 'ast': is_valid_python_with_ast,
    # 'pyflakes': is_valid_python_with_pyflakes,
    # 'parso': is_valid_python_with_parso,
    # 'mypy': is_valid_python_with_mypy,
    'ruff': is_valid_python_with_ruff,
    # 'ruff_extensive': is_valid_python_with_ruff,
    # 'ruff_pyflakes': is_valid_python_with_ruff,
    # 'pylint': is_valid_python_with_pylint,
}

def check_python_code_with_method(df, method='compile', **kwargs):
    """Run one registered Python checker over the 'code' column.

    Args:
        df: DataFrame with a 'code' column; it is modified in place.
        method: Key into ``python_check_methods``. A name that is not
            registered raises KeyError.
        **kwargs: Extra keyword arguments forwarded to the checker on every call.

    Returns:
        The same DataFrame with ``check_<method>`` (the raw checker result),
        ``is_valid_python_with_<method>`` (item 0 of that result) and
        ``<method>_error`` (item 1) added or overwritten. The elapsed
        wall-clock time is printed.
    """
    started_at = time.time()

    checker = python_check_methods[method]
    outcomes = df['code'].apply(checker, **kwargs)
    df[f'check_{method}'] = outcomes
    df[f'is_valid_python_with_{method}'] = outcomes.apply(lambda outcome: outcome[0])
    df[f'{method}_error'] = outcomes.apply(lambda outcome: outcome[1])

    elapsed_time = time.time() - started_at
    print(f"\n{method} check executed in {elapsed_time:.2f} seconds")

    return df

In [75]:

# Run every enabled checker with its default arguments.
for method in python_check_methods:
    python_codes = check_python_code_with_method(python_codes, method)

# Variants that need extra arguments are run explicitly:
# python_codes = check_python_code_with_method(python_codes, 'ruff_extensive', level='warning')
# ruff_pyflakes_args = {
#     'level': 'custom',
#     'ruff_rules': ['F'],
# }
# python_codes = check_python_code_with_method(python_codes, 'ruff_pyflakes', **ruff_pyflakes_args)

# Pass/fail counts per enabled checker.
for method in python_check_methods:
    validity_column = f'is_valid_python_with_{method}'
    print(python_codes[validity_column].value_counts())


ruff check executed in 169.22 seconds
is_valid_python_with_ruff
True     927
False     73
Name: count, dtype: int64


In [76]:
# Unpack the pylint report (stored in `pylint_error`) into one column per field;
# rows without a report get None.
if 'pylint_error' in python_codes.columns:
    for report_field in ('score', 'severity', 'messages'):
        # `report_field=report_field` freezes the field for this iteration's lambda.
        python_codes[f'pylint_{report_field}'] = python_codes['pylint_error'].apply(
            lambda report, report_field=report_field: report[report_field] if report else None
        )

# The summary needs the pylint columns; they are absent whenever the pylint
# check has not been run, in which case the prints used to raise AttributeError.
summary_columns = ['pylint_severity', 'pylint_score', 'is_valid_python_with_ruff', 'is_valid_python_with_pylint']
missing_columns = [column for column in summary_columns if column not in python_codes.columns]
if missing_columns:
    print(f"Skipping pylint summary; missing columns: {missing_columns}")
else:
    print(python_codes.pylint_severity.value_counts())
    print(python_codes.groupby('pylint_severity')['pylint_score'].mean())

    print(pd.crosstab(python_codes['is_valid_python_with_ruff'], python_codes['is_valid_python_with_pylint']))
    

pylint_severity
refactor      238
convention     98
error          91
Name: count, dtype: int64
pylint_severity
convention    5.140116
error         1.634140
refactor      5.155952
Name: pylint_score, dtype: float64
is_valid_python_with_pylint  False  True 
is_valid_python_with_ruff                
False                           73      0
True                            18    909


In [71]:
# Print the generated program stored under index label 648 as plain text,
# so its newlines render instead of showing up as '\n'.
print(python_codes['code'].loc[648])

import pandas as pd
from datetime import datetime, timedelta

def calculate_launch_window(input_csv, output_csv):
    # Read the input CSV file
    data = pd.read_csv(input_csv)

    # Define a function to calculate the time difference between celestial bodies
    def time_difference(ra1, dec1, ra2, dec2):
        # Calculate the time difference based on the positions of the celestial bodies
        # This is a simplified version, actual calculations would require more complex formulas
        diff = abs(ra1 - ra2) + abs(dec1 - dec2)
        return timedelta(hours=diff)

    # Calculate the time differences between the object and each celestial body
    celestial_bodies = ['Moon', 'Sun', 'Jupiter', 'Saturn']  # Add more celestial bodies as needed
    for body in celestial_bodies:
        data[f'Time to {body}'] = data.apply(lambda row: time_difference(row['Right Ascension'], row['Declination'], body['Right Ascension'], body['Declination']), axis=1)

    # Calculate the optimal launch w

In [61]:
# method = 'pylint'
# python_codes[python_codes[f'is_valid_python_with_{method}'] == False][['code', 
# 'compile_error', '' f'{method}_error']].head(10)
# python_codes[python_codes['is_valid_python_with_pylint'] == False][['code', 'pyflakes_error', 'ruff_error', 'pylint_score', 'pylint_severity', 'pylint_messages']].head(10)

In [72]:
# Programs that ruff flags but pylint accepts, with both tools' findings side by side.
ruff_rejected = python_codes['is_valid_python_with_ruff'].eq(False)
pylint_accepted = python_codes['is_valid_python_with_pylint'].eq(True)
python_codes.loc[ruff_rejected & pylint_accepted, ['code', 'ruff_error', 'pylint_messages']].head(20)

Unnamed: 0,code,ruff_error,pylint_messages
27,"import asyncio\nimport websockets\nimport json\nimport numpy as np\n\n# Server component\nasync def handle_connection(websocket, path):\n async for message in websocket:\n data = json.loads(message)\n # Process data and update shared state\n # Broadcast updates to connected clients\n\n# Client component\nasync def draw(websocket, path):\n # Initialize canvas\n canvas = np.zeros((800, 800, 3), dtype=np.uint8)\n # Listen for updates from the server\n async for message in websocket:\n data = json.loads(message)\n # Update canvas based on received data\n # Display updated canvas\n\n# Run server and client\nstart_server = websockets.serve(handle_connection, ""localhost"", 8765)\nasyncio.get_event_loop().run_until_complete(start_server)\nasyncio.get_event_loop().run_forever()","['F841', 'F841', 'F841'] ['Local variable `data` is assigned to but never used', 'Local variable `canvas` is assigned to but never used', 'Local variable `data` is assigned to but never used']","[{'type': 'warning', 'symbol': 'unused-argument', 'message': 'Unused argument 'path''}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'data''}, {'type': 'warning', 'symbol': 'unused-argument', 'message': 'Unused argument 'path''}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'canvas''}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'data''}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}, {'type': 'convention', 'symbol': 'wrong-import-order', 'message': 'standard import ""json"" should be placed before third party import..."
62,"import pandas as pd\nimport numpy as np\n\nclass InsuranceUnderwriting:\n def __init__(self, age, health_conditions, lifestyle_habits, claim_data):\n self.age = age\n self.health_conditions = health_conditions\n self.lifestyle_habits = lifestyle_habits\n self.claim_data = claim_data\n\n if not all([isinstance(i, (int, float)) for i in [age, health_conditions, lifestyle_habits]]):\n raise ValueError('Invalid input. Age, health conditions, and lifestyle habits must be numerical.')\n\n if not isinstance(claim_data, pd.DataFrame):\n raise ValueError('Invalid input. Claim data must be a pandas DataFrame.')\n\n def calculate_risk_score(self):\n # Here you would implement your risk scoring algorithm, which is not provided in this example\n risk_score = np.random.uniform(0, 10)\n return risk_score\n\n def determine_premium(self):\n risk_score = self.calculate_risk_score()\n # Here yo...",['F841'] ['Local variable `risk_score` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'redefined-outer-name', 'message': 'Redefining name 'premium' from outer scope (line 31)'}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'risk_score''}, {'type': 'warning', 'symbol': 'broad-exception-caught', 'message': 'Catching too general exception Exception'}, {'type': 'convention', 'symbol': 'line-too-long', 'message': 'Line too long (102/100)'}, {'type': 'convention', 'symbol': 'line-too-long', 'message': 'Line too long (110/100)'}, {'type': 'convention', 'symbol': 'line-too-long', 'message': 'Line too long (101/100)'}, {'type': 'convention', 'symbol': 'line-too-long', 'message': 'Line too long (134/100)'}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-class-docstring', 'message': 'Missing class docstring'}, {'type': 'convent..."
69,"import threading\nimport queue\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport scipy.stats as stats\n\n# Define a worker function to process data streams\ndef worker(data_queue):\n while True:\n data = data_queue.get()\n # Perform analytics on the data\n # ...\n # Update network settings based on analytics results\n # ...\n data_queue.task_done()\n\n# Create a queue to hold data streams\ndata_queue = queue.Queue()\n\n# Create and start multiple worker threads\nfor i in range(4):\n t = threading.Thread(target=worker, args=(data_queue,))\n t.start()\n\n# Generate and process synthetic data\nfor _ in range(100):\n data = pd.DataFrame({'signal_strength': stats.norm.rvs(size=1000),\n 'latency': stats.uniform.rvs(size=1000),\n 'packet_loss': stats.binom.rvs(100, 0.05, size=1000)})\n data_queue.put(data)\n\n# Wait for all tasks in the queue to be processed\ndata_queue.joi...",['F841'] ['Local variable `data` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'redefined-outer-name', 'message': 'Redefining name 'data_queue' from outer scope (line 18)'}, {'type': 'warning', 'symbol': 'redefined-outer-name', 'message': 'Redefining name 'data' from outer scope (line 27)'}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'data''}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}, {'type': 'refactor', 'symbol': 'consider-using-from-import', 'message': 'Use 'from scipy import stats' instead'}]"
131,"import pandas as pd\n\ndef vehicle_diagnostics(vehicle_data, log_file):\n mileage = vehicle_data[""mileage""]\n engine_temperature = vehicle_data[""engine_temperature""]\n oil_level = vehicle_data[""oil_level""]\n battery_voltage = vehicle_data[""battery_voltage""]\n\n df = pd.read_csv(log_file, header=None)\n df.columns = [""timestamp"", ""mileage"", ""engine_temperature"", ""oil_level"", ""battery_voltage""]\n average_engine_temperature = df[""engine_temperature""].tail(5).mean()\n\n diagnostic_report = f""""""\n Diagnostic Report:\n Mileage: {mileage}\n Average Engine Temperature (last 5 records): {average_engine_temperature}\n Oil Level: {oil_level}\n Battery Voltage: {battery_voltage}\n """"""\n\n with open(log_file, ""a"") as file:\n file.write(diagnostic_report)\n\n return diagnostic_report",['F841'] ['Local variable `engine_temperature` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'unspecified-encoding', 'message': 'Using open without explicitly specifying an encoding'}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'engine_temperature''}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}]"
150,"import concurrent.futures\nimport subprocess\nimport pandas as pd\nimport scikit_learn as sklearn\n\n# Define a function to perform a vulnerability scan on a given network asset\ndef scan_vulnerabilities(asset):\n # Run the vulnerability scanning tool and capture the output\n output = subprocess.check_output(['vuln_scanner', asset])\n # Parse the output and return a list of vulnerabilities\n vulnerabilities = parse_output(output)\n return vulnerabilities\n\n# Define a function to parse the output of the vulnerability scanner\ndef parse_output(output):\n # Implement parsing logic here\n vulnerabilities = []\n # Add parsed vulnerabilities to the list\n return vulnerabilities\n\n# Define a function to generate code for handling a specific vulnerability type\ndef generate_remediation_code(vulnerability):\n # Implement code generation logic here\n remediation_code = """"\n return remediation_code\n\n# Define a function to prioritize vulnerabilities base...",['F841'] ['Local variable `remediation_code` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'unused-argument', 'message': 'Unused argument 'output''}, {'type': 'warning', 'symbol': 'unused-argument', 'message': 'Unused argument 'vulnerability''}, {'type': 'warning', 'symbol': 'unused-argument', 'message': 'Unused argument 'vulnerabilities''}, {'type': 'warning', 'symbol': 'unused-argument', 'message': 'Unused argument 'prioritized_vulnerabilities''}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'remediation_code''}, {'type': 'warning', 'symbol': 'unused-import', 'message': 'Unused pandas imported as pd'}, {'type': 'warning', 'symbol': 'unused-import', 'message': 'Unused scikit_learn imported as sklearn'}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 
'message': 'Missing function or method docstring'},..."
169,"import numpy as np\nimport pandas as pd\nfrom datetime import datetime\n\nclass Transaction:\n def __init__(self, timestamp, amount, account_id):\n if not isinstance(timestamp, datetime) or not isinstance(amount, (int, float)) or not isinstance(account_id, str):\n raise ValueError(""Invalid transaction data"")\n self.timestamp = timestamp\n self.amount = amount\n self.account_id = account_id\n\nclass FraudDetection:\n def __init__(self, transactions):\n if not all(isinstance(t, Transaction) for t in transactions):\n raise ValueError(""Invalid list of transactions"")\n self.transactions = transactions\n\n def calculate_average_amount(self):\n return np.mean([t.amount for t in self.transactions])\n\n def identify_high_amount_transactions(self, threshold):\n return [t for t in self.transactions if t.amount > threshold]\n\n def flag_suspicious_patterns(self):\n # For simplicity, let's flag t...",['F841'] ['Local variable `timestamps` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'redefined-outer-name', 'message': 'Redefining name 'transactions' from outer scope (line 36)'}, {'type': 'warning', 'symbol': 'redefined-outer-name', 'message': 'Redefining name 'suspicious_transactions' from outer scope (line 41)'}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'timestamps''}, {'type': 'warning', 'symbol': 'unused-import', 'message': 'Unused pandas imported as pd'}, {'type': 'convention', 'symbol': 'line-too-long', 'message': 'Line too long (122/100)'}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-class-docstring', 'message': 'Missing class docstring'}, {'type': 'convention', 'symbol': 'missing-class-docstring', 'message': 'Missing class docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', ..."
203,"import asyncio\nimport pandas as pd\nfrom concurrent.futures import ThreadPoolExecutor\n\n# Simulated supplier data feeds\nsupplier_data_feeds = {\n 'Supplier1': 'data1.csv',\n 'Supplier2': 'data2.csv',\n 'Supplier3': 'data3.csv'\n}\n\n# Function to process data from a supplier\ndef process_supplier_data(supplier, file):\n data = pd.read_csv(file)\n # Process data (e.g., update inventory database)\n # ...\n return data\n\n# Function to handle real-time updates from suppliers\nasync def handle_supplier_updates():\n with ThreadPoolExecutor(max_workers=3) as executor:\n tasks = {executor.submit(process_supplier_data, supplier, file): supplier for supplier, file in supplier_data_feeds.items()}\n for future in asyncio.as_completed(tasks):\n supplier = tasks[future]\n try:\n data = await future\n # Process data (e.g., update inventory database)\n # ...\n except Exception as...",['F841'] ['Local variable `data` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'unused-argument', 'message': 'Unused argument 'supplier''}, {'type': 'warning', 'symbol': 'broad-exception-caught', 'message': 'Catching too general exception Exception'}, {'type': 'warning', 'symbol': 'f-string-without-interpolation', 'message': 'Using an f-string that does not have any interpolated variables'}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'data''}, {'type': 'convention', 'symbol': 'line-too-long', 'message': 'Line too long (132/100)'}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}, {'type': 'convention', 'symbol': 'wrong-import-..."
247,"import getpass\n\ndef phish_simulation():\n print(""Welcome to the login page!"")\n username = input(""Username: "")\n password = getpass.getpass(""Password: "")\n print(""\nYou've just been phished! Please be aware."")\n\nphish_simulation()","['F841', 'F841'] ['Local variable `username` is assigned to but never used', 'Local variable `password` is assigned to but never used']","[{'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'username''}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'password''}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}]"
265,"import numpy as np\nfrom sklearn.base import BaseEstimator\nfrom sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\nfrom sklearn.externals import joblib\n\nclass MachineLearningModel(BaseEstimator):\n def __init__(self):\n pass\n\n def train(self, dataset, preprocessing_options, model_parameters):\n # Implement the training logic using the provided dataset, preprocessing options, and model parameters\n pass\n\n def predict(self, new_data):\n # Implement the prediction logic using the trained model and the new data\n pass\n\n def evaluate(self, actual_output, predicted_output):\n # Implement the evaluation logic using the actual output and predicted output\n accuracy = accuracy_score(actual_output, predicted_output)\n precision = precision_score(actual_output, predicted_output)\n recall = recall_score(actual_output, predicted_output)\n f1 = f1_score(actual_output, predicted...",['F841'] ['Local variable `self` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'broad-exception-caught', 'message': 'Catching too general exception Exception'}, {'type': 'warning', 'symbol': 'broad-exception-caught', 'message': 'Catching too general exception Exception'}, {'type': 'warning', 'symbol': 'self-cls-assignment', 'message': 'Invalid assignment to self in method'}, {'type': 'warning', 'symbol': 'unused-import', 'message': 'Unused numpy imported as np'}, {'type': 'convention', 'symbol': 'line-too-long', 'message': 'Line too long (110/100)'}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-class-docstring', 'message': 'Missing class docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Mi..."
357,"import concurrent.futures\nimport pandas as pd\nfrom sklearn.ensemble import IsolationForest\n\ndef process_log(log_file):\n # Load the log data\n data = pd.read_csv(log_file)\n\n # Use a machine learning model to identify anomalies\n model = IsolationForest(contamination=0.01)\n predictions = model.fit_predict(data)\n\n # Add the predictions to the dataframe\n data['anomaly'] = predictions\n\n # Return the dataframe\n return data\n\ndef analyze_logs(log_files):\n # Create a ThreadPoolExecutor\n with concurrent.futures.ThreadPoolExecutor() as executor:\n # Use the executor to apply the process_log function to each log file\n futures = {executor.submit(process_log, log_file): log_file for log_file in log_files}\n\n # Process the results as they complete\n for future in concurrent.futures.as_completed(futures):\n log_file = futures[future]\n try:\n data = future.result()\n exc...",['F841'] ['Local variable `data` is assigned to but never used'],"[{'type': 'warning', 'symbol': 'redefined-outer-name', 'message': 'Redefining name 'log_files' from outer scope (line 37)'}, {'type': 'warning', 'symbol': 'broad-exception-caught', 'message': 'Catching too general exception Exception'}, {'type': 'warning', 'symbol': 'f-string-without-interpolation', 'message': 'Using an f-string that does not have any interpolated variables'}, {'type': 'warning', 'symbol': 'unused-variable', 'message': 'Unused variable 'data''}, {'type': 'convention', 'symbol': 'missing-final-newline', 'message': 'Final newline missing'}, {'type': 'convention', 'symbol': 'missing-module-docstring', 'message': 'Missing module docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}, {'type': 'convention', 'symbol': 'missing-function-docstring', 'message': 'Missing function or method docstring'}]"


In [26]:

from typing import Optional


def get_error_category(error: str, error_categories: list) -> Optional[str]:
    """Return the first category whose text occurs in the stringified error.

    Args:
        error: The linter output for one snippet. It is converted with
            ``str()`` before matching, so non-string values are accepted.
            ``None`` and float NaN (the pandas missing-value marker) are
            treated as "no error".
        error_categories: Candidate substrings, checked in list order.

    Returns:
        The first entry of ``error_categories`` that is a substring of
        ``str(error)``, or ``None`` when the error is missing or nothing matches.
    """
    # NaN is the only float that is unequal to itself; without this guard a
    # missing value would be stringified to 'nan' and substring-matched.
    if error is None or (isinstance(error, float) and error != error):
        return None

    # Stringify once instead of once per category.
    error_text = str(error)
    for category in error_categories:
        if category in error_text:
            return category
    return None

# Label each pyflakes message with the first of these phrases it contains.
pyflakes_error_categories = ['undefined name', 'assigned to but never used', 'imported but unused']
python_codes['pyflakes_error_category'] = python_codes['pyflakes_error'].apply(
    lambda err: get_error_category(err, error_categories=pyflakes_error_categories)
)

# Snippets that fail to compile are labelled as syntax failures, overriding any phrase match above.
python_codes.loc[
    python_codes['is_valid_python_with_compile'].eq(False),
    'pyflakes_error_category',
] = 'Invalid Syntax'

# Whatever pyflakes rejected that still has no label goes into a catch-all bucket.
python_codes.loc[
    python_codes['is_valid_python_with_pyflakes'].eq(False)
    & python_codes['pyflakes_error_category'].isna(),
    'pyflakes_error_category',
] = 'Other'

python_codes['pyflakes_error_category'].value_counts()


pyflakes_error_category
imported but unused           226
undefined name                 53
assigned to but never used     29
Invalid Syntax                 24
Other                           1
Name: count, dtype: int64

In [27]:
# Each category is the literal text searched for in the stringified `ruff_error` value.
ruff_error_categories = ["{None}", "{'F821'}", "{'F822'}", "{'F823'}"]
python_codes['ruff_error_category'] = python_codes['ruff_error'].apply(
    lambda err: get_error_category(err, error_categories=ruff_error_categories)
)
python_codes['ruff_error_category'].value_counts()

ruff_error_category
{'F821'}    48
{None}      24
{'F823'}     1
Name: count, dtype: int64

In [28]:
# Rows that ruff marks valid while pyflakes categorises them as 'undefined name'.
python_codes.loc[
    python_codes['is_valid_python_with_ruff'].eq(True)
    & python_codes['pyflakes_error_category'].eq('undefined name'),
    ['code', 'ruff_error', 'pyflakes_error'],
].head()

Unnamed: 0,code,ruff_error,pyflakes_error
67,"import asyncio\nimport concurrent.futures\nfrom scapy.all import *\nimport pandas as pd\nfrom sklearn.ensemble import IsolationForest\n\n# Function to monitor network traffic\ndef monitor_traffic(node):\n packets = sniff(iface=node, count=100)\n return packets\n\n# Function to analyze packets\ndef analyze_packets(packets):\n df = pd.DataFrame([packet.summary() for packet in packets])\n model = IsolationForest(contamination=0.1)\n df['anomaly'] = model.fit_predict(df)\n return df\n\n# Function to handle metaprogramming\ndef adapt_protocol(df):\n # Add logic to adapt to changing network conditions and protocols\n pass\n\n# Function to handle concurrency and parallel processing\nasync def monitor_and_analyze(nodes):\n with concurrent.futures.ThreadPoolExecutor() as executor:\n loop = asyncio.get_event_loop()\n tasks = [loop.run_in_executor(executor, monitor_traffic, node) for node in nodes]\n packets = await asyncio.gather(*tasks)\n ...",,"<string>:3:1: 'from scapy.all import *' used; unable to detect undefined names\n<string>:9:15: 'sniff' may be undefined, or defined from star imports: scapy.all\n"
240,"from vpython import *\n\n# Create a sphere for the head\nhead = sphere(pos=vector(0, 1, 0), radius=1, color=color.red)\n\n# Create cylinders for the body and arms\nbody = cylinder(pos=vector(0, 0, 0), axis=vector(0, -2, 0), radius=1, color=color.blue)\nleft_arm = cylinder(pos=vector(-1, 0, 0), axis=vector(0, -1, 0), radius=0.5, color=color.green)\nright_arm = cylinder(pos=vector(1, 0, 0), axis=vector(0, -1, 0), radius=0.5, color=color.green)\n\n# Create cylinders for the legs\nleft_leg = cylinder(pos=vector(-1, -2, 0), axis=vector(0, -2, 0), radius=0.75, color=color.orange)\nright_leg = cylinder(pos=vector(1, -2, 0), axis=vector(0, -2, 0), radius=0.75, color=color.orange)",,"<string>:1:1: 'from vpython import *' used; unable to detect undefined names\n<string>:4:8: 'sphere' may be undefined, or defined from star imports: vpython\n<string>:4:19: 'vector' may be undefined, or defined from star imports: vpython\n<string>:4:52: 'color' may be undefined, or defined from star imports: vpython\n<string>:7:8: 'cylinder' may be undefined, or defined from star imports: vpython\n<string>:7:21: 'vector' may be undefined, or defined from star imports: vpython\n<string>:7:43: 'vector' may be undefined, or defined from star imports: vpython\n<string>:7:77: 'color' may be undefined, or defined from star imports: vpython\n<string>:8:12: 'cylinder' may be undefined, or defined from star imports: vpython\n<string>:8:25: 'vector' may be undefined, or defined from star imports: vpython\n<string>:8:48: 'vector' may be undefined, or defined from star imports: vpython\n<string>:8:84: 'color' may be undefined, or defined from star imports: vpython\n<string>:9:13: 'cylinder' ma..."
663,"import asyncio\nimport pandas as pd\nfrom scapy.all import *\nfrom scapy.layers.http import HTTPRequest\n\n# Define a function to process packets concurrently\nasync def process_packet(packet):\n if HTTPRequest in packet:\n http_layer = packet.getlayer(HTTPRequest)\n url = http_layer.Host.decode()\n if ""malicious_url"" in url:\n print(f""Alert: Malicious URL detected: {url}"")\n\n# Define a function to sniff packets and process them concurrently\ndef sniff_packets():\n loop = asyncio.new_event_loop()\n asyncio.set_event_loop(loop)\n try:\n loop.run_until_complete(asyncio.gather(\n *[process_packet(packet) for packet in AsyncSniffer().sniff()]))\n except KeyboardInterrupt:\n pass\n finally:\n loop.close()\n\n# Start packet sniffing\nsniff_packets()",,"<string>:2:1: 'pandas as pd' imported but unused\n<string>:3:1: 'from scapy.all import *' used; unable to detect undefined names\n<string>:20:52: 'AsyncSniffer' may be undefined, or defined from star imports: scapy.all\n"
821,"from scapy.all import *\nfrom multiprocessing import Pool\nimport pandas as pd\n\ndef analyze_packet(packet):\n # Your anomaly detection logic here\n # For example, check if packet size is above a certain threshold\n if packet.len > 1500:\n return 'Anomaly Detected'\n else:\n return 'No Anomaly'\n\ndef process_packets(packets):\n with Pool() as pool:\n results = pool.map(analyze_packet, packets)\n return results\n\n# Capture network traffic\npackets = sniff(count=1000)\n\n# Convert packets to DataFrame for easier analysis\ndf = pd.DataFrame([packet.summary() for packet in packets], columns=['Packet Summary'])\n\n# Process packets in parallel\nresults = process_packets(packets)\n\n# Add results to DataFrame\ndf['Anomaly'] = results\n\nprint(df.head())",,"<string>:1:1: 'from scapy.all import *' used; unable to detect undefined names\n<string>:19:11: 'sniff' may be undefined, or defined from star imports: scapy.all\n"
918,"from scapy.all import *\nfrom sklearn.ensemble import IsolationForest\nfrom multiprocessing import Process, Queue\nimport time\n\n# Define the features we're interested in\nfeatures = [""src"", ""dst"", ""len"", ""ttl"", ""proto""]\n\n# Initialize the detection model\nmodel = IsolationForest(contamination=0.01)\n\n# Function to process packets\ndef process_packet(packet, q):\n data = [packet[f] for f in features]\n q.put(data)\n\n# Function to train the model\ndef train_model(q, model):\n data = []\n while True:\n if not q.empty():\n data.append(q.get())\n if len(data) > 100:\n model.fit(data)\n data = []\n time.sleep(0.1)\n\n# Start capturing packets\npackets = sniff(prn=lambda x: process_packet(x, q), count=100)\n\n# Initialize the queue and the model training process\nq = Queue()\np = Process(target=train_model, args=(q, model))\np.start()\n\n# Wait for the model training process to finish\np.join()",,"<string>:1:1: 'from scapy.all import *' used; unable to detect undefined names\n<string>:29:11: 'sniff' may be undefined, or defined from star imports: scapy.all\n"


In [5]:
# Replace `python_codes` with the copy stored on disk. The commented-out line is the
# matching write to the same path; uncomment it to refresh the file.
# python_codes.to_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/python_codes_with_checks_0927.csv', index=False)
python_codes = pd.read_csv(
    filepath_or_buffer='/mnt/foundation-shared/nina_xu_gretel_ai/datasets/python_codes_with_checks_0927.csv',
)


In [172]:
# python_codes[python_codes.is_valid_python_with_pyflakes == False][['code', 'pyflakes_error', 'is_valid_python_with_compile']].head(30)

In [191]:
# Row indices worth inspecting by hand, grouped by the failure they illustrate.
# Set `ind` to any of them to print that row's prompt and generated code.
#   compile errors:   15, 115
#   pyflakes errors:  2 (imported but unused), 69 (assigned to but never used), 36 (undefined name)
#   mypy errors:      576 (missing positional argument), 743 (unsupported operand types),
#                     545 (has no attribute X)
#   incomplete code:  261
# The earlier chain of `ind = ...` lines was overwritten before use; only the last one took effect.
ind = 509

# Explicit label-based lookup of the chosen row.
print(python_codes.loc[ind, 'prompt'])
print('----------\n')
print(python_codes.loc[ind, 'code'])

Expert Python Developer Needed for Autonomous Driving Software: We require an expert Python developer to create a concurrent and parallel processing system that can handle multiple data streams from various sensors (e.g., LIDAR, RADAR, cameras) in real-time. The system should be capable of metaprogramming to adapt to different vehicle models and their unique sensor configurations. The final code will be integrated into our autonomous driving software for the automotive industry. Please provide a detailed solution that demonstrates your expertise in concurrency, parallel processing, and metaprogramming.

### Instructions
    * The code should have a complexity of "Expert: Concurrency, parallel processing, and metaprogramming".
    * Write code that might be used in the "Automotive Software" industry within a "Autonomous Driving" context.
    * Try to include at least 1 of the following Python packages:  `numpy`.
    * Include only the code, without any comments or additional text.

----

In [None]:
# Distribution of `error_category` among the rows that mypy rejects.
# NOTE(review): no visible cell creates an `error_category` column (only
# `pyflakes_error_category` and `ruff_error_category`) - confirm it exists before running.
(
    python_codes.error_category
    .loc[python_codes.is_valid_python_with_mypy.eq(False)]
    .value_counts()
)

In [174]:
# First 30 mypy-rejected rows, shown next to the pyflakes category for comparison.
python_codes.loc[
    python_codes['is_valid_python_with_mypy'].eq(False),
    ['mypy_error', 'pyflakes_error_category'],
].head(30)

Unnamed: 0,mypy_error,pyflakes_error_category
13,"<string>:2: [1m[31merror:[m Library stubs not installed for [m[1m""requests""[m [m[33m[import-untyped][m\n<string>:2: [34mnote:[m Hint: [m[1m""python3 -m pip install types-requests""[m[m\n<string>:2: [34mnote:[m (or run [m[1m""mypy --install-types""[m to install all missing stub packages)[m\n<string>:2: [34mnote:[m See [4mhttps://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports[m[m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",
15,<string>:37: [1m[31merror:[m expected an indented block [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
26,<string>:48: [1m[31merror:[m expected an indented block [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
41,"<string>:2: [1m[31merror:[m Library stubs not installed for [m[1m""requests""[m [m[33m[import-untyped][m\n<string>:2: [34mnote:[m Hint: [m[1m""python3 -m pip install types-requests""[m[m\n<string>:2: [34mnote:[m (or run [m[1m""mypy --install-types""[m to install all missing stub packages)[m\n<string>:2: [34mnote:[m See [4mhttps://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports[m[m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",imported but unused
69,"<string>:18: [1m[31merror:[m Need type annotation for [m[1m""data_queue""[m [m[33m[var-annotated][m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",assigned to but never used
72,"<string>:2: [1m[31merror:[m Library stubs not installed for [m[1m""requests""[m [m[33m[import-untyped][m\n<string>:2: [34mnote:[m Hint: [m[1m""python3 -m pip install types-requests""[m[m\n<string>:2: [34mnote:[m (or run [m[1m""mypy --install-types""[m to install all missing stub packages)[m\n<string>:2: [34mnote:[m See [4mhttps://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports[m[m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",
115,<string>:8: [1m[31merror:[m invalid syntax [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
146,<string>:55: [1m[31merror:[m invalid syntax [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
151,<string>:21: [1m[31merror:[m invalid syntax [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
157,"<string>:35: [1m[31merror:[m Need type annotation for [m[1m""data_queues""[m [m[33m[var-annotated][m\n<string>:36: [1m[31merror:[m Need type annotation for [m[1m""alert_queue""[m [m[33m[var-annotated][m\n[1m[31mFound 2 errors in 1 file (checked 1 source file)[m\n",imported but unused


In [None]:
# python_codes[python_codes.pyflakes_error_category == 'undefined name'][['pyflakes_error', 'mypy_error']]

In [175]:
# Flag snippets that contain the literal placeholder comment '# ...'.
python_codes['incomplete_code'] = ['# ...' in snippet for snippet in python_codes.code]
print(python_codes.incomplete_code.value_counts())

# Show the flagged snippets together with what pyflakes reported for them.
python_codes.loc[python_codes['incomplete_code'].eq(True), ['code', 'pyflakes_error']]

incomplete_code
False    968
True      32
Name: count, dtype: int64


Unnamed: 0,code,pyflakes_error
15,"import threading\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\n\n# Function to process user sessions in parallel\ndef process_session(session):\n # Analyze user behavior in real-time\n user_behavior = analyze_user_behavior(session)\n\n # Detect signs of potential cart abandonment\n is_abandonment = detect_abandonment(user_behavior)\n\n if is_abandonment:\n # Trigger appropriate interventions (e.g., sending a reminder email or push notification)\n trigger_intervention(session)\n\n# Function to analyze user behavior in real-time\ndef analyze_user_behavior(session):\n # Implement logic to analyze user behavior\n # ...\n\n return user_behavior\n\n# Function to detect signs of potential cart abandonment\ndef detect_abandonment(user_behavior):\n # Implement logic to detect signs of potential cart abandonment\n # ...\n\n return is_abandonment\n\n# Function to trigger appropriate interventions (e.g., sending a remi...",<string>:37:1: expected an indented block\nuser_sessions = pd.read_csv('user_sessions.csv')\n^\n
26,"import pandas as pd\nimport numpy as np\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\nfrom multiprocessing import Pool\n\n# Preprocess the data\ndef preprocess_data(data):\n # Perform data cleaning, feature engineering, and transformation\n # ...\n\n return processed_data\n\n# Design a concurrent and parallel processing system\ndef process_data_parallel(data_chunks):\n with Pool() as pool:\n processed_data = pool.map(preprocess_data, data_chunks)\n\n return processed_data\n\n# Build the CLV prediction model\ndef build_clv_model(processed_data):\n # Split the data into training and testing sets\n X_train, X_test, y_train, y_test = train_test_split(\n processed_data.drop('CLV', axis=1),\n processed_data['CLV'],\n test_size=0.2,\n random_state=42\n )\n\n # Train a random forest regression model\n model = RandomFor...",<string>:48:1: expected an indented block\nsales_data = pd.read_csv('sales_data.csv')\n^\n
41,"import concurrent.futures\nimport requests\nimport pandas as pd\nimport numpy as np\nfrom sklearn.ensemble import IsolationForest\nfrom sklearn.preprocessing import StandardScaler\n\n# Define a list of network protocols\nprotocols = ['http', 'https', 'ftp', 'ssh', 'smtp']\n\n# Define a function to analyze network traffic for a given protocol\ndef analyze_traffic(protocol):\n # Fetch network traffic data for the given protocol\n data = requests.get(f'https://api.example.com/traffic/{protocol}').json()\n\n # Preprocess the data\n df = pd.DataFrame(data)\n scaler = StandardScaler()\n df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)\n\n # Detect anomalies using Isolation Forest\n clf = IsolationForest(contamination=0.01)\n preds = clf.fit_predict(df)\n anomalies = df[preds == -1]\n\n # Mitigate the detected anomalies\n # ...\n\n return anomalies\n\n# Create a thread pool executor\nwith concurrent.futures.ThreadPoolExecutor() as exec...",<string>:4:1: 'numpy as np' imported but unused\n
69,"import threading\nimport queue\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport scipy.stats as stats\n\n# Define a worker function to process data streams\ndef worker(data_queue):\n while True:\n data = data_queue.get()\n # Perform analytics on the data\n # ...\n # Update network settings based on analytics results\n # ...\n data_queue.task_done()\n\n# Create a queue to hold data streams\ndata_queue = queue.Queue()\n\n# Create and start multiple worker threads\nfor i in range(4):\n t = threading.Thread(target=worker, args=(data_queue,))\n t.start()\n\n# Generate and process synthetic data\nfor _ in range(100):\n data = pd.DataFrame({'signal_strength': stats.norm.rvs(size=1000),\n 'latency': stats.uniform.rvs(size=1000),\n 'packet_loss': stats.binom.rvs(100, 0.05, size=1000)})\n data_queue.put(data)\n\n# Wait for all tasks in the queue to be processed\ndata_queue.joi...",<string>:10:9: local variable 'data' is assigned to but never used\n
103,"import numpy as np\nimport pandas as pd\nimport concurrent.futures\nfrom scipy.optimize import minimize_scalar\n\n# Differential Privacy Mechanism: Laplace Mechanism\ndef laplace_mechanism(data, epsilon):\n noise = np.random.laplace(0, 1 / epsilon, len(data))\n return data + noise\n\n# Process a subset of data\ndef process_subset(subset, epsilon):\n # Add noise to the data\n noisy_data = laplace_mechanism(subset, epsilon)\n # Perform analytics or ML tasks on the noisy data\n # ...\n return result\n\n# Combine results while maintaining privacy\ndef combine_results(results):\n # Perform post-processing on the results to ensure privacy\n # ...\n return combined_result\n\n# Main function to manage processes and combine results\ndef main():\n # Read large dataset\n data = pd.read_csv('large_dataset.csv')\n\n # Split data into subsets for parallel processing\n subsets = np.array_split(data, num_processes)\n\n # Set epsilon value for differenti...",<string>:4:1: 'scipy.optimize.minimize_scalar' imported but unused\n<string>:14:5: local variable 'noisy_data' is assigned to but never used\n<string>:17:12: undefined name 'result'\n<string>:23:12: undefined name 'combined_result'\n<string>:31:36: undefined name 'num_processes'\n<string>:38:74: undefined name 'num_processes'\n<string>:41:5: local variable 'combined_result' is assigned to but never used\n
203,"import asyncio\nimport pandas as pd\nfrom concurrent.futures import ThreadPoolExecutor\n\n# Simulated supplier data feeds\nsupplier_data_feeds = {\n 'Supplier1': 'data1.csv',\n 'Supplier2': 'data2.csv',\n 'Supplier3': 'data3.csv'\n}\n\n# Function to process data from a supplier\ndef process_supplier_data(supplier, file):\n data = pd.read_csv(file)\n # Process data (e.g., update inventory database)\n # ...\n return data\n\n# Function to handle real-time updates from suppliers\nasync def handle_supplier_updates():\n with ThreadPoolExecutor(max_workers=3) as executor:\n tasks = {executor.submit(process_supplier_data, supplier, file): supplier for supplier, file in supplier_data_feeds.items()}\n for future in asyncio.as_completed(tasks):\n supplier = tasks[future]\n try:\n data = await future\n # Process data (e.g., update inventory database)\n # ...\n except Exception as...",<string>:26:17: local variable 'data' is assigned to but never used\n<string>:30:23: f-string is missing placeholders\n
219,"import threading\nimport queue\nimport numpy as np\nfrom sklearn.preprocessing import StandardScaler\nfrom tensorflow.keras.applications import VGG16\nfrom tensorflow.keras.models import Model\n\n# Thread-safe queue for handling video frames\nframe_queue = queue.Queue()\n\n# Function to process video frames\ndef process_frames(stream_id):\n while True:\n frame = frame_queue.get()\n # Process frame here\n # ...\n frame_queue.task_done()\n\n# Load pre-trained CNN model\nbase_model = VGG16(weights='imagenet', include_top=False)\n\n# Define metaprogramming function for dynamic model loading and inference\ndef load_and_infer(model_path):\n # Load custom model\n custom_model = load_model(model_path)\n\n # Create a new model with the custom layers\n input_tensor = base_model.input\n output_tensor = custom_model(base_model.output)\n new_model = Model(inputs=input_tensor, outputs=output_tensor)\n\n return new_model\n\n# Assign each video s...",<string>:3:1: 'numpy as np' imported but unused\n<string>:4:1: 'sklearn.preprocessing.StandardScaler' imported but unused\n<string>:14:9: local variable 'frame' is assigned to but never used\n<string>:25:20: undefined name 'load_model'\n<string>:51:23: undefined name 'preprocess_frame'\n
261,"import requests\nimport pandas as pd\nimport concurrent.futures\n\n# Function to monitor network traffic\ndef monitor_traffic(url):\n response = requests.get(url)\n return response.status_code, url\n\n# Function to analyze network traffic data\ndef analyze_traffic(data):\n df = pd.DataFrame(data, columns=['Status Code', 'URL'])\n\n # Perform analysis using pandas\n # ...\n\n # Return results\n return df\n\n# List of URLs to monitor\nurls = ['https://example.com', 'https://example.org', 'https://example.net']\n\n# Use concurrent.futures to monitor network traffic concurrently\nwith concurrent.futures.ThreadPoolExecutor() as executor:\n results = executor.map(monitor_traffic, urls)\n\n# Analyze the traffic data\ndf = analyze_traffic(results)\n\n# Generate report\nreport = df.to_string(index=False)\nprint(report)",
270,"import pandas as pd\nfrom joblib import Parallel, delayed\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report\n\n# Load the healthcare data\ndata = pd.read_csv('healthcare_data.csv')\n\n# Preprocessing function\ndef preprocess_data(patient):\n # Data preprocessing steps\n # ...\n return processed_patient\n\n# Feature extraction function\ndef extract_features(patient):\n # Feature extraction steps\n # ...\n return features\n\n# Model training function\ndef train_model(features):\n # Model training steps\n # ...\n return model\n\n# Analyze results function\ndef analyze_results(models):\n # Analyze results steps\n # ...\n return report\n\n# Process the data in parallel\nprocessed_data = Parallel(n_jobs=-1)(delayed(preprocess_data)(patient) for _, patient in data.iterrows())\n\nfeatures = Parallel(n_jobs=-1)(delayed(extract_features)(patient) for pat...",<string>:3:1: 'sklearn.preprocessing.StandardScaler' imported but unused\n<string>:4:1: 'sklearn.ensemble.RandomForestClassifier' imported but unused\n<string>:5:1: 'sklearn.metrics.classification_report' imported but unused\n<string>:14:12: undefined name 'processed_patient'\n<string>:26:12: undefined name 'model'\n
333,"import multiprocessing as mp\nimport numpy as np\nimport sklearn.decomposition as skd\nfrom sklearn.preprocessing import StandardScaler\n\ndef process_image(image):\n # Some image processing tasks like filtering, segmentation, or feature extraction\n # ...\n return processed_image\n\ndef apply_pca(image):\n flattened_image = image.flatten()\n pca = skd.PCA(n_components=2)\n pca.fit(flattened_image)\n return pca.transform(flattened_image)\n\nif __name__ == ""__main__"":\n # Assume we have a list of medical images\n images = [np.random.rand(100, 100) for _ in range(100)]\n\n # Standardize the images\n scaler = StandardScaler()\n images = scaler.fit_transform(images)\n\n # Create a pool of processes\n pool = mp.Pool()\n\n # Apply image processing and PCA in parallel\n processed_images = pool.map(process_image, images)\n pca_images = pool.map(apply_pca, processed_images)\n\n # Close the pool and wait for all processes to finish\n p...",<string>:9:12: undefined name 'processed_image'\n
