In [1]:
# Note: need to install psycopg2 from source if using in production environment
# https://www.psycopg.org/docs/install.html
# %pip install sqlglot sqlvalidator sqlalchemy psycopg2-binary sqlfluff mysql-connector-python pyodbc
# %pip install pyflakes pylint parso flake8 mypy
# %pip install docker

In [52]:
from importlib import reload
import sql_parsers
reload(sql_parsers)

import python_parsers
reload(python_parsers)

<module 'python_parsers' from '/mnt/foundation-shared/nina_xu_gretel_ai/navigator-helpers/navigator_helpers/steps/code_validation/python_parsers.py'>

In [53]:
import docker
import pandas as pd

from functools import partial

from python_parsers import is_valid_python_with_complie, is_valid_python_with_ast, is_valid_python_with_pyflakes, is_valid_python_with_parso, is_valid_python_with_mypy
from sql_parsers import SimpleSqlValidator, SqliteValidator, PostgresqlValidator, MysqlValidator, SqlserverValidator

In [3]:
# sql_queries = pd.read_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/sql_queries.csv')
sql_queries = pd.read_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/sql_queries_w_dialect_1000.csv')
sql_queries_googlesql = pd.read_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/sql_queries_googlesql_200.csv')
sql_queries = pd.concat([sql_queries, sql_queries_googlesql])
python_typscript_codes = pd.read_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/python_typescript_codes.csv')
python_codes = pd.read_json('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/text_to_python_v1.json')
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 100)

# SQL Code Validation

In [4]:
sql_queries.head(1)
print(sql_queries.Dialect.value_counts())

sql_queries[sql_queries['Dialect'] == 'SQL Server'].head(1)

Dialect
SQL Server            230
PostgreSQL            221
SQLite                209
GoogleSQL             204
MySQL                 197
Oracle SQL             43
OracleSQL              43
Oracle                 37
Oracle SQL Dialect      1
Name: count, dtype: int64


Unnamed: 0,ID,Natural Language Prompt,Context,SQL Query,Domain,Topic,Dialect,Complexity
0,1a2b3c4d,List all students with their respective engagement scores who participated in the online activities in the last month.,"CREATE TABLE Students (StudentID INT PRIMARY KEY, Name NVARCHAR(100));\nCREATE TABLE Engagement (EngagementID INT PRIMARY KEY, StudentID INT, Activity NVARCHAR(100), Score INT, Date DATE, FOREIGN KEY (StudentID) REFERENCES Students(StudentID));","SELECT s.Name, e.Score \nFROM Students s \nJOIN Engagement e ON s.StudentID = e.StudentID \nWHERE e.Date >= DATEADD(month, -1, GETDATE());",Education,Student Engagement,SQL Server,3


In [7]:
"""
Have a PostgreSQL database running in a Docker container. In command line, run the following commands:
# Grant access to non-root users so that the python client will work
> sudo groupadd docker
> sudo usermod -aG docker $USER
> newgrp docker

> docker pull postgres
> docker run --name my-postgres \
  -e POSTGRES_USER=myuser \
  -e POSTGRES_PASSWORD=mypassword \
  -e POSTGRES_DB=mydatabase \
  -p 5433:5432 \
  -d postgres

"""
client = docker.from_env()

# List all running containers
containers = client.containers.list(all=False)
# Get the postgres container
postgres_container = client.containers.get('my-postgres')
# Get container's gateway, not that it's not the "IPAddress" field
postgres_container_gateway = postgres_container.attrs['NetworkSettings']['Gateway']
print(postgres_container_gateway)

postgres_db_creds = {
        "host": postgres_container_gateway,
        "port": 5433, # the default port is 5432, but that was already in use for me
        "user": "myuser",
        "password": "mypassword",
        "dbname": "my-postgres",
    }

172.17.0.1


In [8]:
"""
Have a MySQL database running in a Docker container. In command line, run the following commands:
> docker pull mysql
> docker run --name my-mysql \
  -e MYSQL_ROOT_PASSWORD=myrootpassword \
  -d mysql
"""

mysql_container = client.containers.get('my-mysql')
mysql_container_ip = mysql_container.attrs['NetworkSettings']['IPAddress']
print(mysql_container_ip)

mysql_db_creds = {
    "host": mysql_container_ip,
    "port": 3306, # default port for mysql
    "user": "root",
    "password": "myrootpassword",
}

172.17.0.3


In [9]:
"""
Have a Microsoft SQL Server database running in a Docker container. In command line, run the following commands:
$ docker pull mcr.microsoft.com/mssql/server
$ docker run --name my-sqlserver \
  -e 'ACCEPT_EULA=Y' -e 'MSSQL_SA_PASSWORD=myRoot(!)Password' \
  -p 1433:1433 \
  -d mcr.microsoft.com/mssql/server

$ sudo apt install unixodbc-dev

Install the SQL Server command-line tool (sqlcmd) inside the container:
$ docker exec -it --user root my-sqlserver bash
# apt-get update
# apt-get install -y mssql-tools unixodbc-dev
"""
          
sqlserver_container = client.containers.get('my-sqlserver')
sqlserver_container_ip = sqlserver_container.attrs['NetworkSettings']['IPAddress']
print(sqlserver_container_ip)

sqlserver_db_creds = {
    "host": sqlserver_container_ip,
    "port": 1433, # default port for sql server,
    "user": "sa",
    "password": "myRoot(!)Password",
}

172.17.0.4


In [10]:
#  Apply different SQL validators to the SQL queries
def is_valid_query_and_schema(row, func):
    query_check = func(row['SQL Query'])
    schema_check = func(row['Context'])
    is_valid_schema = schema_check[0]
    is_valid_query = query_check[0]
    is_valid_sql = is_valid_schema and is_valid_query
    error_messages = f"***Schema error: {schema_check[1]}" if not is_valid_schema else ''
    error_messages += f"***Query error: {query_check[1]}" if not is_valid_query else ''
    return is_valid_sql, is_valid_schema, is_valid_query, error_messages

def is_valid_query_and_schema_with_sqlfluff(row):
    dialect_map = {
        'SQLite': 'sqlite',
        'PostgreSQL': 'postgres',
        'MySQL': 'mysql',
        'SQL Server': 'tsql',
        'GoogleSQL': 'bigquery',
        'Oracle': 'oracle',
    }
    if 'Oracle' in row['Dialect']:
        dialect = 'oracle'
    else:
        dialect = dialect_map.get(row['Dialect'], 'ansi')
    query_check = SimpleSqlValidator.is_valid_sql_with_sqlfluff(row['SQL Query'], dialect)
    schema_check = SimpleSqlValidator.is_valid_sql_with_sqlfluff(row['Context'], dialect)
    is_valid_schema = schema_check[0]
    is_valid_query = query_check[0]
    is_valid_sql = is_valid_schema and is_valid_query
    error_messages = f"***Schema error: {schema_check[1]}" if not is_valid_schema else ''
    error_messages += f"***Query error: {query_check[1]}" if not is_valid_query else ''
    return is_valid_sql, is_valid_schema, is_valid_query, error_messages

def check_query_and_schema_separately(sql_queries, method):
    functions_to_apply = {
        'sqlglot': partial(is_valid_query_and_schema, func=SimpleSqlValidator.is_valid_sql_with_sqlglot),
        'sqlquery': partial(is_valid_query_and_schema, func=SimpleSqlValidator.is_valid_sql_with_sqlquery),
        'sqlfluff': is_valid_query_and_schema_with_sqlfluff,
    }

    result = sql_queries.apply(functions_to_apply[method], axis=1).apply(list)
    sql_queries[f'is_valid_sql_with_{method}'] = result.apply(lambda x: x[0])
    sql_queries[f'is_valid_schema_with_{method}'] = result.apply(lambda x: x[1])
    sql_queries[f'is_valid_query_with_{method}'] = result.apply(lambda x: x[2])
    sql_queries[f'error_msgs_{method}'] = result.apply(lambda x: x[3])

    return sql_queries


def check_query_against_schema(row, dialect):

    validator_classes = {
        'SQLite': SqliteValidator,
        'PostgreSQL': PostgresqlValidator,
        'MySQL': MysqlValidator,
        'SQL Server': SqlserverValidator,
    }

    kwargs_postgres = {
        'domain': row['Domain'],
        'db_creds': postgres_db_creds,
    }
    kwargs_mysql = {
        'domain': row['Domain'],
        'db_creds': mysql_db_creds,
        'mysql_container': mysql_container,
    }
    kwargs_sqlserver = {
        'domain': row['Domain'],
        'db_creds': sqlserver_db_creds,
        'sqlserver_container': sqlserver_container,
    }
    all_kwargs = {
        'SQLite': {},
        'PostgreSQL': kwargs_postgres,
        'MySQL': kwargs_mysql,
        'SQL Server': kwargs_sqlserver,
    }

    dialect_name = dialect.lower().replace(' ', '')

    if row['Dialect'] == dialect:
        result = validator_classes[dialect].is_valid_sql(
            row['SQL Query'], row['Context'], **all_kwargs[dialect]
            )
    else:
        result = None, None
    
    row[f'is_valid_{dialect_name}'] = result[0]
    row[f'error_msg_{dialect_name}'] = result[1]
    
    return row


sql_queries = check_query_and_schema_separately(sql_queries, 'sqlfluff')
sql_queries = check_query_and_schema_separately(sql_queries, 'sqlglot')
sql_queries = check_query_and_schema_separately(sql_queries, 'sqlquery')

sql_queries = sql_queries.apply(check_query_against_schema, dialect='SQLite', axis=1)
sql_queries = sql_queries.apply(check_query_against_schema, dialect='PostgreSQL', axis=1)
sql_queries = sql_queries.apply(check_query_against_schema, dialect='MySQL', axis=1)
sql_queries = sql_queries.apply(check_query_against_schema, dialect='SQL Server', axis=1)

In [56]:
sql_queries.to_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/sql_queries_1200_validated.csv', index=False)

In [57]:

print(sql_queries.is_valid_sql_with_sqlglot.value_counts(normalize=True))
print(sql_queries.is_valid_sql_with_sqlquery.value_counts(normalize=True))
print(sql_queries.is_valid_sql_with_sqlfluff.value_counts(normalize=True))

print(sql_queries.is_valid_sqlite.value_counts(normalize=True))
print(sql_queries.is_valid_postgresql.value_counts(normalize=True))
print(sql_queries.is_valid_mysql.value_counts(normalize=True))
print(sql_queries.is_valid_sqlserver.value_counts(normalize=True))


is_valid_sql_with_sqlglot
True     0.968776
False    0.031224
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True     0.993249
False    0.006751
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.945992
False    0.054008
Name: proportion, dtype: float64
is_valid_sqlite
True     0.985646
False    0.014354
Name: proportion, dtype: float64
is_valid_postgresql
True     0.895928
False    0.104072
Name: proportion, dtype: float64
is_valid_mysql
True     0.939086
False    0.060914
Name: proportion, dtype: float64
is_valid_sqlserver
True     0.86087
False    0.13913
Name: proportion, dtype: float64


In [14]:
dialects = ['SQLite', 'PostgreSQL', 'MySQL', 'SQL Server', 'GoogleSQL']
for dialect in dialects:
    print(f"***{dialect}***")
    print(sql_queries[sql_queries['Dialect'] == dialect].is_valid_sql_with_sqlglot.value_counts(normalize=True))
    print(sql_queries[sql_queries['Dialect'] == dialect].is_valid_sql_with_sqlquery.value_counts(normalize=True))
    print(sql_queries[sql_queries['Dialect'] == dialect].is_valid_sql_with_sqlfluff.value_counts(normalize=True))

***SQLite***
is_valid_sql_with_sqlglot
True     0.990431
False    0.009569
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True    1.0
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.990431
False    0.009569
Name: proportion, dtype: float64
***PostgreSQL***
is_valid_sql_with_sqlglot
True     0.986425
False    0.013575
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True     0.99095
False    0.00905
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.918552
False    0.081448
Name: proportion, dtype: float64
***MySQL***
is_valid_sql_with_sqlglot
True    1.0
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True     0.989848
False    0.010152
Name: proportion, dtype: float64
is_valid_sql_with_sqlfluff
True     0.969543
False    0.030457
Name: proportion, dtype: float64
***SQL Server***
is_valid_sql_with_sqlglot
True     0.886957
False    0.113043
Name: proportion, dtype: float64
is_valid_sql_with_sqlquery
True    

In [16]:
methods = ['sqlglot', 'sqlquery', 'sqlfluff']
for dialect in dialects:
    print(f"***{dialect}***")
    for method in methods:
        print(f"***{method}***")
        print(sql_queries[sql_queries['Dialect'] == dialect][f'is_valid_sql_with_{method}'].value_counts(normalize=True))
        print(sql_queries[sql_queries['Dialect'] == dialect][f'is_valid_schema_with_{method}'].value_counts(normalize=True))
        print(sql_queries[sql_queries['Dialect'] == dialect][f'is_valid_query_with_{method}'].value_counts(normalize=True))


***SQLite***
***sqlglot***
is_valid_sql_with_sqlglot
True     0.990431
False    0.009569
Name: proportion, dtype: float64
is_valid_schema_with_sqlglot
True     0.995215
False    0.004785
Name: proportion, dtype: float64
is_valid_query_with_sqlglot
True     0.995215
False    0.004785
Name: proportion, dtype: float64
***sqlquery***
is_valid_sql_with_sqlquery
True    1.0
Name: proportion, dtype: float64
is_valid_schema_with_sqlquery
True    1.0
Name: proportion, dtype: float64
is_valid_query_with_sqlquery
True    1.0
Name: proportion, dtype: float64
***sqlfluff***
is_valid_sql_with_sqlfluff
True     0.990431
False    0.009569
Name: proportion, dtype: float64
is_valid_schema_with_sqlfluff
True     0.995215
False    0.004785
Name: proportion, dtype: float64
is_valid_query_with_sqlfluff
True     0.995215
False    0.004785
Name: proportion, dtype: float64
***PostgreSQL***
***sqlglot***
is_valid_sql_with_sqlglot
True     0.986425
False    0.013575
Name: proportion, dtype: float64
is_valid_sche

In [38]:
# Check if the query is valid with both sqlglot and sqlfluff
# SQLQuery is proven to be useless so not counting it in the aggregate
sql_queries['is_valid_sql_aggregate'] = sql_queries[['is_valid_sql_with_sqlglot', 'is_valid_sql_with_sqlfluff']].all(axis=1)
for dialect in dialects:
    print(f"***{dialect}***")
    print(sql_queries[sql_queries['Dialect'] == dialect].is_valid_sql_aggregate.value_counts(normalize=True))

***SQLite***
is_valid_sql_aggregate
True     0.990431
False    0.009569
Name: proportion, dtype: float64
***PostgreSQL***
is_valid_sql_aggregate
True     0.914027
False    0.085973
Name: proportion, dtype: float64
***MySQL***
is_valid_sql_aggregate
True     0.969543
False    0.030457
Name: proportion, dtype: float64
***SQL Server***
is_valid_sql_aggregate
True     0.882609
False    0.117391
Name: proportion, dtype: float64
***GoogleSQL***
is_valid_sql_aggregate
True     0.823529
False    0.176471
Name: proportion, dtype: float64


In [59]:
# What are the differences between checking against schema and validating the query separately from schema?
for dialect in dialects[:-1]:
    print(f"***{dialect}***")
    dialect_name = dialect.lower().replace(' ', '')
    df = sql_queries[sql_queries['Dialect'] == dialect]
    print(pd.crosstab(df[f'is_valid_{dialect_name}'], df['is_valid_sql_with_sqlfluff']))

***SQLite***
is_valid_sql_with_sqlfluff  False  True 
is_valid_sqlite                         
False                           1      2
True                            1    205
***PostgreSQL***
is_valid_sql_with_sqlfluff  False  True 
is_valid_postgresql                     
False                          18      5
True                            0    198
***MySQL***
is_valid_sql_with_sqlfluff  False  True 
is_valid_mysql                          
False                           6      6
True                            0    185
***SQL Server***
is_valid_sql_with_sqlfluff  False  True 
is_valid_sqlserver                      
False                           1     31
True                            0    198


In [51]:
dialect = dialects[1]
dialect_name = dialect.lower().replace(' ', '')
print(dialect)
df = sql_queries[(sql_queries['Dialect'] == dialect) & 
                 ((sql_queries['is_valid_sql_aggregate'] == False) | 
                  (sql_queries[f'is_valid_{dialect_name}'] == False))]
df[['SQL Query', 'Context', f'error_msg_{dialect_name}', 'error_msgs_sqlfluff', 'error_msgs_sqlquery', 'error_msgs_sqlglot']]

PostgreSQL


Unnamed: 0,SQL Query,Context,error_msg_postgresql,error_msgs_sqlfluff,error_msgs_sqlquery,error_msgs_sqlglot
63,"SELECT AVG(HeartRate) AS AverageHeartRate FROM Patients WHERE Age > 60 AND VisitDate BETWEEN ADD_MONTHS(TRUNC(SYSDATE, 'MM'), -1) AND LAST_DAY(ADD_MONTHS(TRUNC(SYSDATE, 'MM'), -1));","CREATE TABLE Patients (\n PatientID INT PRIMARY KEY,\n Name VARCHAR2(255),\n Age INT,\n VisitDate DATE,\n HeartRate INT\n);","(psycopg2.errors.UndefinedObject) type ""varchar2"" does not exist\nLINE 3: Name VARCHAR2(255),\n ^\n\n[SQL: CREATE TABLE Patients (\n PatientID INT PRIMARY KEY,\n Name VARCHAR2(255),\n Age INT,\n VisitDate DATE,\n HeartRate INT\n);]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE Patients (\n PatientID IN...'",,
136,"SELECT p.name, AVG(r.rating) AS average_rating FROM products p JOIN reviews r ON p.id = r.product_id WHERE p.category = 'electronics' GROUP BY p.name;","CREATE TABLE customers (id NUMBER PRIMARY KEY, name VARCHAR2(50), email VARCHAR2(100)); CREATE TABLE products (id NUMBER PRIMARY KEY, name VARCHAR2(100), category VARCHAR2(50)); CREATE TABLE reviews (id NUMBER PRIMARY KEY, customer_id NUMBER, product_id NUMBER, rating NUMBER, review TEXT, CONSTRAINT fk_customer FOREIGN KEY (customer_id) REFERENCES customers(id), CONSTRAINT fk_product FOREIGN KEY (product_id) REFERENCES products(id));","(psycopg2.errors.UndefinedObject) type ""number"" does not exist\nLINE 1: CREATE TABLE customers (id NUMBER PRIMARY KEY, name VARCHAR2...\n ^\n\n[SQL: CREATE TABLE customers (id NUMBER PRIMARY KEY, name VARCHAR2(50), email VARCHAR2(100)); CREATE TABLE products (id NUMBER PRIMARY KEY, name VARCHAR2(100), category VARCHAR2(50)); CREATE TABLE reviews (id NUMBER PRIMARY KEY, customer_id NUMBER, product_id NUMBER, rating NUMBER, review TEXT, CONSTRAINT fk_customer FOREIGN KEY (customer_id) REFERENCES customers(id), CONSTRAINT fk_product FOREIGN KEY (product_id) REFERENCES products(id));]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE customers (id NUMBER PRIMAR...'",,
152,"SELECT p.product_name, p.price FROM products p JOIN inventory i ON p.product_id = i.product_id WHERE p.available = 'true' AND i.quantity > 0;","CREATE TABLE products (\n product_id VARCHAR2(50) PRIMARY KEY,\n product_name VARCHAR2(255),\n price NUMBER(10, 2),\n available BOOLEAN\n);\n\nCREATE TABLE inventory (\n inventory_id VARCHAR2(50) PRIMARY KEY,\n product_id VARCHAR2(50) REFERENCES products(product_id),\n quantity NUMBER\n);","(psycopg2.errors.UndefinedObject) type ""varchar2"" does not exist\nLINE 2: product_id VARCHAR2(50) PRIMARY KEY,\n ^\n\n[SQL: CREATE TABLE products (\n product_id VARCHAR2(50) PRIMARY KEY,\n product_name VARCHAR2(255),\n price NUMBER(10, 2),\n available BOOLEAN\n);\n\nCREATE TABLE inventory (\n inventory_id VARCHAR2(50) PRIMARY KEY,\n product_id VARCHAR2(50) REFERENCES products(product_id),\n quantity NUMBER\n);]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE products (\n product_id V...'",,
155,"SELECT d.name, SUM(s.quantity_sold) as total_sales FROM sales s JOIN drugs d ON s.drug_id = d.id WHERE DATE_PART('month', s.sale_date) = DATE_PART('month', CURRENT_DATE) AND DATE_PART('year', s.sale_date) = DATE_PART('year', CURRENT_DATE) GROUP BY d.name;","CREATE TABLE sales (id SERIAL PRIMARY KEY, drug_id INTEGER REFERENCES drugs(id), quantity_sold INTEGER, sale_date DATE);\nCREATE TABLE drugs (id SERIAL PRIMARY KEY, name VARCHAR(255) NOT NULL);","(psycopg2.errors.UndefinedTable) relation ""drugs"" does not exist\n\n[SQL: CREATE TABLE sales (id SERIAL PRIMARY KEY, drug_id INTEGER REFERENCES drugs(id), quantity_sold INTEGER, sale_date DATE);\nCREATE TABLE drugs (id SERIAL PRIMARY KEY, name VARCHAR(255) NOT NULL);]\n(Background on this error at: http://sqlalche.me/e/f405)",,,
206,"SELECT City, AVG(BroadbandSpeed) AS AverageSpeed FROM BroadbandData GROUP BY City;","CREATE TABLE BroadbandData (\n City VARCHAR2(100),\n BroadbandSpeed NUMBER\n);\n","(psycopg2.errors.UndefinedObject) type ""varchar2"" does not exist\nLINE 2: City VARCHAR2(100),\n ^\n\n[SQL: CREATE TABLE BroadbandData (\n City VARCHAR2(100),\n BroadbandSpeed NUMBER\n);\n]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE BroadbandData (\n City VA...'",,
232,"SELECT p.PropertyType, SUM(i.Amount) AS TotalInvestment FROM Investments i JOIN Properties p ON i.PropertyID = p.PropertyID GROUP BY p.PropertyType;","CREATE TABLE Investments (InvestmentID VARCHAR2(20), InvestorID VARCHAR2(20), PropertyID VARCHAR2(20), Amount NUMBER(10, 2), InvestmentDate DATE); CREATE TABLE Properties (PropertyID VARCHAR2(20), PropertyType VARCHAR2(50), Address VARCHAR2(255));","(psycopg2.errors.UndefinedObject) type ""varchar2"" does not exist\nLINE 1: CREATE TABLE Investments (InvestmentID VARCHAR2(20), Investo...\n ^\n\n[SQL: CREATE TABLE Investments (InvestmentID VARCHAR2(20), InvestorID VARCHAR2(20), PropertyID VARCHAR2(20), Amount NUMBER(10, 2), InvestmentDate DATE); CREATE TABLE Properties (PropertyID VARCHAR2(20), PropertyType VARCHAR2(50), Address VARCHAR2(255));]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE Investments (InvestmentID V...'",,
261,SELECT P.name FROM Patients P JOIN Appointments A ON P.patient_id = A.patient_id JOIN Doctors D ON A.doctor_id = D.doctor_id WHERE D.name = 'Dr. Smith' AND appointment_date BETWEEN CURRENT_DATE AND (CURRENT_DATE + INTERVAL '7 days');,"CREATE TABLE Patients (patient_id SERIAL PRIMARY KEY, name VARCHAR(100), age INT, gender CHAR(1)); CREATE TABLE Appointments (appointment_id SERIAL PRIMARY KEY, patient_id INT, doctor_id INT, appointment_date DATE, FOREIGN KEY (patient_id) REFERENCES Patients(patient_id), FOREIGN KEY (doctor_id) REFERENCES Doctors(doctor_id)); CREATE TABLE Doctors (doctor_id SERIAL PRIMARY KEY, name VARCHAR(100), specialty VARCHAR(100));","(psycopg2.errors.UndefinedTable) relation ""doctors"" does not exist\n\n[SQL: CREATE TABLE Patients (patient_id SERIAL PRIMARY KEY, name VARCHAR(100), age INT, gender CHAR(1)); CREATE TABLE Appointments (appointment_id SERIAL PRIMARY KEY, patient_id INT, doctor_id INT, appointment_date DATE, FOREIGN KEY (patient_id) REFERENCES Patients(patient_id), FOREIGN KEY (doctor_id) REFERENCES Doctors(doctor_id)); CREATE TABLE Doctors (doctor_id SERIAL PRIMARY KEY, name VARCHAR(100), specialty VARCHAR(100));]\n(Background on this error at: http://sqlalche.me/e/f405)",,,
298,SELECT product_name FROM products WHERE in_stock = TRUE AND price > 50;,"CREATE TABLE products (product_id NUMBER PRIMARY KEY, product_name VARCHAR2(255), price NUMBER, in_stock BOOLEAN);","(psycopg2.errors.UndefinedObject) type ""number"" does not exist\nLINE 1: CREATE TABLE products (product_id NUMBER PRIMARY KEY, produc...\n ^\n\n[SQL: CREATE TABLE products (product_id NUMBER PRIMARY KEY, product_name VARCHAR2(255), price NUMBER, in_stock BOOLEAN);]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE products (product_id NUMBER...'",,
311,SELECT AVG(grade) AS average_grade FROM Students WHERE course_name = 'Data Science 101';,"CREATE TABLE Students (\n student_id VARCHAR2(50),\n student_name VARCHAR2(100),\n grade NUMBER(3, 2),\n course_name VARCHAR2(100)\n);\n\nCREATE TABLE Courses (\n course_id VARCHAR2(50),\n course_name VARCHAR2(100),\n instructor VARCHAR2(100)\n);","(psycopg2.errors.UndefinedObject) type ""varchar2"" does not exist\nLINE 2: student_id VARCHAR2(50),\n ^\n\n[SQL: CREATE TABLE Students (\n student_id VARCHAR2(50),\n student_name VARCHAR2(100),\n grade NUMBER(3, 2),\n course_name VARCHAR2(100)\n);\n\nCREATE TABLE Courses (\n course_id VARCHAR2(50),\n course_name VARCHAR2(100),\n instructor VARCHAR2(100)\n);]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE Students (\n student_id V...'",,
340,"SELECT COUNT(*)\nFROM CustomerComplaints\nWHERE DateReceived >= ADD_MONTHS(SYSDATE, -1);","CREATE TABLE CustomerComplaints (\n ComplaintID INT PRIMARY KEY,\n CustomerID INT,\n DateReceived DATE,\n ComplaintText VARCHAR2(4000),\n Status VARCHAR2(50)\n);","(psycopg2.errors.UndefinedObject) type ""varchar2"" does not exist\nLINE 5: ComplaintText VARCHAR2(4000),\n ^\n\n[SQL: CREATE TABLE CustomerComplaints (\n ComplaintID INT PRIMARY KEY,\n CustomerID INT,\n DateReceived DATE,\n ComplaintText VARCHAR2(4000),\n Status VARCHAR2(50)\n);]\n(Background on this error at: http://sqlalche.me/e/f405)","***Schema error: PRS: Line 1, Position 1: Found unparsable section: 'CREATE TABLE CustomerComplaints (\n Co...'",,


In [49]:
print(sql_queries['Context'].loc[5])

5    CREATE TABLE Properties (PropertyID INT PRIMARY KEY, Address NVARCHAR(255), OwnerID INT); CREATE TABLE Rentals (RentalID INT PRIMARY KEY, PropertyID INT FOREIGN KEY REFERENCES Properties(PropertyID), TenantID INT, RentAmount DECIMAL(10,2), RentDate DATE); CREATE TABLE Owners (OwnerID INT PRIMARY KEY, OwnerName NVARCHAR(255));
5                                                                                                                                                                                                                       CREATE TABLE Machines (\n  machine_id SERIAL PRIMARY KEY,\n  machine_name VARCHAR(255),\n  last_active_date DATE\n);
Name: Context, dtype: object


# Python Code Validation

In [None]:
python_codes.head(1)

In [None]:
def check_code_with_method(df, method='compile', language='python'):
    if language == 'python':
        methods = python_check_methods
    elif language == 'typescript':
        methods = typescript_check_methods
    elif language == 'sql':
        raise ValueError('SQL not supported as it requires a schema as input')
    else:
        raise ValueError('language not supported')
    func = methods[method]
    df[f'check_{method}'] = df['code'].apply(func)
    df[f'is_valid_{language}_with_{method}'] = df[f'check_{method}'].apply(lambda x: x[0])
    df[f'{method}_error'] = df[f'check_{method}'].apply(lambda x: x[1])
    return df

In [166]:
python_check_methods = {
    'compile': is_valid_python_with_complie,
    'ast': is_valid_python_with_ast,
    'pyflakes': is_valid_python_with_pyflakes,
    'parso': is_valid_python_with_parso,
    'mypy': is_valid_python_with_mypy
}

for method in python_check_methods.keys():
    python_codes = check_code_with_method(python_codes, method)
    print(python_codes[f'is_valid_python_with_{method}'].value_counts())

is_valid_python_with_compile
True     976
False     24
Name: count, dtype: int64
is_valid_python_with_ast
True     976
False     24
Name: count, dtype: int64
is_valid_python_with_pyflakes
True     667
False    333
Name: count, dtype: int64
is_valid_python_with_parso
True    1000
Name: count, dtype: int64
is_valid_python_with_mypy
True     913
False     87
Name: count, dtype: int64


In [178]:
python_codes[python_codes.is_valid_python_with_compile == False][['code', 'compile_error', 'ast_error', 'is_valid_python_with_pyflakes']].head(10)

Unnamed: 0,code,compile_error,ast_error,is_valid_python_with_pyflakes
15,"import threading\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\n\n# Function to process user sessions in parallel\ndef process_session(session):\n # Analyze user behavior in real-time\n user_behavior = analyze_user_behavior(session)\n\n # Detect signs of potential cart abandonment\n is_abandonment = detect_abandonment(user_behavior)\n\n if is_abandonment:\n # Trigger appropriate interventions (e.g., sending a reminder email or push notification)\n trigger_intervention(session)\n\n# Function to analyze user behavior in real-time\ndef analyze_user_behavior(session):\n # Implement logic to analyze user behavior\n # ...\n\n return user_behavior\n\n# Function to detect signs of potential cart abandonment\ndef detect_abandonment(user_behavior):\n # Implement logic to detect signs of potential cart abandonment\n # ...\n\n return is_abandonment\n\n# Function to trigger appropriate interventions (e.g., sending a remi...","expected an indented block (<string>, line 37)","expected an indented block (<unknown>, line 37)",False
26,"import pandas as pd\nimport numpy as np\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\nfrom multiprocessing import Pool\n\n# Preprocess the data\ndef preprocess_data(data):\n # Perform data cleaning, feature engineering, and transformation\n # ...\n\n return processed_data\n\n# Design a concurrent and parallel processing system\ndef process_data_parallel(data_chunks):\n with Pool() as pool:\n processed_data = pool.map(preprocess_data, data_chunks)\n\n return processed_data\n\n# Build the CLV prediction model\ndef build_clv_model(processed_data):\n # Split the data into training and testing sets\n X_train, X_test, y_train, y_test = train_test_split(\n processed_data.drop('CLV', axis=1),\n processed_data['CLV'],\n test_size=0.2,\n random_state=42\n )\n\n # Train a random forest regression model\n model = RandomFor...","expected an indented block (<string>, line 48)","expected an indented block (<unknown>, line 48)",False
115,"import asyncio\n\nasync def ingest_data(spacecraft):\n # Connect to spacecraft and start streaming data\n while True:\n data = await spacecraft.receive_data()\n yield data\n```\n\n2. **Data Processing**: Use `pandas` for data processing and analysis. This will allow us to handle large volumes of data efficiently and perform various operations on it.\n\n```python\nimport pandas as pd\n\nasync def process_data(data):\n # Convert data to pandas DataFrame\n df = pd.DataFrame(data)\n\n # Perform various operations on DataFrame (filtering, aggregation, etc.)\n processed_df = df.groupby('timestamp').mean()\n\n return processed_df\n```\n\n3. **Data Analysis**: Use `seaborn` or `matplotlib` for data visualization. This will allow us to gain insights from our data.\n\n```python\nimport seaborn as sns\n\nasync def analyze_data(processed_df):\n # Generate a heatmap of the data\n sns.heatmap(processed_df.corr())\n```\n\n4. **Error Handling**: Implement e...","invalid syntax (<string>, line 8)","invalid syntax (<unknown>, line 8)",False
146,"import numpy as np\nimport matplotlib.pyplot as plt\n\ndef q_learning(state_space, action_space, reward_function, learning_rate, discount_factor, num_episodes, epsilon=0.1):\n # Initialize Q-table with zeros\n q_table = np.zeros((state_space, action_space))\n\n # Initialize list to store learning curve\n learning_curve = []\n\n # Loop through episodes\n for episode in range(num_episodes):\n # Reset state at beginning of episode\n state = 0\n\n # Initialize total reward for episode\n total_reward = 0\n\n # Loop through steps in episode\n for step in range(100):\n # Select action based on epsilon-greedy policy\n if np.random.uniform() < epsilon:\n action = np.random.choice(action_space)\n else:\n action = np.argmax(q_table[state])\n\n # Receive reward and next state from environment\n reward = reward_function(state, action)\n next...","invalid syntax (<string>, line 55)","invalid syntax (<unknown>, line 55)",False
151,"import sys\n\ndef count_error_code(log_file_path, error_code):\n count = 0\n with open(log_file_path, 'r') as file:\n for line in file:\n if error_code in line:\n count += 1\n return count\n\nif __name__ == ""__main__"":\n if len(sys.argv) != 3:\n print(""Usage: python script.py <log_file_path> <error_code>"")\n sys.exit(1)\n\n log_file_path = sys.argv[1]\n error_code = sys.argv[2]\n\n error_code_count = count_error_code(log_file_path, error_code)\n print(f""The count of error code '{error_code}' in the log file is: {error_code_count}"")\n```\n\nTo use this script, save it as `count_error_code.py` and run it using the command:\n\n```bash\npython count_error_code.py <log_file_path> <error_code>","invalid syntax (<string>, line 21)","invalid syntax (<unknown>, line 21)",False
160,"import numpy as np\n\nclass MRI_Scanner:\n def __init__(self, patient_id, scan_date, scan_type):\n self.patient_id = patient_id\n self.scan_date = scan_date\n self.scan_type = scan_type\n\n def simulate_scanning(self):\n try:\n if not isinstance(self.patient_id, int) or self.patient_id <= 0:\n raise ValueError(""Invalid patient ID"")\n\n if not isinstance(self.scan_date, str) or len(self.scan_date.split(""-"")) != 3:\n raise ValueError(""Incorrect scan date format. Expected format: YYYY-MM-DD"")\n\n if self.scan_type not in [""T1"", ""T2"", ""FLAIR""]:\n raise ValueError(""Unsupported scan type. Expected values: T1, T2, FLAIR"")\n\n np.random.seed(hash(self.patient_id) ^ hash(self.scan_date) ^ hash(self.scan_type))\n synthetic_image = np.random.randint(0, 256, (128, 128, 128))\n\n return synthetic_image\n except ValueError as e:\n retu...","invalid syntax (<string>, line 28)","invalid syntax (<unknown>, line 28)",False
163,"class VirtualClassroom:\n def __init__(self, teacher=None, students=None, subject=None, room_capacity=0):\n self.teacher = teacher\n self.students = students if students else []\n self.subject = subject\n self.room_capacity = room_capacity\n\n def add_student(self, student):\n if len(self.students) >= self.room_capacity:\n raise Exception(""Room capacity exceeded"")\n self.students.append(student)\n\n def remove_student(self, student):\n if student in self.students:\n self.students.remove(student)\n\n def assign_teacher(self, teacher):\n if self.teacher:\n raise Exception(""Teacher already assigned"")\n self.teacher = teacher\n\n def change_subject(self, subject):\n self.subject = subject\n\n def display_classroom_details(self):\n print(f""Teacher: {self.teacher}"")\n print(f""Students: {', '.join(self.students)}"")\n print(f""Subject: {self.subject}""...","invalid syntax (<string>, line 30)","invalid syntax (<unknown>, line 30)",False
229,"import pandas as pd\nimport random\n\ndef quiz_generator(questions_file_path):\n try:\n # Read the CSV file using pandas\n df = pd.read_csv(questions_file_path)\n\n # Check if the file is empty\n if df.empty:\n return ""Error: The file is empty.""\n\n # Filter the questions by difficulty level\n easy_questions = df[df['difficulty'] == 'easy']\n medium_questions = df[df['difficulty'] == 'medium']\n hard_questions = df[df['difficulty'] == 'hard']\n\n # Check if there are at least 3 questions of each difficulty level\n if len(easy_questions) < 3 or len(medium_questions) < 3 or len(hard_questions) < 3:\n return ""Error: There should be at least 3 questions of each difficulty level.""\n\n # Randomly select 3 questions of each difficulty level\n selected_easy = easy_questions.sample(3)\n selected_medium = medium_questions.sample(3)\n selected_hard = hard_questions.sample...","invalid syntax (<string>, line 36)","invalid syntax (<unknown>, line 36)",False
243,"import numpy as np\nfrom PIL import Image, ImageFilter\n\nclass ImageProcessor:\n def __init__(self, image_path):\n self.image_path = image_path\n try:\n self.image = Image.open(image_path)\n except Exception as e:\n print(f""Error opening image: {e}"")\n\n def to_grayscale(self):\n try:\n self.image = self.image.convert('L')\n except Exception as e:\n print(f""Error converting to grayscale: {e}"")\n\n def resize(self, size):\n try:\n self.image = self.image.resize(size)\n except Exception as e:\n print(f""Error resizing image: {e}"")\n\n def apply_gaussian_blur(self, radius=5):\n try:\n self.image = self.image.filter(ImageFilter.GaussianBlur(radius))\n except Exception as e:\n print(f""Error applying Gaussian blur: {e}"")\n\n def get_image_data(self):\n try:\n return np.asarray(self.image)\n except Ex...","invalid syntax (<string>, line 35)","invalid syntax (<unknown>, line 35)",False
292,"To design a Python-based concurrent game engine for a multiplayer online battle arena (MOBA) game, we can utilize parallel processing for efficient real-time simulation of multiple player actions and metaprogramming for dynamic game object creation. The engine should include features such as network synchronization, collision detection, and physics simulation.\n\nHere is an overview of the design:\n\n1. Libraries and Frameworks:\n - `asyncio`: To handle concurrent processing for network synchronization.\n - `numpy`: For efficient numerical operations and physics simulation.\n - `pygame`: For creating the graphical user interface (GUI) and handling user input.\n - `tensorflow`: If needed for machine learning algorithms or predictions.\n\n2. High-Level Overview of the Design:\n - Game Objects: Represent players, characters, buildings, towers, and other game objects.\n - Game World: Manages the game state, including the list of game objects, terrain, and physics simulation...","invalid syntax (<string>, line 1)","invalid syntax (<unknown>, line 1)",False


In [None]:
python_codes[(python_codes.is_valid_python_with_complie == True) & (python_codes.is_valid_python_with_pyflakes == False)][['code', 'pyflakes_error']].head(10)

In [None]:
get_error_category('pandas is imported but unused')

In [168]:

def get_error_category(pyflakes_error: str) -> str:
    pyflakes_error_categories = ['undefined name', 'assigned to but never used', 'imported but unused']
    for category in pyflakes_error_categories:
        if pyflakes_error is not None:
            if category in str(pyflakes_error):
                return category
    return None

python_codes['pyflakes_error_category'] = python_codes['pyflakes_error'].apply(get_error_category)
python_codes['pyflakes_error_category'][python_codes.is_valid_python_with_compile == False] = 'Invalid Syntax'
python_codes['pyflakes_error_category'][(python_codes.is_valid_python_with_pyflakes == False) & (python_codes.pyflakes_error_category.isnull())] = 'Other'

python_codes['pyflakes_error_category'].value_counts()


pyflakes_error_category
imported but unused           226
undefined name                 53
assigned to but never used     29
Invalid Syntax                 24
Other                           1
Name: count, dtype: int64

In [None]:
python_codes.to_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/python_codes_with_checks.csv', index=False)
# python_codes = pd.read_csv('/mnt/foundation-shared/nina_xu_gretel_ai/datasets/python_codes_with_checks.csv')


In [172]:
# python_codes[python_codes.is_valid_python_with_pyflakes == False][['code', 'pyflakes_error', 'is_valid_python_with_compile']].head(30)

In [191]:
# compile errors
ind = 15
ind = 115
# pyflakes errors
ind = 2 # imported but unused
ind = 69 # assigned to but never used
ind = 36 # undefined name
# mypy errors
ind = 576 # missing positional argument
ind = 743 # unsupported operand types
ind = 545 # has no attribute X
# incomplete code
ind = 261

ind = 509
print(python_codes.prompt[ind])
print('----------\n')
print(python_codes.code[ind])

Expert Python Developer Needed for Autonomous Driving Software: We require an expert Python developer to create a concurrent and parallel processing system that can handle multiple data streams from various sensors (e.g., LIDAR, RADAR, cameras) in real-time. The system should be capable of metaprogramming to adapt to different vehicle models and their unique sensor configurations. The final code will be integrated into our autonomous driving software for the automotive industry. Please provide a detailed solution that demonstrates your expertise in concurrency, parallel processing, and metaprogramming.

### Instructions
    * The code should have a complexity of "Expert: Concurrency, parallel processing, and metaprogramming".
    * Write code that might be used in the "Automotive Software" industry within a "Autonomous Driving" context.
    * Try to include at least 1 of the following Python packages:  `numpy`.
    * Include only the code, without any comments or additional text.

----

In [None]:
python_codes.error_category[(python_codes.is_valid_python_with_mypy == False)].value_counts()

In [174]:
python_codes[(python_codes.is_valid_python_with_mypy == False)][['mypy_error', 'pyflakes_error_category']].head(30)

Unnamed: 0,mypy_error,pyflakes_error_category
13,"<string>:2: [1m[31merror:[m Library stubs not installed for [m[1m""requests""[m [m[33m[import-untyped][m\n<string>:2: [34mnote:[m Hint: [m[1m""python3 -m pip install types-requests""[m[m\n<string>:2: [34mnote:[m (or run [m[1m""mypy --install-types""[m to install all missing stub packages)[m\n<string>:2: [34mnote:[m See [4mhttps://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports[m[m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",
15,<string>:37: [1m[31merror:[m expected an indented block [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
26,<string>:48: [1m[31merror:[m expected an indented block [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
41,"<string>:2: [1m[31merror:[m Library stubs not installed for [m[1m""requests""[m [m[33m[import-untyped][m\n<string>:2: [34mnote:[m Hint: [m[1m""python3 -m pip install types-requests""[m[m\n<string>:2: [34mnote:[m (or run [m[1m""mypy --install-types""[m to install all missing stub packages)[m\n<string>:2: [34mnote:[m See [4mhttps://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports[m[m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",imported but unused
69,"<string>:18: [1m[31merror:[m Need type annotation for [m[1m""data_queue""[m [m[33m[var-annotated][m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",assigned to but never used
72,"<string>:2: [1m[31merror:[m Library stubs not installed for [m[1m""requests""[m [m[33m[import-untyped][m\n<string>:2: [34mnote:[m Hint: [m[1m""python3 -m pip install types-requests""[m[m\n<string>:2: [34mnote:[m (or run [m[1m""mypy --install-types""[m to install all missing stub packages)[m\n<string>:2: [34mnote:[m See [4mhttps://mypy.readthedocs.io/en/stable/running_mypy.html#missing-imports[m[m\n[1m[31mFound 1 error in 1 file (checked 1 source file)[m\n",
115,<string>:8: [1m[31merror:[m invalid syntax [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
146,<string>:55: [1m[31merror:[m invalid syntax [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
151,<string>:21: [1m[31merror:[m invalid syntax [m[33m[syntax][m\n[1m[31mFound 1 error in 1 file (errors prevented further checking)[m\n,Invalid Syntax
157,"<string>:35: [1m[31merror:[m Need type annotation for [m[1m""data_queues""[m [m[33m[var-annotated][m\n<string>:36: [1m[31merror:[m Need type annotation for [m[1m""alert_queue""[m [m[33m[var-annotated][m\n[1m[31mFound 2 errors in 1 file (checked 1 source file)[m\n",imported but unused


In [None]:
# python_codes[python_codes.pyflakes_error_category == 'undefined name'][['pyflakes_error', 'mypy_error']]

In [175]:
python_codes['incomplete_code']= python_codes.code.apply(lambda x: '# ...' in x)
print(python_codes.incomplete_code.value_counts())
python_codes[python_codes.incomplete_code == True][['code', 'pyflakes_error']]

incomplete_code
False    968
True      32
Name: count, dtype: int64


Unnamed: 0,code,pyflakes_error
15,"import threading\nimport pandas as pd\nfrom sklearn.ensemble import RandomForestClassifier\n\n# Function to process user sessions in parallel\ndef process_session(session):\n # Analyze user behavior in real-time\n user_behavior = analyze_user_behavior(session)\n\n # Detect signs of potential cart abandonment\n is_abandonment = detect_abandonment(user_behavior)\n\n if is_abandonment:\n # Trigger appropriate interventions (e.g., sending a reminder email or push notification)\n trigger_intervention(session)\n\n# Function to analyze user behavior in real-time\ndef analyze_user_behavior(session):\n # Implement logic to analyze user behavior\n # ...\n\n return user_behavior\n\n# Function to detect signs of potential cart abandonment\ndef detect_abandonment(user_behavior):\n # Implement logic to detect signs of potential cart abandonment\n # ...\n\n return is_abandonment\n\n# Function to trigger appropriate interventions (e.g., sending a remi...",<string>:37:1: expected an indented block\nuser_sessions = pd.read_csv('user_sessions.csv')\n^\n
26,"import pandas as pd\nimport numpy as np\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import mean_squared_error\nfrom multiprocessing import Pool\n\n# Preprocess the data\ndef preprocess_data(data):\n # Perform data cleaning, feature engineering, and transformation\n # ...\n\n return processed_data\n\n# Design a concurrent and parallel processing system\ndef process_data_parallel(data_chunks):\n with Pool() as pool:\n processed_data = pool.map(preprocess_data, data_chunks)\n\n return processed_data\n\n# Build the CLV prediction model\ndef build_clv_model(processed_data):\n # Split the data into training and testing sets\n X_train, X_test, y_train, y_test = train_test_split(\n processed_data.drop('CLV', axis=1),\n processed_data['CLV'],\n test_size=0.2,\n random_state=42\n )\n\n # Train a random forest regression model\n model = RandomFor...",<string>:48:1: expected an indented block\nsales_data = pd.read_csv('sales_data.csv')\n^\n
41,"import concurrent.futures\nimport requests\nimport pandas as pd\nimport numpy as np\nfrom sklearn.ensemble import IsolationForest\nfrom sklearn.preprocessing import StandardScaler\n\n# Define a list of network protocols\nprotocols = ['http', 'https', 'ftp', 'ssh', 'smtp']\n\n# Define a function to analyze network traffic for a given protocol\ndef analyze_traffic(protocol):\n # Fetch network traffic data for the given protocol\n data = requests.get(f'https://api.example.com/traffic/{protocol}').json()\n\n # Preprocess the data\n df = pd.DataFrame(data)\n scaler = StandardScaler()\n df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)\n\n # Detect anomalies using Isolation Forest\n clf = IsolationForest(contamination=0.01)\n preds = clf.fit_predict(df)\n anomalies = df[preds == -1]\n\n # Mitigate the detected anomalies\n # ...\n\n return anomalies\n\n# Create a thread pool executor\nwith concurrent.futures.ThreadPoolExecutor() as exec...",<string>:4:1: 'numpy as np' imported but unused\n
69,"import threading\nimport queue\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport scipy.stats as stats\n\n# Define a worker function to process data streams\ndef worker(data_queue):\n while True:\n data = data_queue.get()\n # Perform analytics on the data\n # ...\n # Update network settings based on analytics results\n # ...\n data_queue.task_done()\n\n# Create a queue to hold data streams\ndata_queue = queue.Queue()\n\n# Create and start multiple worker threads\nfor i in range(4):\n t = threading.Thread(target=worker, args=(data_queue,))\n t.start()\n\n# Generate and process synthetic data\nfor _ in range(100):\n data = pd.DataFrame({'signal_strength': stats.norm.rvs(size=1000),\n 'latency': stats.uniform.rvs(size=1000),\n 'packet_loss': stats.binom.rvs(100, 0.05, size=1000)})\n data_queue.put(data)\n\n# Wait for all tasks in the queue to be processed\ndata_queue.joi...",<string>:10:9: local variable 'data' is assigned to but never used\n
103,"import numpy as np\nimport pandas as pd\nimport concurrent.futures\nfrom scipy.optimize import minimize_scalar\n\n# Differential Privacy Mechanism: Laplace Mechanism\ndef laplace_mechanism(data, epsilon):\n noise = np.random.laplace(0, 1 / epsilon, len(data))\n return data + noise\n\n# Process a subset of data\ndef process_subset(subset, epsilon):\n # Add noise to the data\n noisy_data = laplace_mechanism(subset, epsilon)\n # Perform analytics or ML tasks on the noisy data\n # ...\n return result\n\n# Combine results while maintaining privacy\ndef combine_results(results):\n # Perform post-processing on the results to ensure privacy\n # ...\n return combined_result\n\n# Main function to manage processes and combine results\ndef main():\n # Read large dataset\n data = pd.read_csv('large_dataset.csv')\n\n # Split data into subsets for parallel processing\n subsets = np.array_split(data, num_processes)\n\n # Set epsilon value for differenti...",<string>:4:1: 'scipy.optimize.minimize_scalar' imported but unused\n<string>:14:5: local variable 'noisy_data' is assigned to but never used\n<string>:17:12: undefined name 'result'\n<string>:23:12: undefined name 'combined_result'\n<string>:31:36: undefined name 'num_processes'\n<string>:38:74: undefined name 'num_processes'\n<string>:41:5: local variable 'combined_result' is assigned to but never used\n
203,"import asyncio\nimport pandas as pd\nfrom concurrent.futures import ThreadPoolExecutor\n\n# Simulated supplier data feeds\nsupplier_data_feeds = {\n 'Supplier1': 'data1.csv',\n 'Supplier2': 'data2.csv',\n 'Supplier3': 'data3.csv'\n}\n\n# Function to process data from a supplier\ndef process_supplier_data(supplier, file):\n data = pd.read_csv(file)\n # Process data (e.g., update inventory database)\n # ...\n return data\n\n# Function to handle real-time updates from suppliers\nasync def handle_supplier_updates():\n with ThreadPoolExecutor(max_workers=3) as executor:\n tasks = {executor.submit(process_supplier_data, supplier, file): supplier for supplier, file in supplier_data_feeds.items()}\n for future in asyncio.as_completed(tasks):\n supplier = tasks[future]\n try:\n data = await future\n # Process data (e.g., update inventory database)\n # ...\n except Exception as...",<string>:26:17: local variable 'data' is assigned to but never used\n<string>:30:23: f-string is missing placeholders\n
219,"import threading\nimport queue\nimport numpy as np\nfrom sklearn.preprocessing import StandardScaler\nfrom tensorflow.keras.applications import VGG16\nfrom tensorflow.keras.models import Model\n\n# Thread-safe queue for handling video frames\nframe_queue = queue.Queue()\n\n# Function to process video frames\ndef process_frames(stream_id):\n while True:\n frame = frame_queue.get()\n # Process frame here\n # ...\n frame_queue.task_done()\n\n# Load pre-trained CNN model\nbase_model = VGG16(weights='imagenet', include_top=False)\n\n# Define metaprogramming function for dynamic model loading and inference\ndef load_and_infer(model_path):\n # Load custom model\n custom_model = load_model(model_path)\n\n # Create a new model with the custom layers\n input_tensor = base_model.input\n output_tensor = custom_model(base_model.output)\n new_model = Model(inputs=input_tensor, outputs=output_tensor)\n\n return new_model\n\n# Assign each video s...",<string>:3:1: 'numpy as np' imported but unused\n<string>:4:1: 'sklearn.preprocessing.StandardScaler' imported but unused\n<string>:14:9: local variable 'frame' is assigned to but never used\n<string>:25:20: undefined name 'load_model'\n<string>:51:23: undefined name 'preprocess_frame'\n
261,"import requests\nimport pandas as pd\nimport concurrent.futures\n\n# Function to monitor network traffic\ndef monitor_traffic(url):\n response = requests.get(url)\n return response.status_code, url\n\n# Function to analyze network traffic data\ndef analyze_traffic(data):\n df = pd.DataFrame(data, columns=['Status Code', 'URL'])\n\n # Perform analysis using pandas\n # ...\n\n # Return results\n return df\n\n# List of URLs to monitor\nurls = ['https://example.com', 'https://example.org', 'https://example.net']\n\n# Use concurrent.futures to monitor network traffic concurrently\nwith concurrent.futures.ThreadPoolExecutor() as executor:\n results = executor.map(monitor_traffic, urls)\n\n# Analyze the traffic data\ndf = analyze_traffic(results)\n\n# Generate report\nreport = df.to_string(index=False)\nprint(report)",
270,"import pandas as pd\nfrom joblib import Parallel, delayed\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.metrics import classification_report\n\n# Load the healthcare data\ndata = pd.read_csv('healthcare_data.csv')\n\n# Preprocessing function\ndef preprocess_data(patient):\n # Data preprocessing steps\n # ...\n return processed_patient\n\n# Feature extraction function\ndef extract_features(patient):\n # Feature extraction steps\n # ...\n return features\n\n# Model training function\ndef train_model(features):\n # Model training steps\n # ...\n return model\n\n# Analyze results function\ndef analyze_results(models):\n # Analyze results steps\n # ...\n return report\n\n# Process the data in parallel\nprocessed_data = Parallel(n_jobs=-1)(delayed(preprocess_data)(patient) for _, patient in data.iterrows())\n\nfeatures = Parallel(n_jobs=-1)(delayed(extract_features)(patient) for pat...",<string>:3:1: 'sklearn.preprocessing.StandardScaler' imported but unused\n<string>:4:1: 'sklearn.ensemble.RandomForestClassifier' imported but unused\n<string>:5:1: 'sklearn.metrics.classification_report' imported but unused\n<string>:14:12: undefined name 'processed_patient'\n<string>:26:12: undefined name 'model'\n
333,"import multiprocessing as mp\nimport numpy as np\nimport sklearn.decomposition as skd\nfrom sklearn.preprocessing import StandardScaler\n\ndef process_image(image):\n # Some image processing tasks like filtering, segmentation, or feature extraction\n # ...\n return processed_image\n\ndef apply_pca(image):\n flattened_image = image.flatten()\n pca = skd.PCA(n_components=2)\n pca.fit(flattened_image)\n return pca.transform(flattened_image)\n\nif __name__ == ""__main__"":\n # Assume we have a list of medical images\n images = [np.random.rand(100, 100) for _ in range(100)]\n\n # Standardize the images\n scaler = StandardScaler()\n images = scaler.fit_transform(images)\n\n # Create a pool of processes\n pool = mp.Pool()\n\n # Apply image processing and PCA in parallel\n processed_images = pool.map(process_image, images)\n pca_images = pool.map(apply_pca, processed_images)\n\n # Close the pool and wait for all processes to finish\n p...",<string>:9:12: undefined name 'processed_image'\n


In [None]:
def get_code_quality_score(row):
    if row['is_valid_python_with_compile'] == False:
        return 0
    if row['pyflakes_error_category'] == 'undefined name':
        return 1
    if row['incomplete_code']:
        return 1
    if row['pyflakes_error_category'] == 'assigned to but never used':
        return 2
    if row['pyflakes_error_category'] == 'imported but unused':
        return 3
    else:
        return 4

python_codes['code_quality_score'] = python_codes.apply(get_code_quality_score, axis=1)

In [None]:
python_codes.groupby('complexity').code_quality_score.mean()  
# No correlation between complexity and code quality score

In [None]:
python_codes.code_quality_score.value_counts()