In [1]:
import os
import sqlite3

import numpy as np
import pandas as pd
from pandasai import SmartDataframe
from pandasai.llm import OpenAI

In [19]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable so that no
# credential has to be typed into (and saved with) the notebook. The original
# placeholder string is kept as the fallback, so the cell behaves as before
# when the variable is not set.
llm = OpenAI(api_token=os.environ.get("OPENAI_API_KEY", "YOUR API KEY"))

# Evaluate performance of EvaAI


In [3]:
import evadb

# Open the EvaDB connection first, then take the cursor that the rest of the
# notebook uses to issue queries.
evadb_connection = evadb.connect()
cursor = evadb_connection.cursor()
print("Connected to EvaDB")


Connected to EvaDB


In [4]:
# Remove any existing ChatWithPandas function registration (no-op if it does not exist).
cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute()

<evadb.models.storage.batch.Batch at 0x1468b8290>

In [5]:
# Register the ChatWithPandas function from its implementation file.
# The query text has no placeholders, so a plain string literal is sufficient.
create_function_query = """CREATE FUNCTION IF NOT EXISTS ChatWithPandas
            IMPL  './functions/chat_with_df.py';
            """
creation_batch = cursor.query(create_function_query)
creation_batch.execute()
print("Created Function")


Created Function


In [6]:
# Register the local SQLite file `evadb.db` with EvaDB under the name `sqlite_data`;
# later queries address its tables as `sqlite_data.<TABLE>`.
sql_db = """CREATE DATABASE IF NOT EXISTS sqlite_data WITH ENGINE = 'sqlite', PARAMETERS = {
     "database": "evadb.db"
};"""

cursor.query(sql_db).execute()


<evadb.models.storage.batch.Batch at 0x1465b4290>

# Comparison of cleaning performance

## Remove Duplicates

### Baseline

In [8]:
# Reference cleaned Movie training set; reported as the "Ideal cleaned df" in the comparison.
clean_dupl_df = pd.read_csv("clean_ml_data/Movie/duplicates/clean_train.csv")

In [9]:
# Movie training set that still contains the duplicate rows to be removed.
dirty_dupl_df = pd.read_csv("clean_ml_data/Movie/duplicates/dirty_train.csv")

### PandasAI

In [10]:
# Wrap the dirty frame in a PandasAI SmartDataframe configured with the LLM defined above.
pd_dirty_dupl_df = SmartDataframe(dirty_dupl_df, config={"llm": llm})

In [11]:
# Natural-language request to PandasAI: de-duplicate on the title column.
pd_clean_dupl_df = pd_dirty_dupl_df.chat("remove duplicate values based on title")

In [12]:
# Number of rows left after the PandasAI de-duplication.
len(pd_clean_dupl_df)

4373

### EvaAIDf

In [13]:
database_file = 'evadb.db'
table_name = "DUPL_DATA"

sql_conn = sqlite3.connect(database_file)
try:
    # Load the dirty Movie data into SQLite so the notebook is reproducible from a
    # fresh database: the other two sections create their tables this way, but
    # DUPL_DATA was previously assumed to exist already.
    dirty_dupl_df.to_sql(table_name, sql_conn, if_exists='replace', index=False)

    # PRAGMA table_info returns one row per column; index 1 of each row is the column name.
    sql_cursor = sql_conn.cursor()
    sql_cursor.execute(f"PRAGMA table_info({table_name})")
    columns = [row[1] for row in sql_cursor.fetchall()]

    sql_conn.commit()
finally:
    # Always release the SQLite handle; the original cell left it open.
    sql_conn.close()

In [14]:
# Display the column names collected from the PRAGMA table_info query.
columns

['title',
 'genres',
 'budget',
 'language',
 'duration',
 'year',
 'vote_count',
 'score']

In [15]:
# Natural-language de-duplication request, passed to ChatWithPandas together with
# the listed columns of the DUPL_DATA table. The query has no placeholders, so a
# plain string literal is used.
chat_query = """ SELECT ChatWithPandas('remove duplicate rows based on title',
            title, genres, budget, language, duration, year, vote_count, score) FROM sqlite_data.DUPL_DATA;
"""

result = cursor.query(chat_query).execute()


  self.pd_df = df


In [16]:
# Load the EvaAI result from new_df.csv.
# NOTE(review): presumably ChatWithPandas writes this file as a side effect of the
# query above — confirm in functions/chat_with_df.py.
eva_clean_dupl_df = pd.read_csv("new_df.csv")


In [17]:
# Number of rows in the frame read back from new_df.csv.
len(eva_clean_dupl_df)

4373

### Comparisons

In [18]:
# Row counts side by side: the dirty input, the reference cleaned set, and the
# output of each cleaning tool. (Fixes the "Lenght" typo and the stray trailing
# space in the second label.)
print(f"Length of dirty df: {len(dirty_dupl_df)}")
print(f"Length of Ideal cleaned df: {len(clean_dupl_df)}")
print(f"Length of PandasAI library cleaning: {len(pd_clean_dupl_df)}")
print(f"Length of EvaAI cleaned df: {len(eva_clean_dupl_df)}")

Length of dirty df: 6531
Lenght of Ideal cleaned df: 4419 
Length of PandasAI library cleaning: 4373
Length of EvaAI cleaned df: 4373


In [19]:
# Fresh copy of the dirty Movie data (same CSV as dirty_dupl_df), kept separate
# for a plain-pandas check; preview the first two rows.
tmp = pd.read_csv("clean_ml_data/Movie/duplicates/dirty_train.csv")
tmp.head(2)

Unnamed: 0,title,genres,budget,language,duration,year,vote_count,score
0,Play It to the Bone,1,24000000,en,124,1999,53,5.7
1,Harry Potter and the Prisoner of Azkaban,0,130000000,en,141,2004,5877,7.7


In [20]:
# Manual baseline: de-duplicate on title with plain pandas and report the row count.
tmp.drop_duplicates(subset='title').shape[0]

4373

Thus EvaAIDataframe's result matches the result produced by PandasAI. It also matches the result of a manual manipulation.

## Missing values

In [7]:
# Reference cleaned Titanic training set; reported as the "original clean df" in the comparison.
clean_missing_df = pd.read_csv("clean_ml_data/Titanic/missing_values/impute_mean_dummy_train.csv")

In [8]:
# Titanic training set that still contains the missing values to be imputed.
dirty_missing_df = pd.read_csv("clean_ml_data/Titanic/missing_values/dirty_train.csv")

### PandasAI

In [9]:
# Wrap the dirty Titanic frame in a PandasAI SmartDataframe configured with the LLM.
pd_dirty_missing_df = SmartDataframe(dirty_missing_df, config={"llm": llm})

In [10]:
# Let PandasAI fill the missing values through its impute_missing_values() helper.
pd_clean_missing_df = pd_dirty_missing_df.impute_missing_values()

### EvaAI

In [16]:
database_file = 'evadb.db'
conn = sqlite3.connect(database_file)

table_name = 'MISSING_DATA'
dirty_missing_df.to_sql(table_name, conn, if_exists='replace', index=False)



624

In [17]:
database_file = 'evadb.db'
sql_conn = sqlite3.connect(database_file)
try:
    # PRAGMA table_info returns one row per column; index 1 of each row is the column name.
    sql_cursor = sql_conn.cursor()
    sql_cursor.execute(f"PRAGMA table_info({table_name})")
    columns = [row[1] for row in sql_cursor.fetchall()]
finally:
    # Close the connection opened in this cell; the original only closed `conn`
    # and left `sql_conn` open.
    sql_conn.close()

# `conn` is the connection opened by the cell that loaded the table.
conn.commit()
conn.close()

print(columns)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [18]:
# Lower-case every column name before it is interpolated into the EvaDB query.
columns = [column_name.lower() for column_name in columns]

In [19]:
# Drop and re-register ChatWithPandas so this section starts from a fresh function.
# The CREATE statement has no placeholders, so a plain string literal is used.
drop_function_query = "DROP FUNCTION IF EXISTS ChatWithPandas;"
cursor.query(drop_function_query).execute()

create_function_query = """CREATE FUNCTION IF NOT EXISTS ChatWithPandas
            IMPL  './functions/chat_with_df.py';
            """
cursor.query(create_function_query).execute()
print("Created Function")


Created Function


In [20]:
# Build the comma-separated column list once, then splice it into the query that
# asks ChatWithPandas to impute nulls in MISSING_DATA.
column_list = ', '.join(columns)
chat_query2 = f""" SELECT ChatWithPandas('impute null values with the mean value of the column.',
    {column_list}) FROM sqlite_data.MISSING_DATA;
"""

# Echo the final query text before running it.
print(chat_query2)
result2 = cursor.query(chat_query2).execute()


 SELECT ChatWithPandas('impute null values with the mean value of the column.',
    passengerid, survived, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked) FROM sqlite_data.MISSING_DATA;



  self.pd_df = df


In [21]:
# Load the EvaAI result from new_df.csv (the same file name the other sections read).
# NOTE(review): presumably ChatWithPandas writes this file as a side effect of the
# query above — confirm in functions/chat_with_df.py.
eva_clean_missing_df = pd.read_csv("new_df.csv")

### Comparisons

In [22]:
# Per-column null counts for each frame, printed in a fixed order.
# Notes carried over from the original cell:
#   - pandasAI fills null values with the string 'Unknown'
#   - evaAI fills null values with an empty string, so it is counted as null
null_count_frames = {
    "original dirty df": dirty_missing_df,
    "original clean df": clean_missing_df,
    "pandas ai clean df": pd_clean_missing_df,
    "eva clean df": eva_clean_missing_df,
}
for frame_label, frame in null_count_frames.items():
    print(f"Nos of null values in {frame_label}: {frame.isnull().sum()}")

Nos of null values in original dirty df: PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            117
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          487
Embarked         2
dtype: int64
Nos of null values in original clean df: PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
Nos of null values in pandas ai clean df: PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
Nos of null values in eva clean df: PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket       

Thus, EvaAIDf correctly imputes the null values with the corresponding values.

## Outliers

In [7]:
# Reference cleaned Airbnb training set used in the max-Price comparison.
clean_outliers_df = pd.read_csv("clean_ml_data/Airbnb/outliers/clean_SD_impute_mean_dummy_train.csv")

In [8]:
# Airbnb training set that still contains the outliers to be handled.
dirty_outliers_df = pd.read_csv("clean_ml_data/Airbnb/outliers/dirty_train.csv")

### PandasAI

In [9]:
# Wrap the dirty Airbnb frame in a PandasAI SmartDataframe configured with the LLM.
pd_dirty_outliers_df = SmartDataframe(dirty_outliers_df, config={"llm": llm})

In [10]:
# Natural-language request to PandasAI: replace Price values beyond 2 standard deviations
# of the mean with the mean. The EvaAI query later in this section uses the same wording.
pd_clean_outliers_df = pd_dirty_outliers_df.chat("Replace values in Price column that are more than 2 std deviations from mean with the mean values")

### EvaAI

In [13]:
database_file = 'evadb.db'
table_name = 'OUTLIERS_DATA'

# Open the SQLite file and keep both the connection and a cursor; a later cell
# uses `sql_cursor` and then commits and closes `conn`.
conn = sqlite3.connect(database_file)
sql_cursor = conn.cursor()

# Write the dirty Airbnb frame into SQLite, replacing any earlier copy of the table.
dirty_outliers_df.to_sql(name=table_name, con=conn, if_exists='replace', index=False)



18406

In [14]:
# PRAGMA table_info returns one row per column; index 1 of each row is the column name.
pragma_rows = sql_cursor.execute(f"PRAGMA table_info({table_name})").fetchall()
columns = [column_info[1] for column_info in pragma_rows]

# Finished with SQLite for this section: commit and release the connection.
conn.commit()
conn.close()

print(columns)

['Bathrooms', 'Bedrooms', 'Beds', 'LocationName', 'NumGuests', 'NumReviews', 'Price', 'Rating', 'latitude', 'longitude', 'zipcode', 'pop2016', 'pop2010', 'pop2000', 'cost_living_index (US avg. = 100)', 'land_area (sq.mi.)', 'water_area (sq.mi.)', 'pop_density (people per mile)', 'number of males', 'number of females', 'prop taxes paid 2016', 'median taxes (with mortgage', 'median taxes (no mortgage)', 'median house value', 'median houshold income', 'median monthly owner costs (with mortgage)', 'median monthly owner costs (no mortgage)', 'median gross rent', 'median asking price for vacant for-sale home/condo', 'unemployment (%)', 'Number of Homes', 'Count of Abnb', 'Density of Abnb (%)', 'Average Abnb Price (by zipcode)', 'Average NumReviews (by zipcode)', 'Average Rating (by zipcode)', 'Average Number of Bathrooms (by zipcode)', 'Average Number of Bedrooms (by zipcode)', 'Average Number of Beds (by zipcode)', 'Average Number of Guests (by zipcode)']


In [15]:
# Outlier-replacement request passed to ChatWithPandas with four columns of the
# OUTLIERS_DATA table. The query has no placeholders, so a plain string literal is used.
chat_query3 = """ SELECT ChatWithPandas('Replace values in Price column that are more than 2 std deviations from mean with the mean values',
     LocationName, Price, Rating, latitude) FROM sqlite_data.OUTLIERS_DATA;
"""

# Echo the final query text before running it.
print(chat_query3)
result = cursor.query(chat_query3).execute()


 SELECT ChatWithPandas('Replace values in Price column that are more than 2 std deviations from mean with the mean values',
     LocationName, Price, Rating, latitude) FROM sqlite_data.OUTLIERS_DATA;



  self.pd_df = df


In [16]:
# Load the EvaAI result from new_df.csv (the same file name the other sections read).
# NOTE(review): presumably ChatWithPandas writes this file as a side effect of the
# query above — confirm in functions/chat_with_df.py.
eva_clean_outliers_df = pd.read_csv("new_df.csv")

### Comparison

In [18]:
# Maximum Price in each frame. Every line now names the frame it reports on;
# the original printed the same "Max value in Price" label four times, so the
# output could not be attributed. The EvaAI frame is indexed with the lower-case
# 'price' column name, exactly as in the original cell.
print(f"Max value in Price (dirty df): {dirty_outliers_df['Price'].max()}")
print(f"Max value in Price (ideal cleaned df): {clean_outliers_df['Price'].max()}")
print(f"Max value in Price (PandasAI cleaned df): {pd_clean_outliers_df['Price'].max()}")
print(f"Max value in Price (EvaAI cleaned df): {eva_clean_outliers_df['price'].max()}")

Max value in Price: 999.0
Max value in Price: nan
Max value in Price: 326.0
Max value in Price: 326.0
