In [13]:
import pandas as pd
from pandasai import SmartDataframe
from pandasai.llm import OpenAI
import numpy as np
import sqlite3

In [2]:
llm = OpenAI(api_token="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")

# Setup EvaDB

In [3]:
import evadb

cursor = evadb.connect().cursor()
print("Connected to EvaDB")


Connected to EvaDB


In [4]:
cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute()

<evadb.models.storage.batch.Batch at 0x14fc91590>

In [5]:
create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas
            IMPL  './functions/chat_with_df.py';
            """
cursor.query(create_function_query).execute()
print("Created Function")


Created Function


In [6]:
sql_db = """CREATE DATABASE IF NOT EXISTS sqlite_data WITH ENGINE = 'sqlite', PARAMETERS = {
     "database": "evadb.db"
};"""

cursor.query(sql_db).execute()


<evadb.models.storage.batch.Batch at 0x14f9b9210>

Load into SQLite

In [4]:
import sqlite3
csv_file = 'clean_ml_data/Movie/duplicates/dirty_train.csv'
df = pd.read_csv(csv_file)

database_file = 'evadb.db'
conn = sqlite3.connect(database_file)

table_name = 'DUPL_DATA'
df.to_sql(table_name, conn, if_exists='replace', index=False)

conn.commit()
conn.close()

# Comparison of cleaning performance

## Remove Duplicates

### Baseline

In [7]:
clean_dupl_df = pd.read_csv("clean_ml_data/Movie/duplicates/clean_train.csv")

In [8]:
dirty_dupl_df = pd.read_csv("clean_ml_data/Movie/duplicates/dirty_train.csv")

### PandasAI

In [9]:
pd_dirty_dupl_df = SmartDataframe(dirty_dupl_df, config={"llm": llm})

In [10]:
pd_clean_dupl_df = pd_dirty_dupl_df.clean_data()

### EvaAIDf

In [14]:
database_file = 'evadb.db'
sql_conn = sqlite3.connect(database_file)

sql_cursor = sql_conn.cursor()

table_name = "DUPL_DATA"
sql_cursor.execute(f"PRAGMA table_info({table_name})")
columns = [row[1] for row in sql_cursor.fetchall()]

In [15]:
columns

['title',
 'genres',
 'budget',
 'language',
 'duration',
 'year',
 'vote_count',
 'score']

In [None]:
chat_query = f""" SELECT ChatWithPandas('cleaning', 'remove duplicate rows based on all columns from the dataframe',
            title, genres, budget, language, duration, year, vote_count, score) FROM sqlite_data.DUPL_DATA;
"""


print(chat_query)
result = cursor.query(chat_query).execute()


In [17]:
eva_clean_dupl_df = pd.read_csv("cleaned_df.csv")


In [18]:
len(eva_clean_dupl_df)

6510

### Comparisons

In [19]:
print(f"Length of dirty df: {len(dirty_dupl_df)}")
print(f"Lenght of Ideal cleaned df: {len(clean_dupl_df)} ")
print(f"Length of PandasAI library cleaning: {len(pd_clean_dupl_df)}")
print(f"Length of EvaAI cleaned df: {len(eva_clean_dupl_df)}")

Length of dirty df: 6531
Lenght of Ideal cleaned df: 4419 
Length of PandasAI library cleaning: 6531
Length of EvaAI cleaned df: 6510


In [20]:
tmp = pd.read_csv("clean_ml_data/Movie/duplicates/dirty_train.csv")
tmp.head(2)

Unnamed: 0,title,genres,budget,language,duration,year,vote_count,score
0,Play It to the Bone,1,24000000,en,124,1999,53,5.7
1,Harry Potter and the Prisoner of Azkaban,0,130000000,en,141,2004,5877,7.7


In [21]:
#when manually perform a 
len(tmp.drop_duplicates(subset='title'))

4373

## Missing values

In [23]:
clean_missing_df = pd.read_csv("clean_ml_data/Titanic/missing_values/impute_mean_dummy_train.csv")

In [24]:
dirty_missing_df = pd.read_csv("clean_ml_data/Titanic/missing_values/dirty_train.csv")

### PandasAI

In [25]:
pd_dirty_missing_df = SmartDataframe(dirty_missing_df, config={"llm": llm})

In [26]:
pd_clean_missing_df = pd_dirty_missing_df.impute_missing_values()

### EvaAI

In [27]:
database_file = 'evadb.db'
conn = sqlite3.connect(database_file)

table_name = 'MISSING_DATA'
dirty_missing_df.to_sql(table_name, conn, if_exists='replace', index=False)



624

In [28]:
sql_cursor.execute(f"PRAGMA table_info({table_name})")
columns = [row[1] for row in sql_cursor.fetchall()]

conn.commit()
conn.close()

print(columns)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [29]:
chat_query2 = f""" SELECT ChatWithPandas('cleaning',
    'impute null values with the mean value of the column. compute the mean for each column not for entire dataset. if it is a string column then replace with empty string',
     PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked) FROM sqlite_data.MISSING_DATA;
"""

print(chat_query2)
result2 = cursor.query(chat_query2).execute()


 SELECT ChatWithPandas('cleaning',
    'impute null values with the mean value of the column. compute the mean for each column not for entire dataset. if it is a string column then replace with empty string',
     PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked) FROM sqlite_data.MISSING_DATA;



  self.pd_df = df


In [30]:
eva_clean_missing_df = pd.read_csv("cleaned_df.csv")

### Comparisons

In [None]:
print(f"Nos of null values in original dirty df: {dirty_missing_df.isnull().sum()}")
print(f"Nos of null values in original clean df: {clean_missing_df.isnull().sum()}")
print(f"Nos of null values in pandas ai clean df: {pd_clean_missing_df.isnull().sum()}")
print(f"Nos of null values in eva clean df: {eva_clean_missing_df.isnull().sum()}")

## Outliers

In [33]:
clean_outliers_df = pd.read_csv("clean_ml_data/Airbnb/outliers/clean_SD_impute_mean_dummy_train.csv")

In [34]:
dirty_outliers_df = pd.read_csv("clean_ml_data/Airbnb/outliers/dirty_train.csv")

### PandasAI

In [35]:
pd_dirty_outliers_df = SmartDataframe(dirty_outliers_df, config={"llm": llm})

In [36]:
pd_clean_outliers_df = pd_dirty_outliers_df.chat("Replace values in Price column that are more than 2 std deviations from mean with the mean values")

### EvaAI

In [37]:
database_file = 'evadb.db'
conn = sqlite3.connect(database_file)

table_name = 'OUTLIERS_DATA'
dirty_outliers_df.to_sql(table_name, conn, if_exists='replace', index=False)



18406

In [38]:
sql_cursor.execute(f"PRAGMA table_info({table_name})")
columns = [row[1] for row in sql_cursor.fetchall()]

conn.commit()
conn.close()

print(columns)

['Bathrooms', 'Bedrooms', 'Beds', 'LocationName', 'NumGuests', 'NumReviews', 'Price', 'Rating', 'latitude', 'longitude', 'zipcode', 'pop2016', 'pop2010', 'pop2000', 'cost_living_index (US avg. = 100)', 'land_area (sq.mi.)', 'water_area (sq.mi.)', 'pop_density (people per mile)', 'number of males', 'number of females', 'prop taxes paid 2016', 'median taxes (with mortgage', 'median taxes (no mortgage)', 'median house value', 'median houshold income', 'median monthly owner costs (with mortgage)', 'median monthly owner costs (no mortgage)', 'median gross rent', 'median asking price for vacant for-sale home/condo', 'unemployment (%)', 'Number of Homes', 'Count of Abnb', 'Density of Abnb (%)', 'Average Abnb Price (by zipcode)', 'Average NumReviews (by zipcode)', 'Average Rating (by zipcode)', 'Average Number of Bathrooms (by zipcode)', 'Average Number of Bedrooms (by zipcode)', 'Average Number of Beds (by zipcode)', 'Average Number of Guests (by zipcode)']


In [45]:
chat_query3 = f""" SELECT ChatWithPandas('cleaning',
    'Replace values in Price column that are more than 2 std deviations from mean with the mean values',
     LocationName, Price, Rating, latitude) FROM sqlite_data.OUTLIERS_DATA;
"""

print(chat_query3)
result = cursor.query(chat_query2).execute()


 SELECT ChatWithPandas('cleaning',
    'Replace values in Price column that are more than 2 std deviations from mean with the mean values',
     LocationName, Price, Rating, latitude) FROM sqlite_data.OUTLIERS_DATA;



  self.pd_df = df


<evadb.models.storage.batch.Batch at 0x153563010>

In [46]:
eva_clean_outliers_df = pd.read_csv("cleaned_df.csv")

### Comparison

In [47]:
print(f"Max value in Price: {dirty_outliers_df['Price'].max()}")
print(f"Max value in Price: {clean_outliers_df['Price'].max()}")
print(f"Max value in Price: {pd_clean_outliers_df['Price'].max()}")
print(f"Max value in Price: {eva_clean_outliers_df['Price'].max()}")
# print(f"Nos of null values in pandas ai clean df: {pd_clean_missing_df.isnull().sum()}")
# print(f"Nos of null values in eva clean df: {eva_clean_missing_df.isnull().sum()}")

Max value in Price: 999.0
Max value in Price: nan
Max value in Price: 326.0


KeyError: 'Price'