In [1]:

# Import necessary libraries
import pandas as pd

# Read the raw records from a CSV located in the working directory
file_path = "dataset.csv"  # Update this if needed
df = pd.read_csv(file_path)

# Show column dtypes / non-null counts, then preview the first rows
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26488 entries, 0 to 26487
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   State           26488 non-null  object
 1   District        26488 non-null  object
 2   Market          26488 non-null  object
 3   Commodity       26488 non-null  object
 4   Variety         26488 non-null  object
 5   Grade           26488 non-null  object
 6   Arrival_Date    26488 non-null  object
 7   Min_Price       26488 non-null  int64 
 8   Max_Price       26488 non-null  int64 
 9   Modal_Price     26488 non-null  int64 
 10  Commodity_Code  26488 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 2.2+ MB


Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code
0,Uttar Pradesh,Gautam Budh Nagar,Dadri,Garlic,Garlic,FAQ,09/07/2024,10500,11000,10800,25
1,Uttar Pradesh,Gautam Budh Nagar,Dadri,Garlic,Garlic,FAQ,17/07/2024,11100,11800,11500,25
2,Uttar Pradesh,Gautam Budh Nagar,Dadri,Garlic,Garlic,FAQ,19/07/2024,11200,11750,11500,25
3,Uttar Pradesh,Gautam Budh Nagar,Dadri,Garlic,Garlic,FAQ,22/07/2024,11300,11900,11600,25
4,Uttar Pradesh,Gautam Budh Nagar,Dadri,Wheat,Dara,FAQ,25/07/2024,2400,2550,2475,1


In [None]:

# Discard rows that are identical in every column to an earlier row;
# the first occurrence of each such row is the one that is kept.
df = df.drop_duplicates(keep="first")
print("Duplicates removed. Current shape:", df.shape)


In [None]:

# Drop the Commodity_Code column (marked redundant by the original author;
# presumably a numeric alias of Commodity — verify before relying on that).
# The message is only printed when a column was actually removed, so that
# re-running this cell does not falsely report a drop.
if 'Commodity_Code' in df.columns:
    df = df.drop(columns=['Commodity_Code'])
    print("Commodity_Code column dropped.")
else:
    print("Commodity_Code column not present; nothing to drop.")


In [None]:

# Count the null entries in every column so gaps are visible before imputation
missing_values = df.isna().sum()
print("Missing Values:\n", missing_values)


In [None]:

# Fill missing price values with the median price of the same commodity
price_columns = ['Min_Price', 'Max_Price', 'Modal_Price']

for price_col in price_columns:
    # transform('median') broadcasts each commodity's median back onto the rows
    # of that commodity, using pandas' built-in aggregation instead of a
    # Python-level lambda executed once per group.
    commodity_median = df.groupby('Commodity')[price_col].transform('median')
    # Only missing entries are replaced; existing prices are left untouched,
    # including on rows whose Commodity is itself missing (groupby excludes
    # those rows, so they simply have no median to fill from).
    df[price_col] = df[price_col].fillna(commodity_median)

print("Missing price values filled with median per commodity.")


In [None]:

# Remove outliers using IQR method
def remove_outliers(df, column, iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the whole column as passed in
    (no grouping is applied here). Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the numeric column the fences are computed on.
    iqr_multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie and still be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the range check, with their original index
        labels. Rows whose value in ``column`` is NaN fail the check and are
        therefore not returned.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = iqr_multiplier * (q3 - q1)
    # between() is inclusive on both ends by default, matching >= / <=.
    within_fences = values.between(q1 - spread, q3 + spread)
    return df[within_fences]

# Filter on each price column in turn. Every pass recomputes its fences on the
# rows that survived the previous pass, so the column order affects the result.
# NOTE(review): the fences are computed over all commodities together; confirm
# that is intended given how different price levels are between commodities.
for price_col in price_columns:
    df = remove_outliers(df, price_col)

print("Outliers removed from price columns.")


In [None]:

# Save cleaned dataset
# Use a path relative to the working directory (the same place "dataset.csv"
# is read from) rather than a machine-specific absolute directory, so the
# notebook runs wherever the input file is available.
cleaned_file_path = 'cleaned_dataset.csv'
# index=False: the row index is not written as an extra column.
df.to_csv(cleaned_file_path, index=False)
print(f"Cleaned dataset saved at {cleaned_file_path}")
