Import libraries

In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


Load Dataset

In [145]:
# Path of the CSV file holding the dataset.
data_set = "output.csv"

In [146]:
# Load the CSV into a DataFrame. low_memory=False makes pandas parse the file
# in one pass rather than in chunks, which avoids mixed-type column inference.
df = pd.read_csv(
    filepath_or_buffer=data_set,
    low_memory=False,
)


Data Info

In [147]:
def basic_info(df, random_state=None):
    """Print a plain-text diagnostic report for ``df`` to stdout.

    The report covers, in order: shape, ``info()``, dtypes, head, tail,
    column names, missing-value percentages, duplicated-row count, unique
    counts, object and numeric ``describe()`` summaries, the numeric
    correlation matrix, deep memory usage, null counts, zero counts and a
    small random sample of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the sampled rows can be made
        reproducible. The default ``None`` keeps the sample random.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # NOTE(review): the "ðŸ”¹" prefix in the section labels looks like a
    # mis-decoded emoji; the labels are kept unchanged here -- confirm the
    # file's encoding before altering them.
    print("ðŸ”¹ Shape:")
    print(df.shape)

    print("\nðŸ”¹ Info:")
    df.info()  # info() writes its own output and returns None, so no print()

    print("\nðŸ”¹ Dtypes:")
    print(df.dtypes)

    print("\nðŸ”¹ Head:")
    print(df.head(3))

    print("\nðŸ”¹ Tail:")
    print(df.tail(3))

    print("\nðŸ”¹ Columns:")
    print(df.columns.tolist())

    # Computed once and reused by the percentage and the raw-count sections.
    null_counts = df.isnull().sum()

    print("\nðŸ”¹ Missing Values (%):")
    print((null_counts / len(df)) * 100)

    print("\nðŸ”¹ Total Duplicated Rows:")
    print(df.duplicated().sum())

    print("\nðŸ”¹ Unique Values per Column:")
    print(df.nunique())

    # describe(include=...) raises ValueError when no column matches the
    # requested kind, so each summary is guarded by the same dtype selection.
    print("\nðŸ”¹ Object Columns Summary:")
    if df.select_dtypes(include='object').shape[1]:
        print(df.describe(include='object'))
    else:
        print("(no object columns)")

    print("\nðŸ”¹ Numeric Columns Summary:")
    if df.select_dtypes(include='number').shape[1]:
        print(df.describe(include='number'))
    else:
        print("(no numeric columns)")

    print("\nðŸ”¹ Correlation Matrix (Numeric Features):")
    print(df.corr(numeric_only=True))

    print("\nðŸ”¹ Memory Usage:")
    print(df.memory_usage(deep=True))

    print("\nðŸ”¹ Column-wise Null Count:")
    print(null_counts)

    print("\nðŸ”¹ Zero Count per Column (Helpful for ML):")
    print((df == 0).sum())

    # sample() refuses to draw more rows than exist, so cap the request at
    # the frame length; frames with fewer than 3 rows are shown in full.
    print("\nðŸ”¹ Sample Random Rows:")
    print(df.sample(n=min(3, len(df)), random_state=random_state))


In [148]:
# Print the diagnostic report for the frame loaded from the CSV.
basic_info(df)

ðŸ”¹ Shape:
(246091, 18)

ðŸ”¹ Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246091 entries, 0 to 246090
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Area                      246091 non-null  float64
 1   Crop_Year                 246091 non-null  int64  
 2   District_Name             246091 non-null  object 
 3   Production                242361 non-null  float64
 4   Season                    246091 non-null  object 
 5   State_Name                246091 non-null  object 
 6   cation_exchange_capacity  2200 non-null    float64
 7   crop_name                 246091 non-null  object 
 8   humidity                  2200 non-null    float64
 9   nitrogen                  2200 non-null    float64
 10  phosphorus                2200 non-null    float64
 11  potassium                 2200 non-null    float64
 12  rainfall                  2200 non-null    float64
 13  season 

Remove unwanted columns

In [149]:
# Show the column labels of the loaded frame.
df.columns


Index(['Area', 'Crop_Year', 'District_Name', 'Production', 'Season',
       'State_Name', 'cation_exchange_capacity', 'crop_name', 'humidity',
       'nitrogen', 'phosphorus', 'potassium', 'rainfall', 'season',
       'soil_moisture', 'soil_ph', 'temperature', 'test'],
      dtype='object')

In [150]:
# A cell only renders its last expression, so the distinct state names are
# printed explicitly; as a bare expression above df.head() the result of
# unique() was computed and then silently discarded.
print(df['State_Name'].unique())
df.head()

Unnamed: 0,Area,Crop_Year,District_Name,Production,Season,State_Name,cation_exchange_capacity,crop_name,humidity,nitrogen,phosphorus,potassium,rainfall,season,soil_moisture,soil_ph,temperature,test
0,1254.0,2000,NICOBARS,2000.0,Kharif,Andaman and Nicobar Islands,44.0,rice,82.002744,90.0,42.0,43.0,202.935536,Kharif,5.5,6.502985,20.879744,test 1
1,2.0,2000,NICOBARS,1.0,Kharif,Andaman and Nicobar Islands,42.0,rice,80.319644,85.0,58.0,41.0,226.655537,Kharif,5.5,7.038096,21.770462,test 1
2,102.0,2000,NICOBARS,321.0,Kharif,Andaman and Nicobar Islands,33.0,rice,82.320763,60.0,55.0,44.0,263.964248,Kharif,5.5,7.840207,23.004459,test 1
3,176.0,2000,NICOBARS,641.0,Whole Year,Andaman and Nicobar Islands,32.0,rice,80.158363,74.0,35.0,40.0,242.864034,Whole Year,5.5,6.980401,26.491096,test 1
4,720.0,2000,NICOBARS,165.0,Whole Year,Andaman and Nicobar Islands,35.0,rice,81.604873,78.0,42.0,42.0,262.71734,Whole Year,5.5,7.628473,20.130175,test 1


In [151]:
# Print the column labels as a plain Python list.
print(list(df.columns))


['Area', 'Crop_Year', 'District_Name', 'Production', 'Season', 'State_Name', 'cation_exchange_capacity', 'crop_name', 'humidity', 'nitrogen', 'phosphorus', 'potassium', 'rainfall', 'season', 'soil_moisture', 'soil_ph', 'temperature', 'test']


In [152]:
# Trim leading and trailing whitespace from every column label.
# Only the labels are affected: cell values such as 'Telangana ' in
# State_Name keep their trailing spaces.
df.columns = df.columns.str.strip()


In [153]:
# States whose rows are discarded. 'Tamil Nadu' is absent from the list, so
# its rows are kept. Entries are written exactly as they occur in the raw
# data, including the trailing space on 'Jammu and Kashmir ' and 'Telangana '.
states_to_remove = [
    'Andaman and Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh',
    'Assam', 'Bihar', 'Chandigarh', 'Chhattisgarh',
    'Dadra and Nagar Haveli', 'Goa', 'Gujarat', 'Haryana',
    'Himachal Pradesh', 'Jammu and Kashmir ', 'Jharkhand', 'Karnataka',
    'Kerala', 'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Puducherry', 'Punjab', 'Rajasthan',
    'Sikkim', 'Telangana ', 'Tripura', 'Uttar Pradesh', 'Uttarakhand',
    'West Bengal'
]

# Compare whitespace-normalised names on both sides. An exact match against
# the raw entries would silently stop removing a state as soon as its
# trailing space is cleaned up in the source data (or added to another one).
df = df[
    ~df['State_Name'].str.strip().isin(
        [state.strip() for state in states_to_remove]
    )
]
df.head()

Unnamed: 0,Area,Crop_Year,District_Name,Production,Season,State_Name,cation_exchange_capacity,crop_name,humidity,nitrogen,phosphorus,potassium,rainfall,season,soil_moisture,soil_ph,temperature,test
177668,24574.0,2008,ARIYALUR,,Kharif,Tamil Nadu,,Rice,,,,,,,,,,area
177669,209.0,2008,ARIYALUR,,Whole Year,Tamil Nadu,,Arhar/Tur,,,,,,,,,,area
177670,565.0,2008,ARIYALUR,,Whole Year,Tamil Nadu,,Bajra,,,,,,,,,,area
177671,190.0,2008,ARIYALUR,,Whole Year,Tamil Nadu,,Banana,,,,,,,,,,area
177672,31113.0,2008,ARIYALUR,,Whole Year,Tamil Nadu,,Cashewnut,,,,,,,,,,area


In [154]:
# List the distinct state names still present after the state filter.
df['State_Name'].unique()

array(['Tamil Nadu'], dtype=object)

In [155]:
# Drop the area, year, district, season, production and state columns;
# the remaining columns are kept unchanged.
df = df.drop(
    [
        'Area',
        'Crop_Year',
        'District_Name',
        'Season',
        'Production',
        'State_Name',
    ],
    axis='columns',
)


In [156]:
# Show the column labels that remain after the drop.
df.columns

Index(['cation_exchange_capacity', 'crop_name', 'humidity', 'nitrogen',
       'phosphorus', 'potassium', 'rainfall', 'season', 'soil_moisture',
       'soil_ph', 'temperature', 'test'],
      dtype='object')

Inspect the reduced data

In [157]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,cation_exchange_capacity,crop_name,humidity,nitrogen,phosphorus,potassium,rainfall,season,soil_moisture,soil_ph,temperature,test
177668,,Rice,,,,,,,,,,area
177669,,Arhar/Tur,,,,,,,,,,area
177670,,Bajra,,,,,,,,,,area
177671,,Banana,,,,,,,,,,area
177672,,Cashewnut,,,,,,,,,,area


In [158]:
# Write the reduced frame to 'a.csv' in the working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column -- confirm that is wanted, otherwise pass index=False.
df.to_csv('a.csv')