In [1]:
# Import necessary libraries
import pandas as pd

In [13]:
# Read the raw state-level daily rainfall CSV from data/raw/.
# skipinitialspace=True makes the parser skip spaces that follow a delimiter.
raw_csv_path = '../data/raw/Daily Rainfall at State Level_Filtered_Data.csv'
rainfall_df = pd.read_csv(raw_csv_path, skipinitialspace=True)

In [None]:
# Show the first five rows to get a feel for the columns and the raw values
rainfall_df.head(n=5)

Unnamed: 0,date,state_code,state_name,actual_rainfall,Rainfall Storage (rfs),normal_rainfall,deviation
0,1/20/2009,"""5""","""Uttarakhand""",0.0,0.0,2.61,-100
1,1/20/2009,"""18""","""Assam""",0.0,0.0,0.62,-100
2,1/20/2009,"""16""","""Tripura""",0.0,0.0,0.0,""""""
3,1/20/2009,"""36""","""Telangana""",0.0,0.0,0.17,-100
4,1/20/2009,"""2""","""Himachal Pradesh""",0.53,1.024662,4.46,-88.12


In [21]:
# Remove the literal double quotes (and surrounding whitespace) embedded in the
# 'state_name' and 'state_code' values.
#
# Missing values must stay missing: astype(str) can turn NaN into the text
# 'nan', and a quoted-empty value ("") becomes '' once its quotes are stripped.
# Neither is NaN, so a later dropna on 'state_name' would not remove such rows.
# Both cases are therefore restored to NaN after cleaning.
for col in ('state_name', 'state_code'):
    stripped = (
        rainfall_df[col]
        .astype(str)
        .str.replace('"', '', regex=False)  # literal match, not a regex pattern
        .str.strip()
    )
    # Keep a cleaned value only where the source had one and it is not empty
    has_value = rainfall_df[col].notna() & stripped.ne('')
    rainfall_df[col] = stripped.where(has_value)

In [22]:
# Parse the 'date' column into datetime values; entries that cannot be parsed
# become NaT instead of raising
parsed_dates = pd.to_datetime(rainfall_df['date'], errors='coerce')
rainfall_df['date'] = parsed_dates

In [24]:
# Coerce the measurement columns to numeric dtype in one column-wise pass;
# values that cannot be read as numbers become NaN
numeric_cols = ['actual_rainfall', 'Rainfall Storage (rfs)', 'normal_rainfall', 'deviation']
rainfall_df[numeric_cols] = rainfall_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [30]:
# Discard, in place, every row that has a missing value in 'date',
# 'state_name', or 'deviation'
required_cols = ['date', 'state_name', 'deviation']
rainfall_df.dropna(subset=required_cols, inplace=True)

In [31]:
# Renumber the rows 0..n-1, discarding the old index labels
rainfall_df.index = pd.RangeIndex(len(rainfall_df))

In [40]:
# Remove the 'Rainfall Storage (rfs)' column in place; it is not used in the analysis
del rainfall_df['Rainfall Storage (rfs)']

In [43]:
# Write the cleaned frame to the processed-data folder, leaving out the row index
processed_csv_path = '../data/processed/Processed_Rainfall_Data.csv'
rainfall_df.to_csv(processed_csv_path, index=False)

In [44]:
# Print a heading, then display the leading rows of the cleaned frame
print("Cleaned Rainfall Data Preview:")

# .loc slices by label and includes the end label, so this selects labels 0 through 10
preview_labels = slice(0, 10)
rainfall_df.loc[preview_labels]

Cleaned Rainfall Data Preview:


Unnamed: 0,date,state_code,state_name,actual_rainfall,normal_rainfall,deviation
0,2009-01-20,5,Uttarakhand,0.0,2.61,-100.0
1,2009-01-20,18,Assam,0.0,0.62,-100.0
2,2009-01-20,36,Telangana,0.0,0.17,-100.0
3,2009-01-20,2,Himachal Pradesh,0.53,4.46,-88.12
4,2009-01-20,1,Jammu And Kashmir,0.0,3.55,-100.0
5,2009-01-20,17,Meghalaya,0.0,0.21,-100.0
6,2009-01-20,27,Maharashtra,0.0,0.15,-100.0
7,2009-01-20,3,Punjab,0.0,1.19,-100.0
8,2009-01-20,20,Jharkhand,0.0,0.69,-100.0
9,2009-01-20,28,Andhra Pradesh,0.0,0.07,-100.0
