In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('city_hour.csv')
df['Datetime'] = pd.to_datetime(df['Datetime'])

In [3]:
df['Month'] = df['Datetime'].dt.month
df['Year'] = df['Datetime'].dt.year
df['Hour'] = df['Datetime'].dt.hour
df['Minute'] = df['Datetime'].dt.minute

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707875 entries, 0 to 707874
Data columns (total 20 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   City        707875 non-null  object        
 1   Datetime    707875 non-null  datetime64[ns]
 2   PM2.5       562787 non-null  float64       
 3   PM10        411138 non-null  float64       
 4   NO          591243 non-null  float64       
 5   NO2         590753 non-null  float64       
 6   NOx         584651 non-null  float64       
 7   NH3         435333 non-null  float64       
 8   CO          621358 non-null  float64       
 9   SO2         577502 non-null  float64       
 10  O3          578667 non-null  float64       
 11  Benzene     544229 non-null  float64       
 12  Toluene     487268 non-null  float64       
 13  Xylene      252046 non-null  float64       
 14  AQI         578795 non-null  float64       
 15  AQI_Bucket  578795 non-null  object        
 16  Mo

In [5]:
# Count null values in each column
null_count = df.isnull().sum()
print(null_count)


City               0
Datetime           0
PM2.5         145088
PM10          296737
NO            116632
NO2           117122
NOx           123224
NH3           272542
CO             86517
SO2           130373
O3            129208
Benzene       163646
Toluene       220607
Xylene        455829
AQI           129080
AQI_Bucket    129080
Month              0
Year               0
Hour               0
Minute             0
dtype: int64


In [6]:
df = df.drop(columns=[ 'NH3', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket'], axis=1)


In [7]:
cities = df['City'].unique()
print("Unique Cities:", cities)

Unique Cities: ['Ahmedabad' 'Aizawl' 'Amaravati' 'Amritsar' 'Bengaluru' 'Bhopal'
 'Brajrajnagar' 'Chandigarh' 'Chennai' 'Coimbatore' 'Delhi' 'Ernakulam'
 'Gurugram' 'Guwahati' 'Hyderabad' 'Jaipur' 'Jorapokhar' 'Kochi' 'Kolkata'
 'Lucknow' 'Mumbai' 'Patna' 'Shillong' 'Talcher' 'Thiruvananthapuram'
 'Visakhapatnam']


In [8]:
pollutants = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']


# Function to clean data for each city
def clean_city_data(city_df, threshold=4):  # Change threshold based on desired pollutant count
    # Calculate the total number of rows for this city
    total_rows = city_df.shape[0]
    
    # Replace zero values with NaN for easier filtering
    city_df.replace(0, pd.NA, inplace=True)
    
    # Count number of non-NaN pollutant values per row
    city_df['valid_pollutants'] = city_df[pollutants].notna().sum(axis=1)
    
    # Filter rows with too few valid pollutant values (e.g., less than 'threshold')
    city_cleaned_df = city_df[city_df['valid_pollutants'] >= threshold]
    
    # Ensure no more than half of the rows are removed
    max_removal = total_rows // 2  # Half the total number of rows
    removed_rows = total_rows - city_cleaned_df.shape[0]
    
    # If too many rows are removed, adjust by selecting rows randomly
    if removed_rows > max_removal:
        print(f"Too many rows removed for {city_df['City'].iloc[0]}. Retaining maximum allowed rows.")
        city_cleaned_df = city_df.sample(frac=0.5)  # Retain only 50% of the rows
    
    return city_cleaned_df

# 2. Loop through each city and clean its data
cleaned_dataframes = []

for city in cities:
    city_df = df[df['City'] == city].copy()  # Filter rows for the current city
    cleaned_city_df = clean_city_data(city_df)  # Clean the data for this city
    cleaned_dataframes.append(cleaned_city_df)  # Store the cleaned data

# Concatenate the cleaned data back into a single DataFrame
df_cleaned = pd.concat(cleaned_dataframes)

# View the final cleaned DataFrame
print(df_cleaned)

Too many rows removed for Mumbai. Retaining maximum allowed rows.
                 City            Datetime  PM2.5   PM10    NO    NO2    NOx   
0           Ahmedabad 2015-01-01 01:00:00    NaN    NaN  1.00  40.01  36.37  \
1           Ahmedabad 2015-01-01 02:00:00    NaN    NaN  0.02  27.75  19.73   
2           Ahmedabad 2015-01-01 03:00:00    NaN    NaN  0.08  19.32  11.08   
3           Ahmedabad 2015-01-01 04:00:00    NaN    NaN  0.30  16.45    9.2   
4           Ahmedabad 2015-01-01 05:00:00    NaN    NaN  0.12  14.90   7.85   
...               ...                 ...    ...    ...   ...    ...    ...   
707870  Visakhapatnam 2020-06-30 20:00:00   9.50  36.00  2.75  25.57  15.85   
707871  Visakhapatnam 2020-06-30 21:00:00  17.25  49.25  3.62  33.20  20.62   
707872  Visakhapatnam 2020-06-30 22:00:00  36.00  71.00  2.20  30.80   18.2   
707873  Visakhapatnam 2020-06-30 23:00:00  15.75  63.00  1.02  28.90   16.0   
707874  Visakhapatnam 2020-07-01 00:00:00  15.00  66.00  0.40  26

In [9]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 609308 entries, 0 to 707874
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   City              609308 non-null  object        
 1   Datetime          609308 non-null  datetime64[ns]
 2   PM2.5             551735 non-null  float64       
 3   PM10              400444 non-null  float64       
 4   NO                582173 non-null  float64       
 5   NO2               581185 non-null  float64       
 6   NOx               545230 non-null  object        
 7   CO                544214 non-null  object        
 8   SO2               567350 non-null  float64       
 9   O3                567904 non-null  float64       
 10  Benzene           414722 non-null  object        
 11  Month             609308 non-null  int32         
 12  Year              609308 non-null  int32         
 13  Hour              584006 non-null  object        
 14  Minute   

In [10]:
# List of columns to fill NaN values
columns_to_fill = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']

# Fill NaN values for the specified columns without affecting the Date column
df_cleaned[columns_to_fill] = df_cleaned[columns_to_fill].fillna(df_cleaned[columns_to_fill].mean())

# Ensure the Date column remains intact
print(df_cleaned.head())  # Check if Date column is still present


        City            Datetime      PM2.5        PM10    NO    NO2    NOx   
0  Ahmedabad 2015-01-01 01:00:00  68.204801  119.589988  1.00  40.01  36.37  \
1  Ahmedabad 2015-01-01 02:00:00  68.204801  119.589988  0.02  27.75  19.73   
2  Ahmedabad 2015-01-01 03:00:00  68.204801  119.589988  0.08  19.32  11.08   
3  Ahmedabad 2015-01-01 04:00:00  68.204801  119.589988  0.30  16.45   9.20   
4  Ahmedabad 2015-01-01 05:00:00  68.204801  119.589988  0.12  14.90   7.85   

     CO     SO2          O3   Benzene  Month  Year Hour Minute   
0  1.00  122.07   34.878862  3.984604      1  2015    1    NaN  \
1  0.02   85.90   34.878862  3.984604      1  2015    2    NaN   
2  0.08   52.83   34.878862  3.984604      1  2015    3    NaN   
3  0.30   39.53  153.580000  3.984604      1  2015    4    NaN   
4  0.12   32.63   34.878862  3.984604      1  2015    5    NaN   

   valid_pollutants  
0                 5  
1                 5  
2                 5  
3                 6  
4                 

In [12]:
# Define features
time_features = ['Month', 'Year', 'Hour', 'Minute']
air_quality_features = ['PM2.5','PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'Benzene']



In [13]:
# Prepare the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), time_features + air_quality_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['City']) # Change 'sparse' to 'sparse_output'
    ])

In [14]:
# Fit the preprocessor and transform the data
X = preprocessor.fit_transform(df)

In [15]:
# Prepare sequences
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:(i + seq_length)])
        y.append(data[i + seq_length, -len(air_quality_features):])  # Only predict air quality features
    return np.array(X), np.array(y)

seq_length = 24  # Use 24 hours of data to predict the next hour
X_seq, y_seq = create_sequences(X, seq_length)

In [16]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

In [17]:
# Build the LSTM model
input_shape = X_train.shape[1:]
model = Sequential([
    LSTM(64, activation='relu', input_shape=input_shape, return_sequences=True),
    LSTM(32, activation='relu'),
    Dense(len(air_quality_features))
])


  super().__init__(**kwargs)


In [21]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [22]:
optimizer = Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss='mse')



# Early stopping callback to stop training when validation loss stops improving
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
model.fit(X_train, y_train, 
          epochs=50, 
          batch_size=128, 
          validation_split=0.1, 
          verbose=1, 
          callbacks=[early_stopping])



Epoch 1/50
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 13ms/step - loss: nan - val_loss: nan
Epoch 2/50
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 13ms/step - loss: nan - val_loss: nan
Epoch 3/50
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 14ms/step - loss: nan - val_loss: nan
Epoch 4/50
[1m1260/3982[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m35s[0m 13ms/step - loss: nan

In [None]:
# Save the model in SavedModel format (this is the default format)
model.save('city_hr_model1')


In [None]:
from datetime import datetime, timedelta


In [None]:
def predict_future(model, preprocessor, last_sequence, station_id, future_datetime, num_hours=24):
    # Generate a range of datetimes, starting from the input datetime
    future_datetimes = [future_datetime + timedelta(hours=i) for i in range(num_hours)]

    # Prepare the input sequence
    last_sequence_df = pd.DataFrame(last_sequence, columns=time_features + air_quality_features)
    last_sequence_df['City'] = station_id

    # Transform the last sequence
    last_sequence_transformed = preprocessor.transform(last_sequence_df)

    # Initialize the sequence for prediction
    current_sequence = last_sequence_transformed[-seq_length:]

    future_predictions = []

    for future_dt in future_datetimes:
        # Prepare the input for the next time step
        next_input = np.zeros((1, seq_length, current_sequence.shape[1]))
        next_input[0, :-1, :] = current_sequence[1:]

        # Update time features for the next step
        time_features_next = [future_dt.month, future_dt.year, future_dt.hour, future_dt.minute]
        next_input[0, -1, :len(time_features)] = preprocessor.named_transformers_['num'].transform([time_features_next + [0]*len(air_quality_features)])[0, :len(time_features)]

        # Keep the StationId encoding the same
        station_id_cols = preprocessor.named_transformers_['cat'].transform([[station_id]])
        next_input[0, -1, len(time_features) + len(air_quality_features):] = station_id_cols

        # Make prediction
        prediction = model.predict(next_input)

        # Store the prediction
        future_predictions.append(prediction[0])

        # Update the sequence for the next iteration
        current_sequence = next_input[0]
        current_sequence[-1, len(time_features):-len(station_id_cols[0])] = prediction[0]

    # Convert predictions to original scale
    future_predictions = np.array(future_predictions)
    future_predictions_inv = preprocessor.named_transformers_['num'].inverse_transform(
        np.column_stack((np.zeros((len(future_predictions), len(time_features))), future_predictions)))[:, -len(air_quality_features):]

    return future_predictions_inv, future_datetimes


In [None]:
# Get user input
'''
station_id = input("Enter StationId: ")
future_date = input("Enter future Date (YYYY-MM-DD): ")
future_time = input("Enter future Time (HH:MM:SS): ")'''

In [None]:
station_id = 'Ahmedabad'
future_date = '2024-09-27'
future_time = '12:00:00'

In [None]:
future_datetime = datetime.strptime(f"{future_date} {future_time}", "%Y-%m-%d %H:%M:%S")


In [None]:
last_sequence = df[df['StationId'] == station_id].sort_values('Datetime').iloc[-seq_length:]
last_sequence = last_sequence[time_features + air_quality_features].values


In [None]:
future_predictions, prediction_datetimes = predict_future(model, preprocessor, last_sequence, station_id, future_datetime)


In [None]:
future_df = pd.DataFrame(future_predictions, columns=air_quality_features, index=prediction_datetimes)


In [None]:
print(future_df)

# Print the prediction for the specific datetime entered by the user
specific_prediction = future_df.loc[future_datetime]
print(f"\nPrediction for {future_datetime} at station {station_id}:")
for feature, value in specific_prediction.items():
    print(f"{feature}: {value:.2f}")