<a href="https://colab.research.google.com/github/deva3654321/deva/blob/main/project_nm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the air-quality CSV into a DataFrame
csv_path = "16_air_quality_prediction.csv"
df = pd.read_csv(csv_path)

# Preview the first rows and count the empty cells in each column
missing_per_column = df.isnull().sum()
print("Initial Data Overview:")
print(df.head())
print("\nMissing Values:\n", missing_per_column)

# Discard every row that contains at least one missing value
df = df.dropna()

# Parse the 'date' strings once, expand them into numeric year/month/day
# columns, and remove the original 'date' column in the same step
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
).drop(columns=['date'])

# Target: the column the model is trained to predict
target_column = 'predicted_aqi'
y = df[target_column]

# Features: every remaining column, with the categorical 'location'
# expanded into one indicator column per distinct value
feature_frame = df.drop(columns=[target_column])
X = pd.get_dummies(feature_frame, columns=['location'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Train the model
# Seed the forest so its bootstrap sampling and feature selection are
# repeatable; without a fixed random_state the fitted trees, and therefore
# the reported R² / MSE, change on every run of this cell.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows with the fitted model
y_pred = model.predict(X_test)

# Compute both metrics first, then report them
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("\nModel Performance:")
print("R² Score:", r2)
print("Mean Squared Error:", mse)

Initial Data Overview:
         date  location  pm2.5   pm10  temperature  predicted_aqi
0  2023-01-01  Istanbul  29.05  58.73         15.4          40.71
1  2023-01-01    Ankara  24.74  83.86         26.4          50.63
2  2023-01-01     Izmir  23.90  76.91         27.8          39.67
3  2023-01-01     Bursa  43.67  49.58         30.6          36.50
4  2023-01-02  Istanbul  28.40  76.31         23.9          46.89

Missing Values:
 date             0
location         0
pm2.5            0
pm10             0
temperature      0
predicted_aqi    0
dtype: int64

Model Performance:
R² Score: 0.6340785817861676
Mean Squared Error: 32.622809578493154
