In [2]:
import pandas as pd

# Load the datasets
emissions_path = r"C:\Users\Tushar\Downloads\dagster\anually_emission.csv"
aqi_path = r"C:\Users\Tushar\Downloads\dagster\global AP dataset.csv"

emissions_data = pd.read_csv(emissions_path)
aqi_data = pd.read_csv(aqi_path)

# Display the first few rows to understand the data structure
emissions_data.head(), aqi_data.head()


(       Country ISO 3166-1 alpha-3  Year  Total  Coal  Oil  Gas  Cement  \
 0  Afghanistan                AFG  1750    0.0   NaN  NaN  NaN     NaN   
 1  Afghanistan                AFG  1751    0.0   NaN  NaN  NaN     NaN   
 2  Afghanistan                AFG  1752    0.0   NaN  NaN  NaN     NaN   
 3  Afghanistan                AFG  1753    0.0   NaN  NaN  NaN     NaN   
 4  Afghanistan                AFG  1754    0.0   NaN  NaN  NaN     NaN   
 
    Flaring  Other  Per Capita  
 0      NaN    NaN         NaN  
 1      NaN    NaN         NaN  
 2      NaN    NaN         NaN  
 3      NaN    NaN         NaN  
 4      NaN    NaN         NaN  ,
               Country              City  AQI Value AQI Category  CO AQI Value  \
 0  Russian Federation        Praskoveya         51     Moderate             1   
 1              Brazil  Presidente Dutra         41         Good             1   
 2               Italy   Priolo Gargallo         66     Moderate             1   
 3              Polan

In [3]:
# Clean and select relevant columns from both datasets
emissions_data_cleaned = emissions_data[['Country', 'Year', 'Per Capita']]
aqi_data_cleaned = aqi_data[['Country', 'AQI Value', 'PM2.5 AQI Value', 'Ozone AQI Value', 'NO2 AQI Value']]

# Ensure compatibility for merging: standardizing column names and dropping duplicates
emissions_data_cleaned = emissions_data_cleaned.dropna().drop_duplicates()
aqi_data_cleaned = aqi_data_cleaned.dropna().drop_duplicates()

# Merge datasets on the 'Country' column
merged_data = pd.merge(emissions_data_cleaned, aqi_data_cleaned, on='Country', how='inner')

# Check for the merged dataset structure
merged_data.head()


Unnamed: 0,Country,Year,Per Capita,AQI Value,PM2.5 AQI Value,Ozone AQI Value,NO2 AQI Value
0,Afghanistan,1950,0.011266,151,151,41,0
1,Afghanistan,1950,0.011266,117,117,44,0
2,Afghanistan,1950,0.011266,113,113,42,0
3,Afghanistan,1950,0.011266,77,77,40,0
4,Afghanistan,1950,0.011266,67,67,37,0


In [4]:
import statsmodels.api as sm

# Define dependent and independent variables for MLR
X = merged_data[['Per Capita', 'PM2.5 AQI Value', 'Ozone AQI Value', 'NO2 AQI Value']]
y = merged_data['AQI Value']

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the multiple linear regression model
mlr_model = sm.OLS(y, X).fit()

# Summarize the regression results
mlr_summary = mlr_model.summary()
mlr_summary



0,1,2,3
Dep. Variable:,AQI Value,R-squared:,0.975
Model:,OLS,Adj. R-squared:,0.975
Method:,Least Squares,F-statistic:,18280000.0
Date:,"Fri, 06 Dec 2024",Prob (F-statistic):,0.0
Time:,18:26:40,Log-Likelihood:,-7006800.0
No. Observations:,1866338,AIC:,14010000.0
Df Residuals:,1866333,BIC:,14010000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.8326,0.017,-50.273,0.000,-0.865,-0.800
Per Capita,0.0597,0.002,28.311,0.000,0.056,0.064
PM2.5 AQI Value,0.9998,0.000,7113.132,0.000,1.000,1.000
Ozone AQI Value,0.1307,0.000,493.890,0.000,0.130,0.131
NO2 AQI Value,-0.1747,0.002,-92.881,0.000,-0.178,-0.171

0,1,2,3
Omnibus:,1687393.442,Durbin-Watson:,1.728
Prob(Omnibus):,0.0,Jarque-Bera (JB):,91053327.484
Skew:,4.229,Prob(JB):,0.0
Kurtosis:,36.157,Cond. No.,235.0
