In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline

In [2]:
# suppress warnings
warnings.filterwarnings('ignore')   

#### Data

In [3]:
df_train = pd.read_csv('Train.csv')
df_train.head()

Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.5,92,11.0,60.200001,...,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.6,48.799999,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
2,010Q650 X 2020-01-04,2020-01-04,010Q650,24.0,8.0,56.0,1181.96,96,16.4,33.400002,...,49.839714,-78.342701,34.296977,,,,,,,
3,010Q650 X 2020-01-05,2020-01-05,010Q650,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,...,29.181258,-73.896588,30.545446,,,,,,,
4,010Q650 X 2020-01-06,2020-01-06,010Q650,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,...,0.797294,-68.61248,26.899694,,,,,,,


##### Characteristics

In [4]:
# Filter columns with missing values
columns_with_missing_values = df_train.columns[df_train.isnull().any()]

# Print only columns with missing values and their respective counts
#for column in columns_with_missing_values:
#    missing_count = df_train[column].isnull().sum()
#    print(f"Column '{column}' has {missing_count} missing values.")

In [5]:
# Fill missing values in float-type columns with their mean
for column in columns_with_missing_values:
    if df_train[column].dtype == 'float64':  # Check if column is float-type
        df_train[column].fillna(df_train[column].mean(), inplace=True)

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Data Preprocessing
place_id = df_train['Place_ID']

# Separate features and target variable
X = df_train.drop(columns=['target', 'Place_ID X Date', 'Place_ID'])
y = df_train['target']

In [9]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# extract month from date column
X_train['Month'] = pd.to_datetime(X_train['Date']).dt.month
X_val['Month'] = pd.to_datetime(X_val['Date']).dt.month

In [11]:
# Drop non-numeric columns and the original date column
X_train_numeric = X_train.drop(columns=['Date'])
X_val_numeric = X_val.drop(columns=['Date'])

In [12]:
# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_numeric)
X_val_imputed = imputer.transform(X_val_numeric)

In [13]:
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

In [14]:
# Linear Regression
# Initialize and train the linear regression model
linreg_model = LinearRegression()
linreg_model.fit(X_train_scaled, y_train)

In [15]:
# Predict on the validation set
y_pred = linreg_model.predict(X_val_scaled)

In [16]:
# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 439.183365376352


In [None]:
# Perform k-fold cross-validation
mse_scores = cross_val_score(linreg_model, X_train_scaled, y, cv=5, scoring='neg_mean_squared_error')
mean_mse = -mse_scores.mean()

print("Mean Squared Error (Cross-Validated):", mean_mse)