In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
# Load data
train_data = pd.read_csv("Train.csv")
test_data = pd.read_csv("Test.csv")

In [10]:
# Handle missing values
# Filter columns with missing values
columns_with_missing_values_train = train_data.columns[train_data.isnull().any()]
columns_with_missing_values_test = test_data.columns[test_data.isnull().any()]

In [12]:
# Fill missing values in float-type columns with their mean
for column in columns_with_missing_values_train:
    if train_data[column].dtype == 'float64':  # Check if column is float-type
        train_data[column].fillna(train_data[column].mean(), inplace=True)

In [13]:
for column in columns_with_missing_values_test:
    if test_data[column].dtype == 'float64':  # Check if column is float-type
        test_data[column].fillna(test_data[column].mean(), inplace=True)

In [16]:
# Feature engineering
train_data['daily_average'] = train_data['target']
test_data['daily_average'] = 0  # Placeholder, will be updated later

In [17]:
# Feature Selection/Dimensionality Reduction
X_train = train_data.drop(['target'], axis=1)
y_train = train_data['target']
X_test = test_data

In [18]:
# Model Selection and Training
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [19]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

ValueError: could not convert string to float: '6TF1KNR X 2020-03-15'

In [None]:




# Merge data
merged_train = pd.merge(train_data, weather_data, on=['location', 'date'], how='left')
merged_train = pd.merge(merged_train, satellite_data, on=['location', 'date'], how='left')

merged_test = pd.merge(test_data, weather_data, on=['location', 'date'], how='left')
merged_test = pd.merge(merged_test, satellite_data, on=['location', 'date'], how='left')

# Handle missing values
merged_train.fillna(method='ffill', inplace=True)
merged_test.fillna(method='ffill', inplace=True)

# Feature engineering
merged_train['daily_average'] = merged_train['target']
merged_test['daily_average'] = 0  # Placeholder, will be updated later

# Feature Selection/Dimensionality Reduction
X_train = merged_train.drop(['target'], axis=1)
y_train = merged_train['target']
X_test = merged_test

# Model Selection and Training
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train different regression models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "SVR": SVR()
}

best_model = None
best_mae = np.inf

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"{name} MAE: {mae}")
    if mae < best_mae:
        best_mae = mae
        best_model = model

# Optimize hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train_scaled, y_train)
print("Best parameters:", grid_search.best_params_)

# Model Evaluation
y_val_pred = best_model.predict(X_val_scaled)
mae = mean_absolute_error(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
print("Validation MAE:", mae)
print("Validation MSE:", mse)
print("Validation R-squared:", r2)

# Prediction
X_test_scaled = scaler.transform(X_test)
test_predictions = best_model.predict(X_test_scaled)

# Prepare submission
submission = pd.DataFrame({'Place_ID X Date': test_data['Place_ID X Date'], 'target': test_predictions})
submission.to_csv('submission.csv', index=False)
