In [475]:
import pandas as pd
import numpy as np

In [476]:
# Hyperparameters shared by the cells below.
LEARNING_RATE = 0.05       # base step size read by gradient_descent
N_ITERATIONS = 10_000_000  # number of passes of the gradient_descent loop
NUM_FEATURES = 2           # how many top-correlated elements are kept as features

In [477]:
def load_and_preprocess_data(filepath, is_testing=False):
    """Read a train or test CSV and return a cleaned DataFrame.

    Parameters
    ----------
    filepath : str or file-like
        Passed straight to ``pd.read_csv``.
    is_testing : bool, default False
        When True the file is read without a header row and its 11 columns
        are named ``Date``, ``ItemName`` and ``'0'`` .. ``'8'``.
        When False the file's own header is used and the ``Location``
        column is dropped.

    Returns
    -------
    pd.DataFrame
        Every column other than ``Location``/``Date``/``ItemName`` is numeric;
        any cell that cannot be parsed as a number (e.g. ``'#'``, ``'*'``,
        ``'x'``, ``'A'``) is replaced by 0. ``ItemName`` has surrounding
        whitespace stripped.

    Raises
    ------
    KeyError
        If ``is_testing`` is False and the file has no ``Location`` column.
    """
    if is_testing:
        data = pd.read_csv(filepath, header=None)
        data.columns = ['Date', 'ItemName', '0', '1', '2', '3', '4', '5', '6', '7', '8']
    else:
        data = pd.read_csv(filepath).drop(columns=['Location'])

    # Only the measurement columns are cleaned. Identifier columns are left
    # alone so that a Date/ItemName that happens to equal a sentinel such as
    # 'A' or 'x' is not turned into NaN.
    id_columns = ('Location', 'Date', 'ItemName')
    numeric_columns = [col for col in data.columns if col not in id_columns]

    # errors='coerce' maps every non-numeric marker to NaN in one step, so no
    # separate sentinel replacement is needed. Missing values are then filled
    # with 0 (not the column median).
    data[numeric_columns] = (
        data[numeric_columns]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
    )

    # Item names are compared by exact string later, so drop stray whitespace.
    data['ItemName'] = data['ItemName'].str.strip()

    return data

# Load and clean both splits, then preview the first rows of each.
data = load_and_preprocess_data(filepath='train.csv')
test_data = load_and_preprocess_data(filepath='test.csv', is_testing=True)
print(data.head(), test_data.head(), sep='\n')


        Date  ItemName      0      1      2      3      4      5      6  \
0  1/1 00:00  AMB_TEMP  11.10  11.20  11.40  11.50  11.60  11.70  11.90   
1  1/1 00:00       CH4   2.01   1.99   2.00   2.02   2.03   2.02   2.02   
2  1/1 00:00        CO   0.31   0.28   0.28   0.33   0.32   0.26   0.25   
3  1/1 00:00      NMHC   0.10   0.10   0.08   0.09   0.10   0.07   0.07   
4  1/1 00:00        NO   1.50   1.40   1.40   1.50   1.40   1.30   1.40   

       7  ...     14     15     16     17     18     19     20     21     22  \
0  12.10  ...  16.60  16.30  15.60  14.80  14.40  14.50  14.70  14.70  14.60   
1   2.01  ...   1.98   1.97   1.97   2.00   2.02   2.01   2.01   2.00   1.98   
2   0.27  ...   0.31   0.29   0.29   0.33   0.34   0.34   0.34   0.29   0.24   
3   0.08  ...   0.06   0.07   0.08   0.12   0.13   0.10   0.10   0.09   0.05   
4   1.90  ...   3.50   2.60   2.30   2.00   1.80   1.80   1.80   1.70   1.50   

      23  
0  14.40  
1   1.98  
2   0.21  
3   0.06  
4   1.40  

[

In [478]:

def extract_features_targets(dataframe, is_train=True):
    """Turn a long-format frame into per-date feature rows and targets.

    The frame is expected to have ``Date`` and ``ItemName`` as its first two
    columns, followed by hourly value columns.

    For every distinct ``Date`` (in order of first appearance) the rows are
    sorted by ``ItemName`` and the first nine hourly columns (positions 2-10,
    i.e. hours 0-8) of all rows are flattened into one feature vector.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame laid out as described above.
    is_train : bool, default True
        When True, the value at column position 11 (the tenth hourly column,
        hour 9) of the ``'PM2.5'`` row is collected as the target for each
        date; ``np.nan`` is used when a date has no ``'PM2.5'`` row.
        When False no targets are collected.

    Returns
    -------
    (np.ndarray, np.ndarray)
        ``(features, targets)``. ``targets`` is an empty array when
        ``is_train`` is False.
    """
    first_hour_pos = 2                        # columns 0/1 are Date and ItemName
    n_feature_hours = 9                       # hours 0-8 are the model inputs
    target_pos = first_hour_pos + n_feature_hours  # position 11 -> hour 9

    features_list = []
    target_list = []

    # One pass over the frame; sort=False keeps dates in first-appearance
    # order, matching iteration over dataframe['Date'].unique().
    # Note: groupby skips rows whose Date is missing.
    for _, daily_data in dataframe.groupby('Date', sort=False):
        # Sort by item name so every feature vector has the same element order.
        daily_data = daily_data.sort_values('ItemName')

        features_list.append(
            daily_data.iloc[:, first_hour_pos:target_pos].values.flatten()
        )

        if not is_train:
            continue

        pm25_rows = daily_data[daily_data['ItemName'] == 'PM2.5']
        target_list.append(
            pm25_rows.iloc[0, target_pos] if not pm25_rows.empty else np.nan
        )

    return np.array(features_list), np.array(target_list)


# Build the per-date feature matrices; only the training split yields targets.
features, targets = extract_features_targets(dataframe=data, is_train=True)
test_features, _ = extract_features_targets(dataframe=test_data, is_train=False)

# Sanity-check the resulting dimensions.
print('Features shape:', np.shape(features))
print('Targets shape:', np.shape(targets))
print('Test Features shape:', np.shape(test_features))

Features shape: (240, 162)
Targets shape: (240,)
Test Features shape: (244, 162)


In [479]:


# Make fresh float copies of the extracted arrays for the numeric work below.
X = np.array(features, dtype=float)
y = np.array(targets, dtype=float)
X_test = np.array(test_features, dtype=float)

print(X.shape, y.shape)
print(X_test.shape)

(240, 162) (240,)
(244, 162)


In [480]:
# Each sample row is a flattened (element, hour) grid with 9 hourly values per
# element, so the number of elements follows from the row width.
hours_per_element = 9
n_elements = X.shape[1] // hours_per_element

X_data_len = X.shape[0]
X_test_data_len = X_test.shape[0]

# Reshape to (samples, elements, hours) so each element's hours can be sliced.
data_reshaped = X.reshape(X_data_len, n_elements, hours_per_element)
test_data_reshaped = X_test.reshape(X_test_data_len, n_elements, hours_per_element)

# Per-element mean over the hours of every sample, and its Pearson
# correlation with the target y.
element_means = np.zeros((n_elements, X_data_len))
correlations = np.zeros(n_elements)

for i in range(n_elements):
    element_means[i] = data_reshaped[:, i, :].mean(axis=1)
    correlations[i] = np.corrcoef(element_means[i], y)[0, 1]

# Keep the NUM_FEATURES elements with the largest absolute correlation.
# No element is excluded, so the target's own element can be selected.
top_elements_indices = np.argsort(-np.abs(correlations))[:NUM_FEATURES]

# Pull the selected elements out of both splits ...
selected_features_train = data_reshaped[:, top_elements_indices, :]
selected_features_test = test_data_reshaped[:, top_elements_indices, :]

# ... and flatten back to (samples, selected_elements * hours).
selected_features_train = selected_features_train.reshape(
    X_data_len, len(top_elements_indices) * hours_per_element
)
selected_features_test = selected_features_test.reshape(
    X_test_data_len, len(top_elements_indices) * hours_per_element
)

print("selected_feature_index:", top_elements_indices)
print("Selected features shape (training):", selected_features_train.shape)
print("Selected features shape (testing):", selected_features_test.shape)


selected_feature_index: [9 8]
Selected features shape (training): (240, 18)
Selected features shape (testing): (244, 18)


In [481]:
def mse_error(error):
    """Return the mean of the squared entries of ``error``."""
    squared = error ** 2
    return np.mean(squared)

def normalization(x_data):
    """Min-max scale every column of ``x_data``.

    Returns ``(maxi, mini, scaled)`` where ``maxi``/``mini`` are the
    per-column extrema (taken along axis 0) and ``scaled`` is
    ``(x_data - mini) / (maxi - mini + 1e-10)``. The small constant keeps the
    division defined for columns whose max equals their min.
    """
    col_max = np.max(x_data, axis=0)
    col_min = np.min(x_data, axis=0)
    span = col_max - col_min + 1e-10
    scaled = (x_data - col_min) / span
    return col_max, col_min, scaled

def gradient_descent(x_data, y_data, length_of_features,
                     learning_rate=None, n_iterations=None,
                     lambda_value=0, log_every=10000):
    """Fit ``y ~ b + x . w`` by full-batch gradient descent with Adagrad steps.

    Parameters
    ----------
    x_data : np.ndarray
        Feature matrix, one row per sample and ``length_of_features`` columns.
    y_data : np.ndarray
        Target vector, one entry per sample.
    length_of_features : int
        Number of weights to fit.
    learning_rate : float, optional
        Base step size. Defaults to the module-level ``LEARNING_RATE``.
    n_iterations : int, optional
        Number of update steps. Defaults to the module-level ``N_ITERATIONS``.
    lambda_value : float, default 0
        L2 penalty coefficient applied to ``w``.
    log_every : int, default 10000
        Print the RMSE every this many steps; a falsy value disables logging.

    Returns
    -------
    (float, np.ndarray)
        The fitted bias ``b`` and weight vector ``w``.
    """
    lr = LEARNING_RATE if learning_rate is None else learning_rate
    epochs = N_ITERATIONS if n_iterations is None else n_iterations

    b = 0.0
    w = np.ones(length_of_features)
    # Running sums of squared gradients (the Adagrad accumulators).
    b_lr = 0.0
    w_lr = np.zeros(length_of_features)

    for e in range(epochs):
        # Residual of the current model: y - (b + x . w)
        error = y_data - b - np.dot(x_data, w)

        # Gradients of the summed squared error (plus the L2 term for w).
        b_grad = -2 * np.sum(error) * 1
        w_grad = -2 * np.dot(error, x_data) + 2 * lambda_value * w

        b_lr = b_lr + np.square(b_grad)
        w_lr = w_lr + np.square(w_grad)

        # A zero accumulator means every gradient so far was exactly zero
        # (e.g. an all-zero feature column). Dividing by its square root
        # would give 0/0 = NaN and poison all predictions, so such
        # parameters are simply left unchanged.
        if b_lr > 0:
            b = b - lr / np.sqrt(b_lr) * b_grad
        w_step_scale = np.divide(
            lr, np.sqrt(w_lr), out=np.zeros_like(w_lr), where=w_lr > 0
        )
        w = w - w_step_scale * w_grad

        # The loss is only needed for the progress message.
        if log_every and (e + 1) % log_every == 0:
            loss = mse_error(error) + lambda_value * np.sum(np.square(w))
            print(f'epoch {e + 1}: RMSE Loss {np.sqrt(loss)}')
    return b, w

n_features = selected_features_train.shape[1]
# Keep the raw matrix untouched and store the scaled copy under a new name so
# this cell can be re-run without re-normalising already-normalised data
# (which would also overwrite the maxi/mini needed to scale the test set).
maxi, mini, train_features_norm = normalization(selected_features_train)
bias, weights = gradient_descent(train_features_norm, y, n_features)

epoch 10000: RMSE Loss 5.838056410981512
epoch 20000: RMSE Loss 5.627466773016153
epoch 30000: RMSE Loss 5.4836657224722325
epoch 40000: RMSE Loss 5.372911705766237
epoch 50000: RMSE Loss 5.283255668466009
epoch 60000: RMSE Loss 5.208874315515187
epoch 70000: RMSE Loss 5.14615104434245
epoch 80000: RMSE Loss 5.092545556837525
epoch 90000: RMSE Loss 5.04617755065554
epoch 100000: RMSE Loss 5.005621985279591
epoch 110000: RMSE Loss 4.969784452028697
epoch 120000: RMSE Loss 4.937816427497141
epoch 130000: RMSE Loss 4.909054679208567
epoch 140000: RMSE Loss 4.882976874755829
epoch 150000: RMSE Loss 4.859168602273021
epoch 160000: RMSE Loss 4.837298630477809
epoch 170000: RMSE Loss 4.8171002092452975
epoch 180000: RMSE Loss 4.798356846644507
epoch 190000: RMSE Loss 4.7808914314898665
epoch 200000: RMSE Loss 4.764557873655735
epoch 210000: RMSE Loss 4.749234650447156
epoch 220000: RMSE Loss 4.7348198033649656
epoch 230000: RMSE Loss 4.721227043553988
epoch 240000: RMSE Loss 4.708382708197244

In [482]:
# Predict on the test set.
# Scale with the training-set extrema (maxi/mini). The result goes into a new
# name so that re-running this cell does not scale the test matrix twice.
test_features_norm = (selected_features_test - mini) / (maxi - mini + 1e-10)
test_predictions = test_features_norm.dot(weights) + bias

print(test_predictions.shape)

(244,)


In [483]:
import csv

header = ['index', 'answer']
filename = 'output.csv'

# Write the header and every prediction through the same csv writer so that
# all lines share one delimiter and one line terminator. newline='' stops the
# text layer from translating the writer's line endings.
with open(filename, 'w', newline='') as file:
    csvwriter = csv.writer(file, lineterminator='\n')
    csvwriter.writerow(header)
    csvwriter.writerows(
        (f"index_{idx}", str(prediction))
        for idx, prediction in enumerate(test_predictions)
    )