In [None]:
import pandas as pd
import numpy as np

In [None]:
def load_and_preprocess_data(filepath, is_testing=False):
    """Read a measurement CSV and return a cleaned DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.
    is_testing : bool, default False
        When True the file is read without a header row and its eleven
        columns are named ``Date``, ``ItemName`` and ``'0'`` .. ``'8'``.
        When False the file's own header row is used and the ``Location``
        column, if present, is dropped.

    Returns
    -------
    pandas.DataFrame
        The identifier columns (``Date``, ``ItemName`` with surrounding
        whitespace stripped) plus the measurement columns converted to
        numbers, with unparseable or missing cells replaced by the median
        of their column.
    """
    identifier_columns = ['Location', 'Date', 'ItemName']

    if is_testing:
        data = pd.read_csv(filepath, header=None)
        data.columns = ['Date', 'ItemName', '0', '1', '2', '3', '4', '5', '6', '7', '8']
    else:
        # errors='ignore' keeps files that lack a Location column loadable.
        data = pd.read_csv(filepath).drop(columns=['Location'], errors='ignore')

    # Every column that is not an identifier is treated as a measurement.
    numeric_columns = data.columns.difference(identifier_columns)

    # Coercion turns every non-numeric cell into NaN: bare markers such as
    # '#', '*', 'x', 'A' as well as flagged readings like '12#'. It is applied
    # to the measurement columns only, so a Date / ItemName value that happens
    # to equal one of those markers is left intact.
    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

    # Fill the gaps with the median of each column.
    # NOTE(review): a column holds rows for every ItemName, so this median
    # spans different measurement types - consider imputing per ItemName.
    data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

    # Normalise item names so later equality checks (e.g. == 'PM2.5') match.
    data['ItemName'] = data['ItemName'].str.strip()

    return data

# Run both CSV files through the shared preprocessing routine, then preview
# the first rows of each resulting frame.
data = load_and_preprocess_data('train.csv')
test_data = load_and_preprocess_data('test.csv', is_testing=True)
for preview in (data.head(), test_data.head()):
    print(preview)


In [None]:

def extract_features_targets(dataframe, is_train=True):
    """Build one flat feature vector (and optionally one target) per date.

    The frame is addressed by column position: columns 0 and 1 are expected
    to be ``Date`` and ``ItemName``, followed by the hourly value columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``Date`` column, an ``ItemName`` column and the hourly
        value columns. Training frames need at least 12 columns.
    is_train : bool, default True
        When True a target value is extracted for every date as well.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``features`` has one row per date (in order of first appearance),
        holding the values at column positions 2..10 of every item, with the
        items sorted by name. ``targets`` holds, per date, the value at
        column position 11 of the ``'PM2.5'`` row (NaN when the date has no
        such row); it is empty when ``is_train`` is False.
    """
    # Nine consecutive hourly columns form the features.
    # NOTE(review): presumably hours 0-8 given the Date/ItemName/hour layout,
    # with the target being the following hour - confirm against the CSV.
    feature_cols = slice(2, 11)
    target_col = 11

    features_list = []
    target_list = []

    # groupby(sort=False) walks the dates in order of first appearance in a
    # single pass instead of re-filtering the whole frame for every date.
    # Rows whose Date is missing belong to no group and are skipped, so they
    # can no longer contribute an empty (ragged) feature vector.
    for _date, daily_data in dataframe.groupby('Date', sort=False):
        # A stable sort by item name keeps the feature order identical across
        # dates, including when an item name is duplicated within a date.
        daily_data = daily_data.sort_values('ItemName', kind='mergesort')

        features_list.append(daily_data.iloc[:, feature_cols].values.flatten())

        if not is_train:
            continue

        # Target: the PM2.5 reading in the column right after the feature window.
        pm25_data = daily_data[daily_data['ItemName'] == 'PM2.5']
        target_list.append(pm25_data.iloc[0, target_col] if not pm25_data.empty else np.nan)

    return np.array(features_list), np.array(target_list)


# Turn the preprocessed frames into arrays: features and targets for the
# training data, features only for the test data.
features, targets = extract_features_targets(data)
test_features, _ = extract_features_targets(test_data, is_train=False)

# Report the dimensions of each array as a sanity check.
for label, array in (
    ('Features shape:', features),
    ('Targets shape:', targets),
    ('Test Features shape:', test_features),
):
    print(label, array.shape)

In [None]:


# Copy the extracted features / targets into float arrays for the regression.
X = np.array(features).astype(float)
y = np.array(targets).astype(float)
X_test = np.array(test_features).astype(float)

# Show the first entry of each array, then the overall dimensions.
for first_entry in (X[0], y[0], X_test[0]):
    print(first_entry)

print(X.shape, y.shape)
print(X_test.shape)

In [None]:
def train(x_data, y_data, length_of_features, lr=0.005, epoch=5000000,
          lambda_value=0, log_every=1000):
    """Fit ``y ~ b + x . w`` by full-batch gradient descent with Adagrad-style steps.

    Parameters
    ----------
    x_data : numpy.ndarray
        Feature matrix; its second dimension must equal ``length_of_features``.
    y_data : numpy.ndarray
        Target vector with one entry per row of ``x_data``.
    length_of_features : int
        Number of weights to learn.
    lr : float, default 0.005
        Base learning rate, divided per parameter by the root of the
        accumulated squared gradients.
    epoch : int, default 5000000
        Number of full passes over the data.
    lambda_value : float, default 0
        L2 penalty strength applied to the weights (not to the bias).
    log_every : int, default 1000
        Print the loss every ``log_every`` epochs; a falsy value disables
        logging.

    Returns
    -------
    (float, numpy.ndarray)
        The fitted bias and weight vector.
    """
    b = 0.0
    w = np.ones(length_of_features)

    # Running sums of squared gradients (one for the bias, one per weight).
    b_lr = 0.0
    w_lr = np.zeros(length_of_features)

    for e in range(epoch):
        # Residuals of the current model: y - (b + x . w)
        error = y_data - b - np.dot(x_data, w)

        # Gradients of the summed squared error (plus the L2 term for w).
        b_grad = -2 * np.sum(error)
        w_grad = -2 * np.dot(error, x_data) + 2 * lambda_value * w

        b_lr = b_lr + np.square(b_grad)
        w_lr = w_lr + np.square(w_grad)

        # A parameter whose accumulated squared gradient is still zero has
        # only ever seen zero gradients (e.g. an all-zero feature column or a
        # perfect initial fit). Dividing by sqrt(0) there would yield
        # inf * 0 = NaN and poison every later prediction, so such parameters
        # are simply left unchanged.
        if b_lr > 0:
            b = b - lr / np.sqrt(b_lr) * b_grad
        w_scale = np.divide(lr, np.sqrt(w_lr), out=np.zeros_like(w_lr), where=w_lr > 0)
        w = w - w_scale * w_grad

        # The loss is only needed for logging, so compute it on demand. It
        # uses the residuals from before this epoch's update.
        if log_every and (e + 1) % log_every == 0:
            loss = np.mean(np.square(error)) + lambda_value * np.sum(np.square(w))
            print(f'epoch {e + 1}: Loss {np.sqrt(loss)}')
    return b, w

# Fit the linear model on the training arrays; one weight per column of X.
n_features = X.shape[1]
bias, weights = train(x_data=X, y_data=y, length_of_features=n_features)

In [None]:
# Apply the fitted linear model (weights plus bias) to every test row.
test_predictions = np.dot(X_test, weights) + bias

print(test_predictions.shape)

In [None]:
import csv

header = ['index', 'answer']
filename = 'output.csv'

# newline='' lets the csv module control line endings itself, which avoids
# doubled carriage returns / blank rows on Windows.
with open(filename, 'w', newline='') as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(header)
    # One row per test sample: "index_<n>,<prediction>". Every row goes through
    # the writer so the delimiter and line endings match the header row.
    csvwriter.writerows(
        (f'index_{idx}', str(prediction))
        for idx, prediction in enumerate(test_predictions)
    )