In [1]:
import pandas as pd
import numpy as np

# Define file paths (uploaded files)
file_paths = [
    'Covid-19-R.csv',
    'Covid-19-aggregated.csv'
]

# Define the processing function
def load_and_process(file_path):
    """
    Load data, clean missing values, select relevant columns, and filter by the date range (May 2020 to May 2022).
    """
    # Load the data
    data = pd.read_csv(file_path)
    
    # Print first few rows to inspect the data
    print(f"Loaded file: {file_path}")
    print(data.head())
    
    # Check if expected columns are missing in 'Covid-19-R.csv'
    if 'cases' not in data.columns and 'Number of cases' in data.columns:
        data.rename(columns={'Number of cases': 'cases'}, inplace=True)
    
    # If 'Covid-19-R.csv' has the 'R' column, treat it as 'cases'
    if 'R' in data.columns:
        data.rename(columns={'R': 'cases'}, inplace=True)
    
    # Convert "Time Stamp" column to datetime format
    data['Time Stamp'] = pd.to_datetime(data['Time Stamp'], errors='coerce')
    
    # Filter data between May 2020 and May 2022
    start_date = '2020-05-01'
    end_date = '2020-08-31'
    filtered_data = data[(data['Time Stamp'] >= start_date) & (data['Time Stamp'] <= end_date)]
    
    # Fill missing values with forward fill (ffill)
    filtered_data = filtered_data.ffill()  # Use ffill directly to avoid warning
    
    # Select relevant columns for each dataset
    if 'cases' in filtered_data.columns:
        filtered_data = filtered_data[['Time Stamp', 'cases']]  # If only cases column exists
    else:
        print(f"Warning: Expected columns not found in {file_path}")
    
    # Feature Engineering: Extract time-related features
    filtered_data['year'] = filtered_data['Time Stamp'].dt.year
    filtered_data['month'] = filtered_data['Time Stamp'].dt.month
    filtered_data['day'] = filtered_data['Time Stamp'].dt.day
    filtered_data['week_day'] = filtered_data['Time Stamp'].dt.weekday  # 0=Monday, 6=Sunday
    
    # If applicable, create infection rate or other features based on 'cases'
    # Example: infection rate (cases per day) or some other metric depending on data availability
    filtered_data['infection_rate'] = filtered_data['cases'] 
    
    return filtered_data

# Process the data files
processed_data = []
for file_path in file_paths:
    data = load_and_process(file_path)
    processed_data.append(data)

# Combine all processed data
combined_data = pd.concat(processed_data, ignore_index=True)

# Display the combined data
print("Combined processed data:")
print(combined_data.head())




Loaded file: Covid-19-R.csv
   Unnamed: 0  Time Stamp         Region   Latitude   Longitude          R
0           0  03-16-2020       Alhambra  34.093042 -118.127060   4.750016
1           1  03-16-2020        Arcadia  34.136208 -118.040150   0.000000
2           2  03-16-2020  Beverly Hills  34.069650 -118.396306   0.000000
3           3  03-16-2020  Boyle Heights  34.043689 -118.209768   1.000008
4           4  03-16-2020         Carson  33.832204 -118.251755  16.000019
Loaded file: Covid-19-aggregated.csv
   Time Stamp         Region   Latitude   Longitude  Number of cases
0  03-16-2020       Alhambra  34.093042 -118.127060                2
1  03-16-2020        Arcadia  34.136208 -118.040150                1
2  03-16-2020  Beverly Hills  34.069650 -118.396306                1
3  03-16-2020  Boyle Heights  34.043689 -118.209768                5
4  03-16-2020         Carson  33.832204 -118.251755                1
Combined processed data:
  Time Stamp     cases  year  month  day  week

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from xgboost import XGBRegressor

# Assuming the data is already loaded and preprocessed into `combined_data`
# For the sake of this example, we'll use 'cases' and 'year', 'month', 'day', 'week_day' as features

# Prepare the features and target variable
X = combined_data[['year', 'month', 'day', 'week_day']]  # Use time-based features
y = combined_data['cases']  # Target: the number of cases

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 1. ARIMA Model (for time series forecasting)
def arima_model(train, test):
    # ARIMA model
    model = ARIMA(train, order=(5, 1, 0))  # Adjust p, d, q as needed
    model_fit = model.fit()
    y_pred_arima = model_fit.forecast(len(test))
    return y_pred_arima

# 2. LSTM Model (Deep Learning)
def lstm_model(X_train, X_test, y_train, y_test):
    # Reshape data for LSTM
    X_train_lstm = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test_lstm = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)
    
    # Define LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=False, input_shape=(X_train_lstm.shape[1], 1)))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train the model
    model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=1)
    
    # Make predictions
    y_pred_lstm = model.predict(X_test_lstm)
    return y_pred_lstm.flatten()

# 3. Random Forest Regressor
def random_forest_model(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred_rf = model.predict(X_test)
    return y_pred_rf

# 4. XGBoost Regressor
def xgboost_model(X_train, X_test, y_train, y_test):
    model = XGBRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred_xgb = model.predict(X_test)
    return y_pred_xgb

# Evaluate Models
def evaluate_model(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return mae, rmse

# Run ARIMA model
y_pred_arima = arima_model(y_train, y_test)
mae_arima, rmse_arima = evaluate_model(y_test, y_pred_arima)

# Run LSTM model
y_pred_lstm = lstm_model(X_train, X_test, y_train, y_test)
mae_lstm, rmse_lstm = evaluate_model(y_test, y_pred_lstm)

# Run Random Forest model
y_pred_rf = random_forest_model(X_train, X_test, y_train, y_test)
mae_rf, rmse_rf = evaluate_model(y_test, y_pred_rf)

# Run XGBoost model
y_pred_xgb = xgboost_model(X_train, X_test, y_train, y_test)
mae_xgb, rmse_xgb = evaluate_model(y_test, y_pred_xgb)

# Print performance comparison
print("Model Performance Comparison:")
print(f"ARIMA - MAE: {mae_arima}, RMSE: {rmse_arima}")
print(f"LSTM - MAE: {mae_lstm}, RMSE: {rmse_lstm}")
print(f"Random Forest - MAE: {mae_rf}, RMSE: {rmse_rf}")
print(f"XGBoost - MAE: {mae_xgb}, RMSE: {rmse_xgb}")



Epoch 1/10


  super().__init__(**kwargs)


[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 328488.5625
Epoch 2/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 317516.0312
Epoch 3/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 300287.1875
Epoch 4/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 265999.6250
Epoch 5/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 257830.4531
Epoch 6/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 248898.3438
Epoch 7/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 251828.3750
Epoch 8/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 248698.9375
Epoch 9/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 248516.1250
Epoch 10/10
[1m854/854[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt

# Load and preprocess data (same as before)
# Assuming combined_data is already available
# Feature Engineering: Prepare the dataset for LSTM
X = combined_data[['year', 'month', 'day', 'week_day']].values  # Features based on date/time
y = combined_data['cases'].values  # Target variable (number of cases)

# Normalize the data (LSTM requires scaled data)
scaler_X = MinMaxScaler(feature_range=(0, 1))
scaler_y = MinMaxScaler(feature_range=(0, 1))

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Reshape X for LSTM (samples, timesteps, features)
X_scaled = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, shuffle=False)

# Build the LSTM model with improved architecture
model = Sequential()
model.add(LSTM(units=100, return_sequences=False, input_shape=(X_train.shape[1], 1)))  # Increased units for more complexity
model.add(Dense(units=50))  # Added a dense layer for additional complexity
model.add(Dense(units=1))  # Output layer to predict 'cases'
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model (increase epochs for better training)
history = model.fit(X_train, y_train, epochs=30, batch_size=64, verbose=1)  # Increased epochs and batch size

# Function to predict the next day and next week for a given date
def predict_for_date(input_date):
    # Convert the date to appropriate features (year, month, day, weekday)
    input_date = pd.to_datetime(input_date)
    year = input_date.year
    month = input_date.month
    day = input_date.day
    week_day = input_date.weekday()  # 0=Monday, 6=Sunday
    
    # Prepare the input for prediction
    input_features = np.array([[year, month, day, week_day]])
    input_scaled = scaler_X.transform(input_features)
    input_scaled = input_scaled.reshape(1, input_scaled.shape[1], 1)
    
    # Predict for the next day (next time step)
    next_day_pred = model.predict(input_scaled)
    next_day_pred_actual = scaler_y.inverse_transform(next_day_pred)
    print(f"Prediction for the next day: {next_day_pred_actual[0][0]} cases")

    # Predict for the next week (7 days ahead)
    next_week_pred = []
    for i in range(7):
        next_day_pred = model.predict(input_scaled)
        next_day_pred_actual = scaler_y.inverse_transform(next_day_pred)
        next_week_pred.append(next_day_pred_actual[0][0])
        
        # Update the input for the next day (rolling prediction)
        input_features = np.array([[year, month, day + 1, week_day]])  # Adjust date (simple increment)
        input_scaled = scaler_X.transform(input_features)
        input_scaled = input_scaled.reshape(1, input_scaled.shape[1], 1)
        
    print(f"Predictions for the next week (7 days): {next_week_pred}")
    
    return next_day_pred_actual[0][0], next_week_pred

# Function to compare the predictions with the actual data (extract actual values for requested dates)
def compare_predictions_with_actual(predictions, actual_dates):
    # Make sure to format the dates properly for comparison
    actual_data = combined_data[combined_data['Time Stamp'].isin(actual_dates)]
    if len(actual_data) == 0:
        print("No actual data found for these dates.")
        return
    
    # Extract actual values for comparison
    actual_values = actual_data['cases'].values
    
    # If we don't have data for all predicted days, match the nearest available dates
    if len(predictions) != len(actual_values):
        print("Mismatch in the number of predictions and actual values, finding nearest dates...")
        actual_dates = [str(date.date()) for date in actual_dates]  # Convert datetime to string for easier comparison
        predictions = [predictions[i] for i in range(len(actual_dates))]  # Ensure prediction length matches

    # Calculate MAE and RMSE for comparison
    mae = np.mean(np.abs(actual_values - np.array(predictions)))
    rmse = np.sqrt(np.mean((actual_values - np.array(predictions)) ** 2))
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    
    return mae, rmse

# Save in the new format
model.save('my_model.keras')



Epoch 1/30


  super().__init__(**kwargs)


[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - loss: 0.0062
Epoch 2/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.0057
Epoch 3/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.0056
Epoch 4/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.0058
Epoch 5/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.0058
Epoch 6/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 0.0057
Epoch 7/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 0.0053
Epoch 8/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 0.0059
Epoch 9/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - loss: 0.0059
Epoch 10/30
[1m427/427[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - los

In [10]:
from tensorflow.keras.models import load_model

# Load the pre-trained model
model = load_model('my_model.keras')  # Load the trained model

# Function to predict for a given date
def predict_for_date(input_date):
    # Convert the date to appropriate features (year, month, day, weekday)
    input_date = pd.to_datetime(input_date)
    year = input_date.year
    month = input_date.month
    day = input_date.day
    week_day = input_date.weekday()  # 0=Monday, 6=Sunday
    
    # Prepare the input for prediction
    input_features = np.array([[year, month, day, week_day]])
    input_scaled = scaler_X.transform(input_features)
    input_scaled = input_scaled.reshape(1, input_scaled.shape[1], 1)
    
    # Predict for the next day (next time step)
    next_day_pred = model.predict(input_scaled)
    next_day_pred_actual = scaler_y.inverse_transform(next_day_pred)
    print(f"Prediction for the next day: {next_day_pred_actual[0][0]} cases")

    # Predict for the next week (7 days ahead)
    next_week_pred = []
    for i in range(7):
        next_day_pred = model.predict(input_scaled)
        next_day_pred_actual = scaler_y.inverse_transform(next_day_pred)
        next_week_pred.append(next_day_pred_actual[0][0])
        
        # Update the input for the next day (rolling prediction)
        input_features = np.array([[year, month, day + 1, week_day]])  # Adjust date (simple increment)
        input_scaled = scaler_X.transform(input_features)
        input_scaled = input_scaled.reshape(1, input_scaled.shape[1], 1)
        
    print(f"Predictions for the next week (7 days): {next_week_pred}")
    
    return next_day_pred_actual[0][0], next_week_pred

# Function to compare the predictions with the actual data (extract actual values for requested dates)
def compare_predictions_with_actual(predictions, actual_dates):
    # Make sure to format the dates properly for comparison
    actual_data = combined_data[combined_data['Time Stamp'].isin(actual_dates)]
    if len(actual_data) == 0:
        print("No actual data found for these dates.")
        return
    
    # Extract actual values for comparison
    actual_values = actual_data['cases'].values
    
    # If we don't have data for all predicted days, match the nearest available dates
    if len(predictions) != len(actual_values):
        print("Mismatch in the number of predictions and actual values, finding nearest dates...")
        actual_dates = [str(date.date()) for date in actual_dates]  # Convert datetime to string for easier comparison
        predictions = [predictions[i] for i in range(len(actual_dates))]  # Ensure prediction length matches

    # Calculate MAE and RMSE for comparison
    mae = np.mean(np.abs(actual_values - np.array(predictions)))
    rmse = np.sqrt(np.mean((actual_values - np.array(predictions)) ** 2))
    print(f"Actual Values: {actual_values}")
    print(f"Predicted Values: {predictions}")
    print(f"MAE: {mae}")
    print(f"RMSE: {rmse}")
    
    return mae, rmse

# Input a specific date to get predictions for the next day and next week
input_date = "2021-05-01"  # change this date manually
predicted_next_day, predicted_next_week = predict_for_date(input_date)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
Prediction for the next day: 450.8055114746094 cases
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Predictions for the next week (7 days): [450.8055, 454.76422, 454.76422, 454.76422, 454.76422, 454.76422, 454.76422]


In [11]:
import pandas as pd

# Path to your uploaded file
file_path = 'Covid-19.csv'

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe to ensure it's loaded correctly
print(df.head())


  Time Stamp           Region   Latitude   Longitude  Number of cases
0  01-1-2021            Acton  34.480742 -118.186838              271
1  01-1-2021  Adams-Normandie  34.031788 -118.300247              766
2  01-1-2021     Agoura Hills  34.147910 -118.765704              593
3  01-1-2021         Alhambra  34.093042 -118.127060             4241
4  01-1-2021           Alsace  33.988000 -118.347620             1016


In [12]:
import pandas as pd

# Load CSV file into a pandas DataFrame
file_path = 'Covid-19.csv'
df = pd.read_csv(file_path)

# Convert 'Time Stamp' to datetime format automatically, handling mixed formats
df['Time Stamp'] = pd.to_datetime(df['Time Stamp'], errors='coerce')

# Function to extract cases for a specific date and return the total number of cases
def extract_cases_for_date(date_str):
    # Convert the input date to datetime format
    input_date = pd.to_datetime(date_str, errors='coerce')
    
    # Filter the dataset to get all rows matching the specified date
    date_data = df[df['Time Stamp'] == input_date]
    
    if len(date_data) == 0:
        print("No data available for the specified date.")
        return None
    
    # Calculate the total number of cases for the date
    total_cases = date_data['Number of cases'].sum()
    print(f"Total cases on {input_date.date()}: {total_cases}")
    
    return total_cases

# Extract total cases for a specific date
date_str = "2021-05-01"  # Change this to any date in dataset
total_cases = extract_cases_for_date(date_str)


Total cases on 2021-05-01: 1102052
