In [2]:
import pandas as pd

# Load the datasets
train = pd.read_csv('../kaggle_data/train.csv')
weather = pd.read_csv('../kaggle_data/weather.csv')


In [3]:
#Convert date column of both datasets to datetime
train['Date'] = pd.to_datetime(train['Date'])
weather['Date'] = pd.to_datetime(weather['Date'])

#Print the datatypes of the columns
print(train.dtypes)

Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
dtype: object


### Step 2: Feature Engineering

In [8]:
# Merge the datasets on the 'Date' column
combined_data = pd.merge(train, weather, on='Date')


Create Time-based Features: From the date column, you can create additional features like month, week, day of the week, etc.

In [9]:
# Create time-based features
combined_data['Month'] = combined_data['Date'].dt.month
combined_data['Week'] = combined_data['Date'].dt.isocalendar().week
combined_data['DayOfWeek'] = combined_data['Date'].dt.dayofweek


### Combining distance to spraying locations as well

Distance to Spraying Locations: This step requires the spray data. Load the spray data and create a feature representing the distance from each trap to the nearest spraying location.

In [11]:
# Load the spray data
from geopy.distance import geodesic
import numpy as np

def calculate_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).miles

# Calculate the distance from each trap to the nearest spraying location
min_distances = []
total_rows = len(combined_data)

for index, row in combined_data.iterrows():
    distances = [calculate_distance(row['Latitude'], row['Longitude'], lat, lon) for lat, lon in zip(spray['Latitude'], spray['Longitude'])]
    min_distances.append(min(distances))
    
    # Print progress
    progress = (index + 1) / total_rows * 100
    print(f"Progress: {progress:.2f}%")

combined_data['DistanceToSpray'] = min_distances


Progress: 0.00%
Progress: 0.01%
Progress: 0.01%
Progress: 0.02%
Progress: 0.02%
Progress: 0.03%
Progress: 0.03%
Progress: 0.04%
Progress: 0.04%
Progress: 0.05%
Progress: 0.05%
Progress: 0.06%
Progress: 0.06%
Progress: 0.07%
Progress: 0.07%
Progress: 0.08%
Progress: 0.08%
Progress: 0.09%
Progress: 0.09%
Progress: 0.10%
Progress: 0.10%
Progress: 0.10%
Progress: 0.11%
Progress: 0.11%
Progress: 0.12%
Progress: 0.12%
Progress: 0.13%
Progress: 0.13%
Progress: 0.14%
Progress: 0.14%
Progress: 0.15%
Progress: 0.15%
Progress: 0.16%
Progress: 0.16%
Progress: 0.17%
Progress: 0.17%
Progress: 0.18%
Progress: 0.18%
Progress: 0.19%
Progress: 0.19%
Progress: 0.20%
Progress: 0.20%
Progress: 0.20%
Progress: 0.21%
Progress: 0.21%
Progress: 0.22%
Progress: 0.22%
Progress: 0.23%
Progress: 0.23%
Progress: 0.24%
Progress: 0.24%
Progress: 0.25%
Progress: 0.25%
Progress: 0.26%
Progress: 0.26%
Progress: 0.27%
Progress: 0.27%
Progress: 0.28%
Progress: 0.28%
Progress: 0.29%
Progress: 0.29%
Progress: 0.30%
Progress

Weather Conditions: Create features that represent the weather conditions on the days leading up to the date when mosquitoes were caught.

In [None]:
# Example: Creating a feature for the average temperature 3 days before the current date
combined_data['Tavg_3days_before'] = combined_data.groupby('Station')['Tavg'].transform(lambda x: x.rolling(3, min_periods=1).mean().shift())


In [None]:
# Export the combined dataset
combined_data.to_csv('../kaggle_data/combined_data.csv', index=False)

## EDA

# Initial Data Understanding

## Train Data

- The train data has 10,506 rows and 12 columns.
- Columns and their data types:
    - Date (object): The date that the WNV test is performed.
    - Address (object): Approximate address of the location of the trap.
    - Species (object): The species of mosquitoes.
    - Block (int64): Block number of the address.
    - Street (object): Street name.
    - Trap (object): ID of the trap.
    - AddressNumberAndStreet (object): Approximate address returned from GeoCoder.
    - Latitude (float64): Latitude returned from GeoCoder.
    - Longitude (float64): Longitude returned from GeoCoder.
    - AddressAccuracy (int64): Accuracy returned from GeoCoder.
    - NumMosquitos (int64): Number of mosquitoes caught in this trap.
    - WnvPresent (int64): Whether West Nile Virus was present in these mosquitoes. 1 means WNV is present, and 0 means not present.
- There are no missing values in the train data.
- The 'NumMosquitos' column has a mean of approximately 12.85, with a standard deviation of approximately 16.13.
- The 'WnvPresent' column is binary, with values of 0 or 1. The mean is approximately 0.052, indicating that a small percentage of the observations have West Nile Virus present.

## Weather Data

- The weather data has 2,944 rows and 22 columns.
- Columns and their data types:
    - Station (int64): The weather station ID (1 or 2).
    - Date (object): The date of the weather record.
    - Tmax (int64): Maximum temperature.
    - Tmin (int64): Minimum temperature.
    - Tavg (object): Average temperature.
    - Depart (object): Departure from normal.
    - DewPoint (int64): Average dew point.
    - WetBulb (object): Average wet bulb.
    - Heat (object): Heating (Season begins with July).
    - Cool (object): Cooling (Season begins with January).
    - Sunrise (object): Sunrise time.
    - Sunset (object): Sunset time.
    - CodeSum (object): Weather phenomena codes.
    - Depth (object): Snow/ice depth.
    - Water1 (object): Water equivalent.
    - SnowFall (object): Snowfall amount.
    - PrecipTotal (object): Precipitation total.
    - StnPressure (object): Average station pressure.
    - SeaLevel (object): Average sea level pressure.
    - ResultSpeed (float64): Resultant wind speed.
    - ResultDir (int64): Resultant wind direction.
    - AvgSpeed (object): Average wind speed.
- There are no missing values in the weather data.
- The 'Tmax' column has a mean of approximately 76.17, with a standard deviation of approximately 11.46.
- The 'Tmin' column has a mean of approximately 57.81, with a standard deviation of approximately 10.38.



In [None]:
# Function to add counts on the bars
def add_counts_on_bars(ax):
    for p in ax.patches:
        ax.annotate(f'{int(p.get_width())}', (p.get_x() + p.get_width(), p.get_y() + p.get_height()/2),
                    ha='left', va='center')

Analyze the Effect of Weather: Use visualization libraries like matplotlib or seaborn to analyze how different weather conditions affect the number of mosquitoes and the presence of WNV.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Example: Plotting the number of mosquitoes by month
plt.figure(figsize=(10, 6))
ax1 = sns.boxplot(data=combined_data, x='Month', y='NumMosquitos')
plt.title('Number of Mosquitoes by Month')
plt.xlabel('Month')
plt.ylabel('Number of Mosquitoes')
add_counts_on_bars(ax1)
plt.show()


Correlation Analysis: Check if there is any correlation between weather features and the number of mosquitoes or the presence of WNV.

In [None]:
# Correlation matrix
correlation_matrix = combined_data.corr()

# Plotting the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


## Modelling Preprocessing


In [None]:
# One-hot encoding for categorical variables
combined_data = pd.get_dummies(combined_data, drop_first=True)


In [None]:
from sklearn.model_selection import train_test_split

# Define the features and the target variable
X = combined_data.drop(['WnvPresent', 'Date'], axis=1)
y = combined_data['WnvPresent']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 5 Model Building

Train the Model: Choose an appropriate algorithm for classification (e.g., Random Forest, XGBoost, etc.) and train the model.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)


Evaluate the Model: Evaluate the model's performance using appropriate metrics (e.g., accuracy, F1-score, etc.).

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Tuning: If necessary, tune the model's hyperparameters to improve performance.

Feature Importance: Analyze which features are most important in predicting the presence of WNV.

In [None]:
# Feature importance
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': model.feature_importances_})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

# Plotting feature importance
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


## Gridsearch and XGBoost

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best accuracy score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
