In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold
import seaborn as sns

In [3]:
# Load the profiles/areas table from CSV.
profiles_areas = pd.read_csv('~/Desktop/columbia/capstone/fire-regimes/data/profiles-areas.csv')

# Feature matrix: everything except `_uid_`, `initialdat`, `finaldate` and the
# target column `area_ha`. The target is modelled on a natural-log scale.
features_all = profiles_areas.drop(columns=['_uid_','initialdat','finaldate','area_ha'])
log_area_all = np.log(profiles_areas['area_ha'])

# First pass: standardize every row, only to locate extreme values.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features_all)

# Keep a row only if none of its standardized features exceeds 10 in absolute value.
inlier_rows = ~(np.abs(scaled_data) > 10).any(axis=1)

# Apply the same mask to features and target so they stay row-aligned.
# (Previously the filtered frame was overwritten on the next line, so the
# outlier rows were never actually removed.)
X = features_all[inlier_rows]
area = log_area_all[inlier_rows]

# Second pass: re-fit the scaler on the retained rows so the mean/variance
# used for standardization are not influenced by the removed outliers.
X_scaled = scaler.fit_transform(X)

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Binary label for classification: 1 when `area` is strictly above its
# 75th percentile, 0 otherwise.
area_binary = area.gt(area.quantile(0.75)).astype(int)

# Hold out 20% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    area_binary,
    test_size=0.2,
    random_state=1,
)

# Train the classifier (iteration cap raised to 1000) and score the held-out rows.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Evaluation metrics on the test split.
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report: one metric per line, followed by the confusion matrix.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    'Confusion Matrix:',
    sep='\n',
)
print(conf_matrix)

Simple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import f_regression

# Fit an ordinary least-squares model on the training split.
# NOTE(review): y_train / y_test come from the split of `area_binary`, so this
# regresses a 0/1 label rather than the continuous log-area -- confirm intended.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_reg = lin_reg.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred_reg)
r2 = r2_score(y_test, y_pred_reg)

# Print the performance metrics
print(f'Mean Squared Error: {mse:.4f}')
print(f'R^2 Score: {r2:.4f}')

# f_regression returns a tuple (F-statistics, p-values) holding one value per
# feature. A tuple has no .mean(), so unpack it first and average each array.
# These are averages of univariate per-feature tests, not the F-test of the
# fitted multivariate model.
f_stats, p_values = f_regression(X_test, y_test)
f_stat_overall = f_stats.mean()
p_value_overall = p_values.mean()

print(f'Overall F-statistic: {f_stat_overall}')
print(f'Overall P-value: {p_value_overall}')


Logistic Regression on a PCA-Reduced Feature Space

In [None]:
# Project the standardized features onto their first 15 principal components.
pca = decomposition.PCA(n_components=15)
X_pca = pca.fit_transform(X_scaled)

# Split the projected data. The *_pca train/test variables used below were
# never defined, so the cell raised NameError. The split settings match the
# full-feature logistic regression (test_size=0.2, random_state=1) so the two
# models are evaluated under the same hold-out scheme.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, area_binary, test_size=0.2, random_state=1
)

# Fit the classifier on the reduced feature space.
log_reg_pca = LogisticRegression(max_iter=1000)
log_reg_pca.fit(X_train_pca, y_train_pca)

# Predict on the held-out rows.
y_pred_pca = log_reg_pca.predict(X_test_pca)

# Same metric set as the full-feature model, for side-by-side comparison.
accuracy_pca = accuracy_score(y_test_pca, y_pred_pca)
precision_pca = precision_score(y_test_pca, y_pred_pca)
recall_pca = recall_score(y_test_pca, y_pred_pca)
f1_pca = f1_score(y_test_pca, y_pred_pca)
conf_matrix_pca = confusion_matrix(y_test_pca, y_pred_pca)

print(f'Accuracy (PCA): {accuracy_pca:.4f}')
print(f'Precision (PCA): {precision_pca:.4f}')
print(f'Recall (PCA): {recall_pca:.4f}')
print(f'F1 Score (PCA): {f1_pca:.4f}')
print('Confusion Matrix (PCA):')
print(conf_matrix_pca)

In [None]:
# Fitted weights of the linear model, one per feature column of X.
coefficients = lin_reg.coef_

# Pair each weight with its feature name, rank by magnitude, and keep the
# 20 largest (by absolute value) for plotting.
coeff_df = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})
    .assign(abs_coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='abs_coefficient', ascending=False)
    .head(20)
)

# Horizontal bar chart of the signed coefficients, largest magnitude first.
plt.figure(figsize=(10, 8))
coef_ax = sns.barplot(data=coeff_df, y='Feature', x='Coefficient')
coef_ax.set_title('Coefficients from Linear Regression')
plt.show()