In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame

file_path = '../resources/clustered_df.csv'  
clustered_df = pd.read_csv(file_path)

# Review the DataFrame

clustered_df = clustered_df.drop(columns=['Unnamed: 0', 'glac_id', 'line_type', 'rec_status', 'proc_desc', 'geog_area'])

clustered_df['gone_date'] = clustered_df['gone_date'].replace({'e': '1'})

clustered_df['glac_stat'] = clustered_df['glac_stat'].replace({'exists': 1, 'gone': 0})

clustered_df['glac_name_code'] = pd.Categorical(clustered_df['glac_name']).codes

clustered_df['glac_name_code'] = clustered_df['glac_name_code'].astype(float)

clustered_df_2 = clustered_df.copy()

clustered_df


  clustered_df['glac_stat'] = clustered_df['glac_stat'].replace({'exists': 1, 'gone': 0})


Unnamed: 0,anlys_time,src_date,db_area,min_elev,mean_elev,max_elev,primeclass,release_dt,gone_date,glac_name,glac_stat,conn_lvl,surge_type,term_type,gtng_o1reg,gtng_o2reg,rgi_gl_typ,cluster,glac_name_code
0,1262304000,1060387200,0.000000,0,0,0,0,1299393082,1,1,1,0,0,9,1,2,0,59,0.0
1,1263513600,1184544000,0.177109,0,0,0,0,1269471019,1,1,1,0,0,9,1,4,0,25,0.0
2,1262304000,1060387200,0.000000,0,0,0,0,1299393082,1,10,1,0,0,9,1,2,0,59,1.0
3,1302825600,-473385600,0.017771,0,0,0,0,1305230445,1,10,1,0,0,9,1,2,0,95,1.0
4,1262304000,1060387200,0.299353,0,0,0,0,1299393082,1,100,1,0,0,9,1,2,0,47,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,1262304000,1060387200,379.897000,0,0,0,0,1299393082,1,Yentna Glacier,1,0,0,9,1,2,0,12,897.0
2388,1302825600,-536457600,392.112000,0,0,0,0,1305230445,1,Yentna Glacier,1,0,0,9,1,2,0,12,897.0
2389,1437004800,1284249600,387.383000,263,0,3393,0,1437004800,1,Yentna Glacier,1,0,3,9,1,2,39,12,897.0
2390,1154390400,-420076800,3.116210,1803,0,3142,0,1456477200,1,Zigzag Glacier,1,0,0,9,2,4,0,90,898.0


### Step 2: Create the labels set (`y`)  from the “glac_stat” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Specify the label column name
y_df = 'glac_stat'  # Replace with the actual label column name

# Separate the y variable, the labels
y = clustered_df_2[y_df]

# Separate the X variable, the features
x = clustered_df_2.drop(columns=[y_df, 'glac_name'])


In [4]:
# Review the y variable Series
y

0       1
1       1
2       1
3       1
4       1
       ..
2387    1
2388    1
2389    1
2390    1
2391    1
Name: glac_stat, Length: 2392, dtype: int64

In [5]:
# Review the X variable DataFrame
x

Unnamed: 0,anlys_time,src_date,db_area,min_elev,mean_elev,max_elev,primeclass,release_dt,gone_date,conn_lvl,surge_type,term_type,gtng_o1reg,gtng_o2reg,rgi_gl_typ,cluster,glac_name_code
0,1262304000,1060387200,0.000000,0,0,0,0,1299393082,1,0,0,9,1,2,0,59,0.0
1,1263513600,1184544000,0.177109,0,0,0,0,1269471019,1,0,0,9,1,4,0,25,0.0
2,1262304000,1060387200,0.000000,0,0,0,0,1299393082,1,0,0,9,1,2,0,59,1.0
3,1302825600,-473385600,0.017771,0,0,0,0,1305230445,1,0,0,9,1,2,0,95,1.0
4,1262304000,1060387200,0.299353,0,0,0,0,1299393082,1,0,0,9,1,2,0,47,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,1262304000,1060387200,379.897000,0,0,0,0,1299393082,1,0,0,9,1,2,0,12,897.0
2388,1302825600,-536457600,392.112000,0,0,0,0,1305230445,1,0,0,9,1,2,0,12,897.0
2389,1437004800,1284249600,387.383000,263,0,3393,0,1437004800,1,0,3,9,1,2,39,12,897.0
2390,1154390400,-420076800,3.116210,1803,0,3142,0,1456477200,1,0,0,9,2,4,0,90,898.0


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [6]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

print(f"x_train shape: {x_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: (1913, 17)
x_test shape: (479, 17)
y_train shape: (1913,)
y_test shape: (479,)


## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [7]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model

# Assign a random_state parameter of 1 to the model
model = LogisticRegression(random_state=1)

# Fit the model using the training data
model.fit(x_train, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [8]:
# Make a prediction using the testing data
y_pred = model.predict(x_test)

predictions_df = pd.DataFrame(y_pred, columns=['Predicted_Label'])
predictions_df.to_csv('predictions.csv', index=False)

predictions_df

Unnamed: 0,Predicted_Label
0,1
1,1
2,0
3,1
4,1
...,...
474,1
475,1
476,1
477,1


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [9]:
# Generate a confusion matrix for the model

# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm

array([[ 15,   0],
       [  0, 464]])

In [10]:
# Print the classification report for the model

print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00       464

    accuracy                           1.00       479
   macro avg       1.00      1.00      1.00       479
weighted avg       1.00      1.00      1.00       479



In [11]:
# ABOVE SEEMS LIKE IT'S LIKELY OVERFITTED.

# RETRYING WITH MANY FEWER FEATURES. WILL TRY WITH AREA AND STATE AND ANALYSIS TIME.

clustered_df_3 = clustered_df_2.copy()

clustered_df_3 = clustered_df_3.drop(columns=['src_date', 'min_elev', 'mean_elev',
                                        'max_elev', 'primeclass', 'release_dt', 'gone_date', 'glac_name',
                                        'glac_stat', 'conn_lvl', 'surge_type', 'term_type', 'gtng_o1reg',
                                        'gtng_o2reg', 'rgi_gl_typ', 'cluster', 'glac_name_code'])


# Separate the X variable, the features # Extracting all except Area and State.
x2 = clustered_df_3
                                     

In [12]:
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x2, y, test_size=0.2, random_state=1)

print(f"x_train shape: {x_train_2.shape}")
print(f"x_test shape: {x_test_2.shape}")
print(f"y_train shape: {y_train_2.shape}")
print(f"y_test shape: {y_test_2.shape}")

x_train shape: (1913, 2)
x_test shape: (479, 2)
y_train shape: (1913,)
y_test shape: (479,)


In [13]:
# Instantiate the Logistic Regression model

# Assign a random_state parameter of 1 to the model
model_2 = LogisticRegression(random_state=1)

# Fit the model using the training data
model_2.fit(x_train_2, y_train_2)

In [14]:
# Make a prediction using the testing data
y_pred_2 = model_2.predict(x_test_2)

predictions_df_2 = pd.DataFrame(y_pred_2, columns=['Predicted_Label'])
predictions_df_2.to_csv('predictions_2.csv', index=False)

predictions_df_2

Unnamed: 0,Predicted_Label
0,1
1,1
2,1
3,1
4,1
...,...
474,1
475,1
476,1
477,1


In [15]:
cm_2 = confusion_matrix(y_test_2, y_pred_2)

cm_2

array([[  0,  15],
       [  0, 464]])

In [16]:
print("Classification Report:")
print(classification_report(y_test_2, y_pred_2))

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.97      1.00      0.98       464

    accuracy                           0.97       479
   macro avg       0.48      0.50      0.49       479
weighted avg       0.94      0.97      0.95       479



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# THIS IS GIVING POOR RESULTS. GOING TO INTRODUCE TEMPERATURE INTO THE DATASET AND MAKE SMALLER CHANGES.