In [1]:
#Upload kml file
from pykml import parser
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial import KDTree

with open('../Resources/glims_polygons.kml', 'r', encoding="utf-8") as f:
   root = parser.parse(f).getroot()
#Put kml to dataframe  
places = []
for place in root.Document.Folder.Placemark:
    data = {item.get("name"): item.text for item in
            place.ExtendedData.SchemaData.SimpleData}
    coords = place.Polygon.outerBoundaryIs.LinearRing.coordinates.text.strip()
    data["Coordinates"] = coords
    places.append(data)
df = pd.DataFrame(places)
# print(df)

In [2]:
# df.head()

In [3]:
# df.dtypes

In [4]:
cleaned_df = df[['line_type','anlys_id','area','glac_id','anlys_time','db_area','min_elev','mean_elev','max_elev','primeclass','Coordinates','src_date','rec_status','glac_name','glac_stat',
'gone_date','gone_dt_e','subm_id','release_dt','proc_desc','rc_id','geog_area','conn_lvl','surge_type','term_type','gtng_o1reg','gtng_o2reg',
'rgi_gl_typ','loc_unc_x','glob_unc_y']]

# Counting initial number of rows to make sure we don't somehow lose any.
row_count = cleaned_df.shape[0]
print(f"Number of rows: {row_count}")

Number of rows: 69572


In [5]:
# cleaned_df.head()

In [6]:
# Dictionary to store unique values DataFrames
unique_dfs = {}

# Iterate over each column, get unique values, and create DataFrame
for column in cleaned_df.columns:
    unique_values = cleaned_df[column].unique()
    unique_dfs[column] = pd.DataFrame(unique_values, columns=[f'unique_{column}'])

# for column, unique_df in unique_dfs.items():
#     print(f"Unique values in {column} as DataFrame:")
#     print(unique_df)
#     print()

In [7]:
unique_glac_name = 'glac_name'
glac_name_df = unique_dfs.get(unique_glac_name)

glac_name_df.to_csv('../Resources/glac_name_df.csv', index=False)

In [8]:
# SEE ABOVE FOR DEFINITIONS. ELIMINATING PRIMECLASS, REC_STATUS, SUBM_ID, GONE_DT_E, PROC_DESC, RC_ID, GEOG_AREA, CONN_LVL, SURGE_TYPE, TERM_TYPE, GTNG_O1REG, GTNG_O2REG, RGI_GL_TYP,
# LOC_UNC_X, GLOB_UNC_Y

# SOME ARE NOTES, SOME ARE DEFINITIONS RELEVANT TO SOMEONE. LIKELY NOT TO US. REC_STATUS HAD ONE VALUE; 'OKAY'.

cleaned_df_2 = df[['line_type','anlys_id','area','glac_id','anlys_time','db_area','min_elev','mean_elev','max_elev','Coordinates','src_date','glac_name','glac_stat',
'gone_date','release_dt']]

In [9]:
# making a df copy
cleaned_df_3 = cleaned_df_2.copy()

# glac_stat has two unique values. replaced exists with e and gone with g
cleaned_df_3['glac_stat'] = cleaned_df_3['glac_stat'].replace({'gone': 'g', 'exists': 'e'})

# line_type has three unique values. Replaced debris_cov with dc, glac_bound with gb, intrnl_rock with ir
cleaned_df_3['line_type'] = cleaned_df_3['line_type'].replace({'debris_cov': 'dc', 'glac_bound': 'gb', 'intrnl_rock': 'ir'})

In [10]:
# Changed the times which were strings to times that are datetime objects. Took away the hour and rounded off to a day, which
# wasn't intentional, but we don't need hourly resolution for a measurement of an object that changes very slowly
# and that we are only seeing measurements for a few times per year at most

cleaned_df_3['src_date'] = pd.to_datetime(cleaned_df_3['src_date'])
cleaned_df_3['anlys_time'] = pd.to_datetime(cleaned_df_3['anlys_time'])
cleaned_df_3['release_dt'] = pd.to_datetime(cleaned_df_3['release_dt'])


Converted the times to unix. Unix times before 1/1/1970 are negative numbers, descending from 0 below 1/1/1970, so they'll still work and be
# convertible and comparable.
# Of note, minimum recording is 12/13/1901, so if there were dates before that, this would not be feasible.

cleaned_df_3['src_date'] = cleaned_df_3['src_date'].apply(lambda x: int(x.timestamp()))
cleaned_df_3['anlys_time'] = cleaned_df_3['anlys_time'].apply(lambda x: int(x.timestamp()))
cleaned_df_3['release_dt'] = cleaned_df_3['release_dt'].apply(lambda x: int(x.timestamp()))

# format for gone_date. the object wasn't recognized as a time like the other was, so I explicitly defined the format.
gone_date_raw_format = '%Y-%m-%d'

# Converting gone_date to datetime.
cleaned_df_3['gone_date'] = pd.to_datetime(cleaned_df_3['gone_date'], format=gone_date_raw_format, errors='coerce')

# Converting gone_date to unix. If the glacier still exists, it's a NaT, so replacing it with e for exists.
nat_replace_if_exists = 'e'
cleaned_df_3['gone_date'] = cleaned_df_3['gone_date'].apply(
    lambda x: int(x.timestamp()) if pd.notna(x) else nat_replace_if_exists
)

SyntaxError: unterminated string literal (detected at line 10) (3022640951.py, line 10)

In [None]:
# Copying data frame

cleaned_df_4 = cleaned_df_3.copy()

# Converting columns to float. errors='coerce' won't effect 0 values, but any non-number will be converted to nan. Will print between this conversion and
# the formatting just to make sure.

cleaned_df_4['area'] = pd.to_numeric(cleaned_df_4['area'], errors='coerce')
cleaned_df_4['db_area'] = pd.to_numeric(cleaned_df_4['db_area'], errors='coerce')

nat_replace = 'x'

# nan_rows # no nan rows for either area or db_area, so we're good, per the below check.
# nan_rows = cleaned_df_4[cleaned_df_4['db_area'].isna()]

# converting to float with the trailing digits removed. The number is the number of the significant digits we round to.

for column in ['area', 'db_area']:
    cleaned_df_4[column] = cleaned_df_4[column].apply(lambda x: '{:.15g}'.format(x) if pd.notnull(x) else nat_replace)



In [None]:
# Drop the analysis ID column for cleaned_df_5. We LIKELY don't need it.

cleaned_df_5 = cleaned_df_4.drop(columns=['anlys_id'])

In [None]:
# # This is to view how many unique glac_id exist per glac_name.

# cleaned_df_6 = cleaned_df_5.copy()

# # Group by glac_name and count unique glac_id values
# glac_id_counts = cleaned_df_6.groupby('glac_name')['glac_id'].nunique().reset_index()

# # Rename columns for clarity
# glac_id_counts.columns = ['glac_name', 'unique_glac_id_count']

# glac_id_counts

In [None]:
filter_value = 'Yalik Glacier'

filtered_df_1 = cleaned_df_5[cleaned_df_5['glac_name'] == filter_value]

filtered_df_1

In [None]:
# We still have the same number of rows. Check the cell above that lists row count for cleaned_df
row_count = cleaned_df_5.shape[0]
print(f"Number of rows: {row_count}")

In [None]:
# cleaned_df.to_csv('../Resources/cleaned_Canada_Glaciers.csv', index=False) #342 megs

In [None]:
# cleaned_df_2.to_csv('../Resources/cleaned_df_2.csv', index=False) #313 megs

In [None]:
# cleaned_df_3.to_csv('../Resources/cleaned_df_3.csv', index=False) #310 megs

In [None]:
# cleaned_df_4.to_csv('../Resources/cleaned_df_4.csv', index=False) #309 megs

In [None]:
# cleaned_df_5.to_csv('../Resources/cleaned_df_5.csv', index=False) #309 megs

In [None]:
cleaned_df_6 = cleaned_df_5.copy()

cleaned_df_6['first_coordinate'] = cleaned_df_6['Coordinates'].apply(lambda x: x.split(',0')[0])

cleaned_df_6

In [None]:
cleaned_df_6.drop(columns=['Coordinates'], inplace=True)

cleaned_df_6

In [None]:
cleaned_df_7 = cleaned_df_6.copy()

cleaned_df_7[['lng', 'lat']] = cleaned_df_7['first_coordinate'].str.split(',', expand=True)

cleaned_df_7['lat'] = pd.to_numeric(cleaned_df_7['lat'])
cleaned_df_7['lng'] = pd.to_numeric(cleaned_df_7['lng'])

cleaned_df_7

In [None]:
#cleaned_df_7.drop(columns=['first_coordinate'], inplace=True)

avg_lat_lng_df = cleaned_df_7.groupby('glac_name')[['lat', 'lng']].mean().reset_index()

avg_lat_lng_df

avg_lat_lng_df.to_csv('../Resources/glacier_avg_lat_lng.csv', index=False)

cleaned_df_7.to_csv('../Resources/cleaned_df_7.csv', index=False)

In [None]:
# # Define the number of clusters
# num_clusters = 20

# # Initialize KMeans with the number of clusters
# kmeans = KMeans(n_clusters=num_clusters, random_state=0)

# # Fit the KMeans model
# cleaned_df_7[['lat', 'lng']] = cleaned_df_7[['lat', 'lng']].astype(float)
# cleaned_df_7['cluster'] = kmeans.fit_predict(cleaned_df_7[['lat', 'lng']])

# # Calculate the average coordinates for each cluster
# cluster_means = cleaned_df_7.groupby('cluster')[['lat', 'lng']].mean().reset_index()

In [None]:
# print(cluster_means)

In [None]:
# Define the number of clusters
num_clusters = 100

# Initialize KMeans with the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=0)

# Fit the KMeans model
cleaned_df_7[['lat', 'lng']] = cleaned_df_7[['lat', 'lng']].astype(float)
cleaned_df_7['cluster'] = kmeans.fit_predict(cleaned_df_7[['lat', 'lng']])

# Calculate the average coordinates for each cluster
cluster_means = cleaned_df_7.groupby('cluster')[['lat', 'lng']].mean().reset_index()

# Plotting
plt.figure(figsize=(10, 6))

# Plot cluster centers
plt.scatter(cluster_means['lng'], cluster_means['lat'], c='red', s=200, marker='X', label='Centroids')

# Add labels and title
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Cluster Centers of Mean Latitude and Longitude')
plt.legend()

# Show plot
plt.grid(True)
plt.show()

cluster_means.to_csv('../Resources/100_cluster_means.csv')

In [None]:
# Convert cluster means to DataFrame
cluster_means_df = pd.DataFrame(cluster_means, columns=['lat', 'lng'])

# Create a column 'cluster_id' for index purposes starting at 1
cluster_means_df['cluster_id'] = range(1, len(cluster_means_df) + 1)

# Set 'cluster_id' as the index
cluster_means_df.set_index('cluster_id', inplace=True)

kdtree = KDTree(cluster_means_df[['lat', 'lng']].values)

In [None]:
data_coords = cleaned_df_7[['lat', 'lng']].values

distances, indices = kdtree.query(data_coords)

In [None]:
cleaned_df_7['closest_cluster_lat'] = cluster_means_df['lat'].iloc[indices].values
cleaned_df_7['closest_cluster_lng'] = cluster_means_df['lng'].iloc[indices].values

cleaned_df_7

In [None]:
# Filter the DataFrame
filtered_df = cleaned_df_7[cleaned_df_7['area'] != '0']

# Print the filtered DataFrame
filtered_df

In [None]:
# Filter the DataFrame
filtered_df_carrie = cleaned_df_7[cleaned_df_7['glac_name'] == 'Carrie Glacier']

# Print the filtered DataFrame
filtered_df_carrie

In [None]:
cleaned_df_8 = cleaned_df_7.copy()

# cleaned_df_8 = cleaned_df_8[cleaned_df_8['area'] != '0']
cleaned_df_8 = cleaned_df_8[cleaned_df_8['glac_name'] != 'None']

cleaned_df_8

In [None]:
counts = cleaned_df_8['glac_name'].value_counts()

cleaned_df_9 = cleaned_df_8[cleaned_df_8['glac_name'].isin(counts[counts >= 2].index)]

cleaned_df_9

cleaned_df_9.to_csv('../Resources/us_glaciers_data_1.csv')

In [None]:
cleaned_df_10 = cleaned_df_9.copy()

# Convert 'area' to numeric type
cleaned_df_10['area'] = pd.to_numeric(cleaned_df_10['area'])

# Sort by glacier name and time (Unix timestamps)
cleaned_df_11 = cleaned_df_10.sort_values(by=['glac_name', 'anlys_time'])

# Keep only the first occurrence for each timestamp within each glacier
cleaned_df_11 = cleaned_df_11.drop_duplicates(subset=['glac_name', 'anlys_time'], keep='first')

# Find the initial area value for each glac_name
initial_area = cleaned_df_11.groupby('glac_name')['area'].transform('first')

# Convert 'initial_area' to numeric type (if necessary)
initial_area = pd.to_numeric(initial_area)

# Set the initial value to 100
cleaned_df_11['percent_area'] = 100 + (cleaned_df_11['area'] - initial_area) / initial_area * 100




In [None]:
# Calculate counts for each glac_name
counts = cleaned_df_11['glac_name'].value_counts()

# Identify glac_name values that occur more than once
valid_glac_names = counts[counts > 1].index

# Filter DataFrame to keep only rows with glac_name in valid_glac_names
cleaned_df_11 = cleaned_df_11[cleaned_df_11['glac_name'].isin(valid_glac_names)]

cleaned_df_11

In [None]:
rows_with_zero_area = cleaned_df_11[cleaned_df_11['area'] == 0]

rows_with_zero_area