In [1]:
import time
from functools import lru_cache

import matplotlib.pyplot as plt
import pandas as pd

In [28]:
# Load the training metadata and parse the YYYYMMDD `date` column into
# real datetimes in a single chained expression.
train_metadata_path = "../data/dataset/train_data/metadata.csv"
df = pd.read_csv(train_metadata_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d')
)
df

Unnamed: 0,date,id_coord,plume,set,lat,lon,coord_x,coord_y,path
0,2023-02-23,id_6675,yes,train,31.528750,74.330625,24,47,images/plume/20230223_methane_mixing_ratio_id_...
1,2023-01-03,id_2542,yes,train,35.538000,112.524000,42,37,images/plume/20230103_methane_mixing_ratio_id_...
2,2023-03-01,id_6546,yes,train,21.060000,84.936667,58,15,images/plume/20230301_methane_mixing_ratio_id_...
3,2023-02-25,id_6084,yes,train,26.756667,80.973333,28,62,images/plume/20230225_methane_mixing_ratio_id_...
4,2023-01-05,id_2012,yes,train,34.800000,40.770000,59,44,images/plume/20230105_methane_mixing_ratio_id_...
...,...,...,...,...,...,...,...,...,...
425,2023-03-02,id_6658,no,train,30.965619,34.541283,39,36,images/no_plume/20230302_methane_mixing_ratio_...
426,2023-02-18,id_4690,no,train,35.950275,40.267652,29,28,images/no_plume/20230218_methane_mixing_ratio_...
427,2023-02-13,id_2519,no,train,49.963801,6.016938,23,10,images/no_plume/20230213_methane_mixing_ratio_...
428,2023-02-13,id_5510,no,train,32.713854,44.609398,55,54,images/no_plume/20230213_methane_mixing_ratio_...


In [40]:

# Distinct (lat, lon, plume) combinations in the training metadata.
#
# This is derived from the `df` loaded by the earlier cell instead of calling
# read_csv again: a second read rebinds `df` to a frame whose `date` column is
# unparsed, which silently discards the datetime conversion for every cell
# that runs afterwards.
result_df = df[['lat', 'lon', 'plume']].drop_duplicates()
print(result_df)


           lat         lon plume
0    31.528750   74.330625   yes
1    35.538000  112.524000   yes
2    21.060000   84.936667   yes
3    26.756667   80.973333   yes
4    34.800000   40.770000   yes
..         ...         ...   ...
374  39.180143   -8.812055    no
376  49.962191   16.749235    no
412  18.102717  -76.702893    no
420  17.913044  -89.079593    no
427  49.963801    6.016938    no

[101 rows x 3 columns]


In [41]:
from geopy.geocoders import Nominatim

# Reverse-geocoding client used by the lookups below (Nominatim backend).
# NOTE(review): "geoapiExercises" looks like a tutorial placeholder; Nominatim's
# usage policy asks for a user agent that identifies this application -- confirm.
geolocator = Nominatim(
    user_agent="geoapiExercises",
)

# Function to get city name based on latitude and longitude
def get_city_name(lat, lon):
    """Reverse-geocode a coordinate pair into a settlement name.

    Parameters
    ----------
    lat, lon
        Latitude and longitude; they are formatted into a single
        ``"<lat>, <lon>"`` query string for the module-level ``geolocator``.

    Returns
    -------
    The value stored under the first of the ``"city"``, ``"town"`` or
    ``"village"`` keys present in the result's address, or ``"N/A"`` when
    none of them is present or the geocoder returned no result.
    """
    location = geolocator.reverse(f"{lat}, {lon}")

    # reverse() yields None when nothing is found for the coordinates;
    # without this guard `location.raw` raises AttributeError and aborts
    # the whole DataFrame.apply.
    if location is None:
        return "N/A"

    address = location.raw.get("address", {})

    # Most specific settlement type first, same precedence as before.
    for key in ("city", "town", "village"):
        if key in address:
            return address[key]
    return "N/A"

# Minimum pause between Nominatim requests, in seconds (the public service
# allows at most one request per second).
NOMINATIM_MIN_DELAY_SECONDS = 1


@lru_cache(maxsize=None)
def _city_for_coords(lat, lon):
    """Look up one coordinate pair, at most once per distinct (lat, lon).

    The metadata repeats the same locations on many dates, so memoizing here
    means one network request per distinct location instead of one per row.
    The pause runs only on a cache miss, i.e. only before a real request.
    """
    time.sleep(NOMINATIM_MIN_DELAY_SECONDS)
    return get_city_name(lat, lon)


# Attach a city name to every row (repeated coordinates are served from cache)
df['city'] = [
    _city_for_coords(lat, lon) for lat, lon in zip(df['lat'], df['lon'])
]

# Save the 'lat', 'lon', and 'city' columns as a CSV file
df[['lat', 'lon', 'city']].to_csv('locations_with_cities.csv', index=False)


In [22]:
# Find locations whose "plume" label is not the same on every date
unique_location_pairs = [
    group
    for _, group in df.groupby(['lat', 'lon'])
    if len(group['plume'].unique()) > 1
]

# Sort by the number of distinct "plume" values (most varied first).
# The sort is stable, so groups with equal counts keep their groupby order.
unique_location_pairs.sort(key=lambda x: len(x['plume'].unique()), reverse=True)

# Take the top 10 pairs
top_10_unique_location_pairs = unique_location_pairs[:10]

# Create one plume-over-time plot per selected location
for location_group in top_10_unique_location_pairs:
    lat = location_group['lat'].values[0]
    lon = location_group['lon'].values[0]

    # Build a derived frame instead of assigning columns on the groupby
    # slice: that avoids SettingWithCopyWarning and leaves the frames held in
    # `unique_location_pairs` untouched.
    location_df = location_group.assign(
        # 1 for 'yes', 0 for anything else
        plume=location_group['plume'].eq('yes').astype(int),
        # The raw CSV stores dates as YYYYMMDD integers. Without an explicit
        # format, to_datetime reads integers as nanoseconds since the epoch
        # and every point lands in 1970. Already-parsed datetimes pass
        # through unchanged.
        date=pd.to_datetime(location_group['date'], format='%Y%m%d'),
    ).sort_values(by='date')

    # Draw the time series for this location
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(location_df['date'], location_df['plume'], marker='o')
    ax.set_title(f'Plume over Time for Latitude {lat} and Longitude {lon}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Plume (1 for yes, 0 for no)')
    ax.grid()
    plt.show()

In [23]:
# Load the test metadata and parse the YYYYMMDD `date` column into
# real datetimes in a single chained expression.
test_metadata_path = "../data/dataset/test_data/metadata.csv"
df_test = pd.read_csv(test_metadata_path).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d')
)
df_test

Unnamed: 0,date,id_coord,lat,lon,coord_x,coord_y
0,2023-02-13,id_6148,23.437500,90.645000,54,10
1,2023-02-06,id_3348,33.513333,-87.203333,31,55
2,2023-01-21,id_3733,26.424286,80.284286,33,44
3,2023-02-04,id_5491,30.246667,71.483333,59,59
4,2023-01-30,id_4287,23.763333,86.396667,46,46
...,...,...,...,...,...,...
104,2023-01-11,id_5510,32.713854,44.609398,55,54
105,2023-01-22,id_5510,32.713854,44.609398,55,54
106,2023-01-29,id_4690,35.950275,40.267652,29,28
107,2023-04-04,id_5510,32.713854,44.609398,55,54


In [24]:
def _coordinate_pairs(frame):
    """Return the set of distinct (lat, lon) tuples found in ``frame``."""
    return set(zip(frame['lat'], frame['lon']))


# Distinct coordinate pairs per dataset
train_unique_pairs = _coordinate_pairs(df)
test_unique_pairs = _coordinate_pairs(df_test)

# Pairs that occur in the test metadata but never in the training metadata
# (exact float equality on both coordinates)
new_pairs_in_test = test_unique_pairs.difference(train_unique_pairs)

# Report them, one pair per line
print("New pairs in the test dataset compared to the train dataset:")
for lat, lon in new_pairs_in_test:
    print(f"Latitude: {lat}, Longitude: {lon}")

New pairs in the test dataset compared to the train dataset:
Latitude: 12.975, Longitude: 80.21000000000001
Latitude: 23.56, Longitude: 86.33500000000001
Latitude: 18.40554875083402, Longitude: -73.80527937274765
Latitude: 36.565, Longitude: 109.73
Latitude: 50.22, Longitude: 18.66
Latitude: 36.30500000000001, Longitude: 112.86


In [27]:
# Count the distinct (latitude, longitude) pairs in the test dataset.
# To inspect the pairs themselves, iterate over `unique_pairs`.
unique_pairs = {
    (latitude, longitude)
    for latitude, longitude in zip(df_test['lat'], df_test['lon'])
}
print(len(unique_pairs))

55
