In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib


# Full training table; the code below reads its 'lat' and 'lon' columns
# to select rows by region.
data = pd.read_csv('TrainingData.csv')

# Inclusive latitude/longitude bounding boxes for Spain, Kenya, and Vietnam.
spain_coords = dict(lat_min=36.0, lat_max=43.0, lon_min=-9.0, lon_max=3.0)
kenya_coords = dict(lat_min=-4.5, lat_max=5.0, lon_min=34.0, lon_max=42.0)
vnm_coords = dict(lat_min=8.0, lat_max=24.0, lon_min=102.0, lon_max=110.0)

# Function to filter data for a specific country
def filter_data(data, coords):
    """Return the rows of ``data`` that lie inside a lat/lon bounding box.

    Args:
        data: DataFrame with 'lat' and 'lon' columns.
        coords: Mapping with 'lat_min', 'lat_max', 'lon_min' and 'lon_max'.
            All four bounds are inclusive.

    Returns:
        The subset of ``data`` whose 'lat' and 'lon' both fall within the
        bounds; the original index labels are kept.
    """
    lat_inside = data['lat'].between(coords['lat_min'], coords['lat_max'])
    lon_inside = data['lon'].between(coords['lon_min'], coords['lon_max'])
    return data[lat_inside & lon_inside]

# Keep only the training rows that fall inside one of the three country boxes.
data_spain, data_kenya, data_vnm = (
    filter_data(data, box) for box in (spain_coords, kenya_coords, vnm_coords)
)

# Stack the per-country subsets into a single training table.
filtered_data = pd.concat([data_spain, data_kenya, data_vnm])

# Model inputs (coordinates plus the *_p50 band columns) and the label column.
features = ['lon', 'lat', 'blue_p50', 'green_p50', 'nir_p50', 'nira_p50', 're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50', 'VV_p50', 'VH_p50']
target = 'TARGET'

X, y = filtered_data[features], filtered_data[target]

# Shuffled 10-fold split with a fixed seed so the scores are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)

cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# Report the per-fold accuracies together with their mean and spread.
for label, value in (
    ("Cross-Validation Accuracy Scores: ", cv_scores),
    ("Mean Cross-Validation Accuracy: ", cv_scores.mean()),
    ("Standard Deviation of Cross-Validation Accuracy: ", cv_scores.std()),
):
    print(label, value)

# Fit on every filtered row so `model` can be used for prediction afterwards.
model.fit(X, y)

# Optional persistence step (disabled).
# joblib.dump(model, 'plastic_cover_classifier.pkl')
# NOTE(review): `label_encoder` is not defined in the visible code; create it
# before enabling the next line.
# joblib.dump(label_encoder, 'label_encoder.pkl')

# Optional evaluation on a hold-out set (disabled); needs X_test / y_test.
# y_pred = model.predict(X_test)  # Uncomment and replace X_test with actual test data if available
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
import pandas as pd

def add_id_column(df, file_name):
    """Add an 'ID' column of the form ``<prefix>_<index + 1>`` to ``df``.

    The prefix is the part of ``file_name`` before its first underscore
    (the whole string when there is no underscore). ``df`` is modified in
    place and the same object is returned.
    """
    prefix, _, _ = file_name.partition('_')

    def make_id(index_label):
        # IDs are 1-based while the index labels are 0-based.
        return f"{prefix}_{index_label + 1}"

    df['ID'] = df.index.map(make_id)
    return df

# Load the per-country evaluation sets.
t1 = pd.read_csv('Kenya_testing.csv')
t2 = pd.read_csv('Spain_validation.csv')
t3 = pd.read_csv('VNM_testing.csv')

# add_id_column writes the 'ID' column into its argument and returns that same
# object, so test1/test2/test3 refer to the same frames as t1/t2/t3.
test1 = add_id_column(t1, 'Kenya')
test2 = add_id_column(t2, 'Spain')
test3 = add_id_column(t3, 'VNM')

sample_submission = pd.read_csv('SampleSubmission.csv')
print(sample_submission.head())

# Stack the three sets into one table; the row index is renumbered, the
# per-country IDs assigned above are unaffected.
test = pd.concat([t1, t2, t3], ignore_index=True, join='outer')
Test = test  # alias retained in case other cells refer to this name

# Left merge keeps one row per sample-submission ID, in submission order.
merged_data = pd.merge(sample_submission, Test, on='ID', how='left')

# Select the training feature columns explicitly, in training order. Dropping
# only 'ID'/'TARGET' would pass the model every other column the test CSVs
# contain, in whatever order they appear, which can fail the feature-name
# check at predict time or misalign columns.
testdata = merged_data[features]

test_pred = model.predict(testdata)

# Create submission DataFrame
submission_pred = pd.DataFrame({'ID': sample_submission['ID'], 'TARGET': test_pred})

# Save to CSV
submission_pred.to_csv('pred_submission_2.csv', index=False)

submission_pred.head()

