In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin


class VegetationIndexTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nir_col='nir_p50', red_col='red_p50', blue_col='blue_p50'):
        self.nir_col = nir_col
        self.red_col = red_col
        self.blue_col = blue_col
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Apply scale factor
        X = X.copy()
        for col in [self.nir_col, self.red_col, self.blue_col]:
            X[col] = X[col] * 0.0001
        
        # Calculate NDVI
        ndvi = (X[self.nir_col] - X[self.red_col]) / (X[self.nir_col] + X[self.red_col])
        
        # Calculate EVI
        evi = 2.5 * ((X[self.nir_col] - X[self.red_col]) / 
                     (X[self.nir_col] + 6 * X[self.red_col] - 7.5 * X[self.blue_col] + 1))
        
        # Calculate SAVI (assuming L=0.5)
        L = 0.5
        savi = ((X[self.nir_col] - X[self.red_col]) / 
                (X[self.nir_col] + X[self.red_col] + L)) * (1 + L)
        
        return pd.DataFrame({
            'NDVI': ndvi,
            'EVI': evi,
            'SAVI': savi
        })


def create_vegetation_index_pipeline():

    feature_columns = ['blue_p50', 'green_p50', 'red_p50', 'nir_p50', 'swir1_p50', 'swir2_p50', 'VV_p50', 'VH_p50']
    
    vegetation_pipeline = Pipeline([
        ('features', ColumnTransformer([
            ('pass_through', 'passthrough', feature_columns),
            ('veg_indices', VegetationIndexTransformer(), ['nir_p50', 'red_p50', 'blue_p50'])
        ])),
        ('scaler', StandardScaler())
    ])
    
    return vegetation_pipeline

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib

data = pd.read_csv('TrainingData.csv')  


spain_coords = {'lat_min': 36.0, 'lat_max': 43.0, 'lon_min': -9.0, 'lon_max': 3.0}
kenya_coords = {'lat_min': -4.5, 'lat_max': 5.0, 'lon_min': 34.0, 'lon_max': 42.0}
vnm_coords = {'lat_min': 8.0, 'lat_max': 24.0, 'lon_min': 102.0, 'lon_max': 110.0}

# Function to filter data for a specific country
def filter_data(data, coords):
    return data[(data['lat'] >= coords['lat_min']) & (data['lat'] <= coords['lat_max']) & 
                (data['lon'] >= coords['lon_min']) & (data['lon'] <= coords['lon_max'])]


data_spain = filter_data(data, spain_coords)
data_kenya = filter_data(data, kenya_coords)
data_vnm = filter_data(data, vnm_coords)

# Combine the filtered data
filtered_data = pd.concat([data_spain, data_kenya, data_vnm])

# Define features and target
features = ['lon', 'lat', 'blue_p50', 'green_p50', 'nir_p50', 'nira_p50', 're1_p50', 're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50', 'VV_p50', 'VH_p50']
target = 'TARGET'

X = filtered_data[features]
y = filtered_data[target]


X_Train,X_val,y_Train,y_val = train_test_split(X,y,test_size=0.2,random_state=42)


feature_pipeline = create_vegetation_index_pipeline()

full_pipeline = Pipeline([
        ('features', feature_pipeline),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])


full_pipeline.fit(X_train,y_train)

kf = KFold(n_splits=10, shuffle=True, random_state=42)


cv_scores = cross_val_score(full_pipeline, X_train, y_train, cv=kf, scoring='accuracy')


print("Cross-Validation Accuracy Scores: ", cv_scores)
print("Mean Cross-Validation Accuracy: ", cv_scores.mean())
print("Standard Deviation of Cross-Validation Accuracy: ", cv_scores.std())


model.fit(X, y)

val_score = full_pipeline.score(X_val, y_val)
print("Validation Accuracy: ", val_score)




In [4]:
import sklearn.model_selection

from sklearn.model_selection import train_test_split

X_Train_1,X_val_1,y_Train_1,y_val_1 = train_test_split(X_train_processed_1,y_train_1,test_size=0.2,random_state=42)
X_Train_2,X_val_2,y_Train_2,y_val_2 = train_test_split(X_train_processed_2,y_train_2,test_size=0.2,random_state=42)
X_Train_3,X_val_3,y_Train_3,y_val_3 = train_test_split(X_train_processed_3,y_train_3,test_size=0.2,random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

forest_clf = XGBClassifier()
forest_clf.fit(X_Train_1,y_Train_1)
# forest_clf.fit(X_Train_2,y_Train_2)
# forest_clf.fit(X_Train_3,y_Train_3)

In [20]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier()
X_val_transformed = pipeline.transform(X_val_1)
forest_clf.fit(X_train_transformed,y_Train_1)
y_pred_forest_1 = forest_clf.predict(X_val_transformed)

# y_pred_forest_1=forest_clf.predict(X_val_transformed)
# y_pred_forest_2=forest_clf.predict(X_val_2)
# y_pred_forest_3=forest_clf.predict(X_val_3)

ValueError: Found input variables with inconsistent numbers of samples: [998, 798]

In [None]:
from sklearn.metrics import precision_score,recall_score

precision=precision_score(y_val,y_pred_forest)
recall=recall_score(y_val,y_pred_forest)

from sklearn.metrics import confusion_matrix

confusion=confusion_matrix(y_val,y_pred_forest)


print(f'Precision score: {precision} \nRecall score: {recall}')
confusion


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

acc_1=accuracy_score(y_val_1, y_pred_forest_1)
# acc_2=accuracy_score(y_val_2, y_pred_forest_2)
# acc_3=accuracy_score(y_val_3, y_pred_forest_3)

In [None]:
print(f'Accuracy for Kenya:{acc_1}')

In [None]:
import pandas as pd

def add_id_column(df, file_name):
  """Adds an ID column based on the file name and index."""
  file_prefix = file_name.split('_')[0]
  df['ID'] = df.index.map(lambda x: f"{file_prefix}_{x+1}")
  return df

t1 = pd.read_csv('Kenya_testing.csv')
t2 = pd.read_csv('Spain_validation.csv')
t3 = pd.read_csv('VNM_testing.csv')


test1 = add_id_column(t1, 'Kenya')  
test2 = add_id_column(t2, 'Spain')
test3 = add_id_column(t3, 'VNM')

t1 = pd.DataFrame(test1)
t2 = pd.DataFrame(test2)
t3 = pd.DataFrame(test3)



In [None]:
sample_submission = pd.read_csv('SampleSubmission.csv')
print(sample_submission.head())
test = pd.concat([t1, t2, t3], ignore_index=True, join='outer')
Test = pd.DataFrame(test)

merged_data = pd.merge(sample_submission, Test, on='ID', how='left')


In [None]:
merged_data.head()

In [None]:
testdata = merged_data.drop(columns=['ID','TARGET'])

test_pred = forest_clf.predict(testdata)

# Create submission DataFrame
submission_pred = pd.DataFrame({'ID': sample_submission['ID'], 'TARGET': test_pred})

# Save to CSV
submission_pred.to_csv('pred_submission.csv', index=False)

submission_pred.head()
