### Importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import lasio

import matplotlib.pyplot as plt
import seaborn as sns

import cufflinks as cf
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
import plotly as py
cf.go_offline()

from scipy import stats
from scipy.stats import skew
import scipy.signal as signal




# Modelling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

#### Read the CSV File

In [2]:
df = pd.read_csv('NFA 2019 public_data.csv', low_memory=False)

In [3]:
df[ 'QScore' ].value_counts() 

3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: QScore, dtype: int64

In [4]:
df.shape

(72186, 12)

#### Claeaning and Preprocessing

In [5]:
df.isna().sum() 

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

##### check fir non numeric columns

In [6]:
# Check for non-numeric values in the DataFrame
non_numeric_cols = df.select_dtypes(exclude='number').columns.tolist()

if len(non_numeric_cols) > 0:
    print("Non-numeric columns found:")
    print(non_numeric_cols)
else:
    print("No non-numeric values found in the DataFrame.")

Non-numeric columns found:
['country', 'record', 'forest_land', 'QScore']


In [7]:
data = df.copy()

In [8]:
# Fill NaN values with column means
#data = data.fillna(data.mean())

###### Drop Null Values

In [9]:
data = data.dropna() 

In [10]:
data.isna().sum() 

country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

###### Map The QScore column

In [11]:
data.shape

(51713, 12)

In [12]:
data[ 'QScore' ] = data[ 'QScore' ].replace([ '1A' ], '2A' ) 

In [13]:
data.QScore.value_counts() 

3A    51473
2A      240
Name: QScore, dtype: int64

In [14]:
data['QScore'] = data['QScore'].replace({'3A': 0, '2A': 1, '2B': 2, '1B': 3})

#### Feature Eng

In [15]:
# Separate features (X) and target variable (y)
X = data.drop('QScore', axis=1)
y = data['QScore']

##### Split Data into train-test

In [16]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [17]:
# Identify the categorical columns in X_train
categorical_columns = ['country', 'country_code', 'record']

In [18]:
# Create a OneHotEncoder object
encoder = OneHotEncoder(handle_unknown='ignore')

In [19]:
# Fit the encoder on the training data
encoder.fit(X_train[categorical_columns])

OneHotEncoder(handle_unknown='ignore')

In [20]:
X_train.shape

(41370, 11)

In [21]:
y_train.shape

(41370,)

In [22]:
# Transform the categorical columns in X_train and assign to a new DataFrame
X_train_encoded = pd.DataFrame(encoder.transform(X_train[categorical_columns]).toarray(), columns=encoder.get_feature_names_out(categorical_columns))

In [23]:
# Drop the original categorical columns from X_train
X_train = X_train.drop(categorical_columns, axis=1)

In [24]:
# Concatenate the encoded categorical features with the remaining numerical features
#X_train = pd.concat([X_train, X_train_encoded], axis=1)

In [25]:
# Transform the categorical columns in X_test and assign to a new DataFrame
X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_columns]).toarray(), columns=encoder.get_feature_names_out(categorical_columns))

In [26]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

In [27]:
#import imblearn
#from imblearn.over_sampling import SMOTE

#smote = SMOTE(random_state=1)
#X_train_balanced, y_balanced = smote.fit_resample(X_train, y_train)

In [28]:
non_numeric_df = data.select_dtypes(exclude='number')
print(non_numeric_df)

       country        record  forest_land
0      Armenia    AreaPerCap  0.097188051
1      Armenia     AreaTotHA       334600
2      Armenia  BiocapPerCap  0.084003213
3      Armenia  BiocapTotGHA  289207.1078
4      Armenia  EFConsPerCap     1.26E-06
...        ...           ...          ...
72181    World  BiocapTotGHA   5111762779
72182    World  EFConsPerCap  0.273495416
72183    World  EFConsTotGHA   2042179333
72184    World  EFProdPerCap  0.273495416
72185    World  EFProdTotGHA   2042179333

[51713 rows x 3 columns]


In [29]:
data['forest_land'] = pd.to_numeric(data['forest_land'], errors='coerce')

In [30]:
non_numeric_y_train = y_train[~np.isfinite(y_train)]
print(non_numeric_y_train)

Series([], Name: QScore, dtype: int64)


In [31]:
from imblearn.over_sampling import SMOTE

# Define the SMOTE object
smote = SMOTE(random_state=1)

# Use the fit_resample method to balance the dataset
X_train_balanced, y_balanced = smote.fit_resample(X_train, y_train_encoded)

In [32]:
# Drop the original categorical columns from X_test
X_test = X_test.drop(categorical_columns, axis=1)

In [33]:
# Concatenate the encoded categorical features with the remaining numerical features
X_test = pd.concat([X_test, X_test_encoded], axis=1)

In [34]:
# Re-split the encoded data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [35]:
X_test.shape

(8274, 8)

In [36]:
# Create and train the random forest classifier
model1 = RandomForestClassifier()
model1.fit(X_train, y_train)

RandomForestClassifier()

In [37]:
# Predict the target variable for the test set
y_pred1 = model1.predict(X_test)

In [38]:
# Calculate the accuracy on the test set
accuracy1 = accuracy_score(y_test, y_pred1)

In [39]:
# Print the accuracy
print("Accuracy on the test set:", round(accuracy1, 4))

Accuracy on the test set: 0.9961


In [40]:
print(data.dtypes)

country            object
year                int64
country_code        int64
record             object
crop_land         float64
grazing_land      float64
forest_land       float64
fishing_ground    float64
built_up_land     float64
carbon            float64
total             float64
QScore              int64
dtype: object


In [41]:
print(data['forest_land'].unique())

[9.71880510e-02 3.34600000e+05 8.40032130e-02 ... 5.11176278e+09
 2.73495416e-01 2.04217933e+09]


In [42]:
data['forest_land'].fillna(data['forest_land'].mean(), inplace=True)

##### Resplit train-test 

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [44]:
X_train = data[['year', 'country_code', 'crop_land', 'grazing_land', 'forest_land', 'fishing_ground', 'built_up_land', 'carbon', 'total']]
y_train = data['QScore']

#### Model 2, XGBclassifier

In [45]:
model2 = XGBClassifier()
model2.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [46]:
# Create a placeholder DataFrame with the missing feature
missing_feature = pd.DataFrame({'country_code': [0] * len(X_test)})

# Concatenate the missing feature DataFrame with X_test
X_test = pd.concat([X_test, missing_feature], axis=1)

In [47]:
X_test['forest_land'] = pd.to_numeric(X_test['forest_land'], errors='coerce')

In [48]:
X_test = X_test.reindex(columns=X_train.columns)

In [49]:
y_pred2 = model2.predict(X_test)

In [50]:
y_test

14288    0
15652    0
30132    0
71722    0
6827     0
        ..
48667    0
897      0
25860    0
49386    0
69470    0
Name: QScore, Length: 6620, dtype: int64

In [51]:
y_pred2.shape

(12538,)

In [52]:
accuracy2 = accuracy_score(y_test, y_pred2)

ValueError: Found input variables with inconsistent numbers of samples: [6620, 12538]

In [None]:
print("Accuracy on the XGB:", round(accuracy2, 4))

#### Model3 LGBM Classifier

In [53]:
model3 = LGBMClassifier()
model3.fit(X_train, y_train)

LGBMClassifier()

In [54]:
y_pred3 = model3.predict(X_test)

In [55]:
accuracy3 = accuracy_score(y_test, y_pred3)

ValueError: Found input variables with inconsistent numbers of samples: [6620, 12538]

In [None]:
print("Accuracy on the test set:", round(accuracy3, 4))

In [56]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [57]:
model4 = ExtraTreesClassifier(random_state=1)

In [59]:
random_search = RandomizedSearchCV(estimator=model4, param_distributions=param_grid, 
                                   cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, 
                                   verbose=1, random_state=1)
random_search.fit(X, y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




50 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ancep\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ancep\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "C:\Users\ancep\anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\ancep\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 964, in check_X_y
    X = check_array(
  File "C:\Users\an

ValueError: could not convert string to float: 'Armenia'

In [60]:
best_params = random_search.best_params_
best_score = random_search.best_score_

In [61]:
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}
Best Score: nan


In [62]:
model4 = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2, 
                                     min_samples_leaf=1, max_features='auto', random_state=1)

In [63]:
model4.fit(X_train, y_train)

ExtraTreesClassifier(random_state=1)

In [64]:
y_pred4 = model4.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
accuracy = et_classifier.score(y_pred4, y_test)

In [None]:
print("Accuracy of the new optimal model:", accuracy)