In [33]:
import pandas as pd
import numpy as np

In [None]:
%pip install xgboost



In [34]:
import xgboost as xgb

In [35]:
# Read data
df = pd.read_csv('clean_enviro_data.csv')

In [None]:
df

In [36]:
#features
features = ['age', 'sex', 'married', 'children', 'highest_qual','income','party','voted']
y = 'cc_threat'

In [37]:
#traning and test sets
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [38]:
xgb_model = xgb.XGBClassifier(objective='binary:logistic')

In [39]:
xgb_model.fit(df_train[features], df_train[y])

In [43]:
# Import from sklearn
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# evaluate accuracy
accuracy=accuracy_score(df_test[y], y_pred)
print(f'Accuracy is {accuracy}')

# predictions
y_pred = xgb_model.predict(df_test[features])

# precision
precision = precision_score(df_test[y], y_pred)
print(f'Precision is {precision}')

# recall
recall = recall_score(df_test[y], y_pred)
print(f'Recall is {recall}')

# F1 score
f1 = f1_score(df_test[y], y_pred)
print(f'F1 score is {f1}')

Accuracy is 0.8408163265306122
Precision is 0.9078341013824884
Recall is 0.9120370370370371
F1 score is 0.9099307159353348


###Adjusting the hyperparameters:


In [44]:
# Let's lower the learning rate
xgb_model = xgb.XGBClassifier(objective='binary:logistic', colsample_bytree = 0.5118586400408849, learning_rate = 0.05375221185047551, max_depth = 7, min_child_weight = 12, n_estimators = 744, subsample = 0.5426146441765256)

# Fit the model
xgb_model.fit(df_train[features], df_train[y])

# Make predictions
y_pred = xgb_model.predict(df_test[features])

#calculate accuracy
accuracy=accuracy_score(df_test[y], y_pred)
print(f'Accuracy is {accuracy}')

# Calculate precision
precision = precision_score(df_test[y], y_pred)
print(f'Precision is {precision}')

# Calculate recall
recall = recall_score(df_test[y], y_pred)
print(f'Recall is {recall}')

# Calculate F1 score
f1 = f1_score(df_test[y], y_pred)
print(f'F1 score is {f1}')

Accuracy is 0.8938775510204081
Precision is 0.8925619834710744
Recall is 1.0
F1 score is 0.9432314410480349


###**Bayesian Optimisation**

In [23]:
%pip install hyperopt



In [24]:
from hyperopt import fmin, tpe, hp, STATUS_OK
from hyperopt.pyll.base import scope # for controlling data types

In [25]:
#scope.int specifies it's an integer not a float, learning rate is a float
space = {
    'max_depth': scope.int(hp.quniform('max_depth', 1, 15, 1)),
    'min_child_weight':  scope.int(hp.quniform('min_child_weight', 1, 15, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, -2),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1),
    'n_estimators':  scope.int(hp.quniform('n_estimators', 100, 1000, 1))
}

In [26]:
# Define the objective function to minimize, -score means to minimise score
def objective(params):
    xgb_model = xgb.XGBClassifier(objective='binary:logistic',**params)
    xgb_model.fit(df_train[features], df_train[y])
    y_pred = xgb_model.predict(df_test[features])
    #score = accuracy_score(y_test, y_pred)
    score = f1_score(df_test[y], y_pred)
    return {'loss': -score, 'status': STATUS_OK}

In [27]:
best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)
print("Best parameters: ", best_params)

100%|██████████| 100/100 [00:38<00:00,  2.58trial/s, best loss: -0.9432314410480349]
Best parameters:  {'colsample_bytree': 0.5118586400408849, 'learning_rate': 0.05375221185047551, 'max_depth': 7.0, 'min_child_weight': 12.0, 'n_estimators': 744.0, 'subsample': 0.5426146441765256}


In [28]:
for key in best_params:
    if key in ['max_depth', 'min_child_weight', 'n_estimators']:
        best_params[key] = int(best_params[key])

print("Best parameters: ", best_params)

Best parameters:  {'colsample_bytree': 0.5118586400408849, 'learning_rate': 0.05375221185047551, 'max_depth': 7, 'min_child_weight': 12, 'n_estimators': 744, 'subsample': 0.5426146441765256}


###**Xgboost cross-validation**

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Define the classifier
clf = xgb.XGBClassifier(objective='binary:logistic', **best_params)

# Get the k folds
kf = KFold(n_splits=10, shuffle = True, random_state=50)

# Loop over folds and calculate performance measure
results = []
for k, (train_idx, test_idx) in enumerate(kf.split(df[features])):
    # Fit model
    cfit = clf.fit(df[features].iloc[train_idx], df[y].iloc[train_idx])

    # Get predictions
    y_pred = cfit.predict(df[features].iloc[test_idx])

    # Write results
    result = {'fold': k,
              'precision': precision_score(df[y].iloc[test_idx], y_pred),
              'recall': recall_score(df[y].iloc[test_idx], y_pred),
              'f1': f1_score(df[y].iloc[test_idx], y_pred)}
    # If we want to monitor progress
    print(result)

    results.append(result)

{'fold': 0, 'precision': 0.8524590163934426, 'recall': 0.9904761904761905, 'f1': 0.9162995594713657}
{'fold': 1, 'precision': 0.8677685950413223, 'recall': 1.0, 'f1': 0.9292035398230089}
{'fold': 2, 'precision': 0.859504132231405, 'recall': 0.9904761904761905, 'f1': 0.9203539823008849}
{'fold': 3, 'precision': 0.8442622950819673, 'recall': 1.0, 'f1': 0.9155555555555556}
{'fold': 4, 'precision': 0.8099173553719008, 'recall': 1.0, 'f1': 0.8949771689497716}
{'fold': 5, 'precision': 0.875, 'recall': 0.9813084112149533, 'f1': 0.9251101321585904}
{'fold': 6, 'precision': 0.8032786885245902, 'recall': 1.0, 'f1': 0.8909090909090909}
{'fold': 7, 'precision': 0.8429752066115702, 'recall': 1.0, 'f1': 0.914798206278027}
{'fold': 8, 'precision': 0.8823529411764706, 'recall': 0.9722222222222222, 'f1': 0.9251101321585902}
{'fold': 9, 'precision': 0.8677685950413223, 'recall': 1.0, 'f1': 0.9292035398230089}


In [None]:
# results
results_df = pd.DataFrame(results)

In [None]:
results_df.to_csv('TUNEDresultsmodel2.csv')

In [None]:
# Average precision
np.mean([x['precision'] for x in results])
print(f'Average precision is {np.mean([x["precision"] for x in results])}')

# Average recall
np.mean([x['recall'] for x in results])
print(f'Average recall is {np.mean([x["recall"] for x in results])}')

# Average F1
np.mean([x['f1'] for x in results])
print(f'Average F1 is {np.mean([x["f1"] for x in results])}')

Average precision is 0.8505286825473991
Average recall is 0.9934483014389557
Average F1 is 0.9161520907427894


##**Production Stage**
####Putting Model 2 into production

In [None]:
# Define the classifier
clf = xgb.XGBClassifier(objective='binary:logistic', **best_params)

# Fit on all data
cfit = clf.fit(df[features], df[y])

# Save the model
import pickle
pickle.dump(cfit, open('xgb_model2.pkl', 'wb'))

In [None]:
loaded_model = pickle.load(open('xgb_model2.pkl', 'rb'))

In [None]:
df.head(1)

Unnamed: 0,Sserial,age,sex,married,children,highest_qual,income,party,voted,cc_threat
0,290001,3,1,1,1,3,3,2,1,0


In [None]:
# Create a new example observation as a dictionary with the variable names as keys
new_obs = { 'age': 7,
            'sex': 1,
            'married': 2,
            'children': 1,
            'highest_qual': 4,
            'income': 1,
            'party': 1,
            'voted': 2}

# Convert to a dataframe
df_new_obs = pd.DataFrame([new_obs])

# Make a prediction
prob = loaded_model.predict_proba(df_new_obs[features])
print(f'Probability of believe climate change is a threat {prob[0][1]}')

Probability of believe climate change is a threat 0.5625547766685486
