<a href="https://colab.research.google.com/github/dolphinxyz/mlModel/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import json
import pandas as pd
import numpy as np
from numpy import absolute
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from hyperopt import hp, tpe, fmin
# Notebook-wide pandas settings: render floats with five decimals, switch off
# the chained-assignment warning, and remove the column/row display limits.
for option_name, option_value in (
    ('display.float_format', lambda x: '%.5f' % x),
    ('mode.chained_assignment', None),
    ('display.max_columns', None),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

In [2]:
# Numeric profile attributes (median-imputed and min-max scaled further down).
numerical_columns = [
    'att_age_13-17',
    'att_age_18-24',
    'att_age_25-34',
    'att_age_35-44',
    'att_age_45-55',
    'att_age_55+',
    'Views_Infludata',
    'Engagement_Infludata',
    'Quality score',
    'Gender_Male%_Infludata',
    'Gender_Female%_Infludata',
    'Followers',
    'Posts',
    'AudienceCountryGermany',
]

# Label-encoded attributes, including the boolean ">60%" flags.
categorical_columns = [
    'InfluencerManager',
    'size',
    'affinity',
    'gender',
    'att_cat_empowerment',
    'att_cat_family',
    'att_cat_interior',
    'att_cat_lifestyle',
    'att_cat_medical',
    'att_cat_sales',
    'att_cat_skincare',
    'att_cat_beauty-and-makeup',
    'att_cat_fashion',
    'att_cat_healthy',
    'att_cat_personality',
    'att_cat_international',
    'att_cat_slow-aging',
    'att_cat_natural',
    'att_cat_fake',
    'status',
    'IsAudienceCountryGermany>60%',
    'IsFemale>60%',
    'IsRealAccount>60%',
]

# Numeric columns that are deliberately left out of the model inputs.
_numerical_not_in_model = {
    'Gender_Male%_Infludata',
    'Gender_Female%_Infludata',
    'Followers',
    'AudienceCountryGermany',
}

# Model inputs: the remaining numeric columns (original order) followed by
# every categorical column.
model_columns = [
    column for column in numerical_columns
    if column not in _numerical_not_in_model
] + categorical_columns

In [3]:
def preprocess_data(df):
    """Add the target and the derived audience features to ``df`` in place.

    Columns written:
      * ``Orders/Followers``: orders per 1000 followers.
      * ``AudienceCountryGermany``: the number captured from ``Germany: <n>%``
        in ``Audience Analysis: Country``, as float. Rows whose text has no
        such entry (including missing cells) get NaN instead of raising.
      * ``IsAudienceCountryGermany>60%``, ``IsFemale>60%``,
        ``IsRealAccount>60%``: boolean flags for a share of at least 60.

    Args:
        df: frame holding ``Orders``, ``Followers``,
            ``Audience Analysis: Country``, ``Gender_Female%_Infludata`` and
            ``Audience Analysis: Real Accounts (%)``.

    Returns:
        None; ``df`` is modified in place.

    Raises:
        ValueError: if the text captured for Germany is not a valid number.
    """
    # NOTE(review): a zero follower count yields inf/NaN in the target here;
    # confirm such rows cannot occur or filter them before training.
    df["Orders/Followers"] = df["Orders"] / (df["Followers"] / 1000)

    # str.extract returns NaN where the pattern does not match, whereas
    # re.search(...).group(1) raised AttributeError on the first such row.
    germany_share = df["Audience Analysis: Country"].astype(str).str.extract(
        r'Germany: (.*?)%', expand=False)
    df["AudienceCountryGermany"] = germany_share.astype(float)

    # Comparisons against NaN are False, so rows without a share are not flagged.
    df["IsAudienceCountryGermany>60%"] = df["AudienceCountryGermany"] >= 60
    df["IsFemale>60%"] = df["Gender_Female%_Infludata"] >= 60
    df["IsRealAccount>60%"] = df["Audience Analysis: Real Accounts (%)"] >= 60

def manage_missing_data(df, numerical=None, categorical=None):
    """Impute missing values in ``df`` in place.

    For every numeric column that contains nulls, a ``<column>_isnull``
    indicator column ('1' where the value was missing, '0' otherwise) is added
    and the nulls are replaced by the column median. Nulls in categorical
    columns are replaced by the string 'NULL'.

    Args:
        df: frame to clean; modified in place.
        numerical: numeric column names; defaults to the module-level
            ``numerical_columns``.
        categorical: categorical column names; defaults to the module-level
            ``categorical_columns``.

    Returns:
        None.
    """
    if numerical is None:
        numerical = numerical_columns
    if categorical is None:
        categorical = categorical_columns

    for numerical_column in numerical:
        missing = df[numerical_column].isnull()
        if missing.any():
            df[numerical_column + '_isnull'] = np.where(missing, '1', '0')
            # Assign the result back: ``df[col].fillna(..., inplace=True)``
            # works on an intermediate object and does not update ``df``
            # under pandas Copy-on-Write.
            df[numerical_column] = df[numerical_column].fillna(
                df[numerical_column].median())

    for categorical_column in categorical:
        df[categorical_column] = df[categorical_column].fillna('NULL')

def standardize_data(df):
    """Min-max scale the ``numerical_columns`` of ``df`` in place.

    The scaler is fitted on ``df`` itself, so every listed column is mapped
    onto the scaler's default output range.

    Args:
        df: frame containing every column named in ``numerical_columns``.

    Returns:
        None; ``df`` is modified in place.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    # Assign the scaled ndarray directly. Wrapping it in a new DataFrame gives
    # it a fresh 0..n-1 index, and column assignment then aligns on that index,
    # producing NaN or misplaced rows whenever ``df`` has any other index
    # (for example after filtering or sorting).
    df[numerical_columns] = min_max_scaler.fit_transform(df[numerical_columns])

def convert_categorical_data(df):
    """Label-encode every column in ``categorical_columns`` of ``df`` in place.

    Values are cast to strings, mapped to integer codes by a per-column
    ``LabelEncoder``, and the resulting column is stored with pandas
    ``category`` dtype.

    Args:
        df: frame containing every column named in ``categorical_columns``.

    Returns:
        None; ``df`` is modified in place.
    """
    for name in categorical_columns:
        as_text = df[name].astype(str)
        codes = LabelEncoder().fit_transform(as_text)
        df[name] = codes
        df[name] = df[name].astype('category')

In [4]:
# Target column for the regression.
goal = ["Orders/Followers"]

# Load the raw profiles and run the in-place preparation steps in order:
# feature derivation, imputation, scaling, categorical encoding.
input_df = pd.read_csv("InfluencerProfiles.csv")
for prepare_step in (
  preprocess_data,
  manage_missing_data,
  standardize_data,
  convert_categorical_data,
):
  prepare_step(input_df)

In [5]:
# temp_df is another name for input_df (no copy is made), so the re-encoding
# below also changes input_df.
temp_df = input_df
for column in categorical_columns:
  # Re-encode the categorical codes and store them as plain floats.
  codes = LabelEncoder().fit_transform(temp_df[column].astype(float))
  temp_df[column] = codes.astype('float')

# Naming is inverted relative to the usual convention: Y holds the model
# input columns and X holds the target column.
Y = temp_df[model_columns]
X = temp_df[goal]

In [6]:
# Hold out 10% of the rows for validation with a fixed seed. Y (model inputs)
# is passed first, so x_train/x_valid are inputs and y_train/y_valid targets.
x_train, x_valid, y_train, y_valid = train_test_split(
  Y,
  X,
  test_size=0.1,
  random_state=1,
)

In [None]:
# Hyperopt search space for the XGBoost regressor.
space = {
  # quniform returns round(uniform(low, high) / q) * q. With q=1000 every draw
  # below 500 rounded to 0 estimators; q=100 keeps the grid within 100..10000.
  'n_estimators': hp.quniform('n_estimators', 100, 10000, 100),
  'gamma': hp.uniform('gamma', 0.01, 0.1),
  'learning_rate': hp.uniform('learning_rate', 0.00001, 0.1),
  'max_depth': hp.quniform('max_depth', 3, 7, 1),
  'subsample': hp.uniform('subsample', 0.10, 0.98),
  'colsample_bytree': hp.uniform('colsample_bytree', 0.10, 0.98),
  'colsample_bylevel': hp.uniform('colsample_bylevel', 0.10, 0.98),
  'reg_lambda': hp.uniform('reg_lambda', 1, 50)
}

def objective(params):
  """Return the loss hyperopt minimises for one sampled parameter set.

  Args:
    params: dict sampled from ``space``.

  Returns:
    Mean absolute error of an ``XGBRegressor`` built from ``params``,
    averaged over 5-fold cross-validation on the training split.
  """
  # quniform yields floats; the tree count and depth are cast to int.
  model_params = {
    **params,
    'n_estimators': int(params['n_estimators']),
    'max_depth': int(params['max_depth']),
  }
  xb_a = XGBRegressor(**model_params)
  score = cross_val_score(xb_a, x_train, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1).mean()
  # The scorer is the negated MAE; flip the sign so that lower is better.
  return -score

# NOTE(review): newer hyperopt releases appear to expect a numpy Generator
# (np.random.default_rng(1)) for rstate -- confirm against the installed version.
best = fmin(fn=objective, space=space, max_evals=20, rstate=np.random.RandomState(1), algo=tpe.suggest)

In [8]:
# Final regressor built from the best hyperopt sample. The tree count and
# depth come back as floats and are cast to int.
tuned_params = {
  'n_estimators': int(best['n_estimators']),
  'max_depth': int(best['max_depth']),
  'learning_rate': best['learning_rate'],
  'gamma': best['gamma'],
  'subsample': best['subsample'],
  'colsample_bytree': best['colsample_bytree'],
  'colsample_bylevel': best['colsample_bylevel'],
  'reg_lambda': best['reg_lambda'],
}
model = XGBRegressor(random_state=0, objective="reg:squarederror", **tuned_params)

In [None]:
# Fit the tuned regressor on the training split (inputs first, target second).
model.fit(x_train, y_train)

In [None]:
# Score the fitted model on both splits via the estimator's own score() method.
train_score = model.score(x_train, y_train)
vali_score = model.score(x_valid, y_valid)

print("Train score {}".format(train_score))
print("Validation score {}".format(vali_score))

# Disabled alternative: repeated k-fold mean absolute error over the full data.
# cv = RepeatedKFold(n_splits=2, n_repeats=2, random_state=1)
# scores = cross_val_score(model, Y, X, cv=cv, error_score="raise", scoring="neg_mean_absolute_error")
# print("Mean Absolute error {}".format(absolute(scores.mean())))

In [11]:
# Predict the validation targets and keep a plain-list copy of the values.
y_pred = model.predict(x_valid)
predictions = list(y_pred)

In [None]:
# Root-mean-squared error of the validation predictions.
validation_mse = mean_squared_error(y_valid.values, predictions)
rmse = np.sqrt(validation_mse)
print("RMSE {}".format(rmse))

In [13]:
# Side-by-side frame of predicted and actual targets for the validation rows.
pred = pd.DataFrame(y_pred, columns=["y_prediction"])
# Join on a positionally re-indexed copy. Resetting y_valid in place added an
# "index" column to the shared validation target, which broke re-running this
# cell and any later scoring against y_valid.
df2 = pred.join(y_valid.reset_index())
comparison = df2[["y_prediction", "Orders/Followers"]]

In [None]:
# Show the distinct encoded values of the "size" column among the model inputs.
Y["size"].unique()

In [None]:
# Sweep "Posts" over 0.1..1.0 while every other model input stays at its column
# mean, printing the prediction for each value. "Posts" is one of the min-max
# scaled numeric columns.
# The mean row does not change between iterations, so compute it once.
mean_row = Y.describe().loc[["mean"]]
for i in range(1, 11):
  # i / 10 avoids the float noise of i/10 + 0.1 (e.g. 0.30000000000000004).
  temp_value = i / 10
  pred_row_df = mean_row.copy()
  pred_row_df["Posts"] = temp_value
  pred = model.predict(pred_row_df)
  print("Post value", temp_value)
  print("\tPrediction", pred[0])

In [None]:
# Sweep the encoded "size" input over 0.1, 1.1, 2.1, 3.1 while every other
# model input stays at its column mean, printing the prediction for each value.
# NOTE(review): the +0.1 offset places each probe just above an integer code;
# confirm that this is intended.
# The mean row does not change between iterations, so compute it once.
mean_row = Y.describe().loc[["mean"]]
for i in range(0, 4):
  temp_value = i + 0.1
  pred_row_df = mean_row.copy()
  pred_row_df["size"] = temp_value
  pred = model.predict(pred_row_df)
  print("size value", temp_value)
  print("\tPrediction", pred[0])