<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/Adult_LinReg_and_RandomForest(XGBoost).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score, log_loss, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

In [None]:
# Load the training table from 'adult_train.csv' (path is relative to the
# notebook's working directory).
df = pd.read_csv('adult_train.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Drop the 'Education' column. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns='Education', errors='ignore')

In [None]:
# Preview the frame again to confirm the remaining columns.
df.head()

In [None]:
# NOTE(review): dropna() returns a new frame and the result is not assigned,
# so `df` itself is left unchanged; this cell only displays the NaN-free copy.
df.dropna()

In [None]:
# Summary of the frame (columns, dtypes, non-null counts) to spot missing values.
df.info()

In [None]:
# Fill missing values in the categorical columns with each column's most
# frequent value. The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the `df[col]` selection: that chained in-place form is
# deprecated and does not update `df` when pandas Copy-on-Write is enabled.
for col in ['Workclass', 'Occupation', 'Country']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
# Re-check the non-null counts now that the categorical gaps have been filled.
df.info()

In [None]:
# Report how many distinct values each column has; for low-cardinality
# columns (fewer than 12 distinct values) also list the values themselves.
for column in df:
    unique_val = np.unique(df[column])
    nr_val = len(unique_val)
    if nr_val >= 12:
        print('The number of values for feature {} : {}'.format(column, nr_val))
        continue
    print('The number of values for feature {} : {} -- :{}'.format(column, nr_val, unique_val))

In [None]:
# Bar chart of how many rows fall into each Target class.
fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(data=df, x='Target', ax=ax)

In [None]:
# Class balance of the target as fractions of all rows (normalize=True).
df['Target'].value_counts(normalize = True)

In [None]:
# Integer-encode the 'Race' and 'Sex' columns in place. One encoder object is
# re-fitted per column, so afterwards `le` only holds the classes of 'Sex'.
le = LabelEncoder()
for col in ('Race', 'Sex'):
    df[col] = le.fit_transform(df[col])

In [None]:
# Preview the frame after 'Race' and 'Sex' were label-encoded.
df.head()

In [None]:
# NOTE(review): these X / y (features vs. 'Age') are never read — both names
# are reassigned from new_df (features vs. 'Target') before their first use.
# Looks like a leftover from a regression experiment; confirm and remove.
X = df.drop('Age', axis = 1)
y = df['Age']

In [None]:
# One-hot encode the remaining categorical columns into a new frame; `df`
# itself is left untouched.
categorical_columns = ['Workclass', 'Martial_Status', 'Occupation', 'Relationship', 'Country']
new_df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Preview the one-hot encoded frame.
new_df.head()

In [None]:
# Integer-encode the target labels into new_df (df keeps the original labels).
# The shared encoder `le` is re-fitted here, so from now on it holds the
# Target classes only.
new_df['Target'] = le.fit_transform(df['Target'])

In [None]:
# Preview new_df with the encoded Target column.
new_df.head()

In [None]:
# Row counts per encoded 'Sex' value, split by encoded Target class.
fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(x='Sex', hue='Target', data=new_df, ax=ax)

In [None]:
# Row counts per encoded 'Race' value, split by encoded Target class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Race', hue='Target', palette='Greens_r', data=new_df, ax=ax)

In [None]:
# Row counts per 'Relationship' category, split by Target. This uses `df`
# (not new_df) because 'Relationship' is one-hot encoded in new_df.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Relationship', hue='Target', palette='Greens_r', data=df, ax=ax)

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# `df` still contains non-numeric categorical columns (they are one-hot encoded
# only in new_df), and DataFrame.corr() raises on those in pandas >= 2.0, so
# the numeric columns are selected explicitly first.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize = (12,10))
sns.heatmap(corr, annot = True)

In [None]:
# Separate the encoded target vector from the feature matrix.
y = new_df['Target']
X = new_df.drop(columns='Target')

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Hold out 30% of the rows as a test set; random_state fixes the split so it
# is the same on every run.
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
# Baseline logistic regression with the L-BFGS solver; every other setting
# (including the regularisation strength C) is left at its default.
model = LogisticRegression(random_state=10, solver = 'lbfgs')

In [None]:
# Fit the baseline model on the training split only.
model.fit(X_train, y_train)

In [None]:
# Predicted labels for the TRAINING split (not the test split).
y_pred = model.predict(X_train)

In [None]:
# Mean accuracy of the baseline model on each split.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("The Training Accuracy is: ", train_accuracy)
print("The Testing Accuracy is: ", test_accuracy)

# Per-class precision / recall / F1. y_pred holds training-set predictions,
# so this report describes the training split.
train_report = classification_report(y_train, y_pred)
print(train_report)

In [None]:
def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a seaborn heatmap with a fixed 0..1 colour scale.

    Parameters
    ----------
    cm : array-like
        Matrix to draw. The colour range is pinned to [0, 1], so callers are
        expected to pass a normalised matrix.
    classes : sequence, optional
        Tick labels for both axes. When given, cells are also annotated with
        their values and the "YlGnBu" colour map is used; when omitted, a plain
        heatmap with seaborn's defaults is drawn.
    title : str
        Title placed above the plot.
    """
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        heatmap_options.update(
            cmap="YlGnBu",
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size': 50},
        )
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:

# Confusion matrix of the baseline model on the training split, normalised
# per true class (each row sums to 1) so it fits the plot's 0..1 colour scale.
cm = confusion_matrix(y_train, y_pred)
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals

plot_confusion_matrix(cm_norm, title='Confusion matrix', classes=model.classes_)

In [None]:
# Predicted class probabilities for the training split.
pred_proba = model.predict_proba(X_train)

In [None]:
# Log loss of the baseline model on both splits (lower is better).
pred_proba_t = model.predict_proba(X_test)

train_log_loss = log_loss(y_train, pred_proba)
test_log_loss = log_loss(y_test, pred_proba_t)

print("The Log Loss on Training is: ", train_log_loss)
print("The Log Loss on Testing Dataset is: ", test_log_loss)

**Hyper Parameter Tuning**

In [None]:
# Visual comparison of two 20-point grids over [1e-5, 1e5], plotted against
# their index: geometric spacing vs. linear spacing.
plt.plot(np.geomspace(1e-5, 1e5, num=20)) # evenly spaced on a log scale
plt.plot(np.linspace(1e-5, 1e5, num=20)) # evenly spaced on a linear scale

In [None]:
# Sweep the logistic-regression C parameter over a log-spaced grid and record
# test-set accuracy (CA) and test-set log loss for every fitted model.
C_List = np.geomspace(1e-5, 1e5, num = 20)
CA, Logarithmic_Loss = [], []
for c in C_List:
    log_reg2 = LogisticRegression(C=c, solver='lbfgs', random_state=10).fit(X_train, y_train)

    pred_proba_t = log_reg2.predict_proba(X_test)
    score = log_reg2.score(X_test, y_test)
    log_loss2 = log_loss(y_test, pred_proba_t)

    CA.append(score)
    Logarithmic_Loss.append(log_loss2)

    print("The CA of C parameter {} is {}:".format(c, score))
    print("The Logg Loss of C parameter {} is {}:".format(c, log_loss2))
    print("")

In [None]:
# Collect the sweep results into one table. The arrays take their length from
# the sweep itself rather than a hard-coded reshape(20,), so changing the size
# of C_List does not break this cell.
CA2 = np.asarray(CA)
Logarithmic_Loss2 = np.asarray(Logarithmic_Loss)

# Building from a dict makes pandas raise if the three sequences differ in
# length (e.g. stale CA from an earlier run) instead of silently truncating.
df_outcomes = pd.DataFrame({
    "C_List": C_List,
    "CA2": CA2,
    "Logarithmic_Loss2": Logarithmic_Loss2,
})

# Only the last expression of a cell is rendered, so show the table once,
# ordered from lowest (best) to highest log loss.
df_outcomes.sort_values("Logarithmic_Loss2", ascending = True).reset_index()

Из датафрейма выше следует выбирать те значения параметра C, при которых логарифмические потери меньше, так как такая модель будет лучше работать на новых, ранее не виденных данных.


In [None]:
# Another way to do the above:
# scikit-learn provides LogisticRegressionCV, a logistic regression with
# built-in cross-validation for choosing the optimal C parameter.

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold

# 3-fold shuffled splitter with a fixed seed, used for the internal CV below.
kf = KFold(n_splits=3, random_state=0, shuffle=True)

# Logistic Reg CV
# The splitter is passed via cv=kf; previously kf was created but never used,
# so the estimator silently fell back to its default cross-validation scheme.
Log_reg3 = LogisticRegressionCV(random_state=15, Cs = C_List, cv = kf, solver ='lbfgs')
Log_reg3.fit(X_train, y_train)
print("The CA is:", Log_reg3.score(X_test, y_test))
pred_proba_t = Log_reg3.predict_proba(X_test)
log_loss3 = log_loss(y_test, pred_proba_t)
print("The Logistic Loss is: ", log_loss3)

# C_ holds the C value selected by cross-validation.
print("The optimal C parameter is: ", Log_reg3.C_)

In [None]:
# Repeat the C sweep, this time also drawing the normalised training-set
# confusion matrix of every fitted model.
C_List = np.geomspace(1e-5, 1e5, num=20)
CA = []
Logarithmic_Loss = []

for c in C_List:
    log_reg2 = LogisticRegression(random_state=10, solver = 'lbfgs', C=c)
    log_reg2.fit(X_train, y_train)

    # Test-set accuracy for this C
    score = log_reg2.score(X_test, y_test)
    CA.append(score)
    print("The CA of C parameter {} is {}:".format(c, score))

    # Test-set log loss for this C
    pred_proba_t = log_reg2.predict_proba(X_test)
    log_loss2 = log_loss(y_test, pred_proba_t)
    Logarithmic_Loss.append(log_loss2)
    print("The Logg Loss of C parameter {} is {}:".format(c, log_loss2))
    print("")

    # Row-normalised confusion matrix on the training split
    y_pred = log_reg2.predict(X_train)
    cm = confusion_matrix(y_train, y_pred)
    cm_norm = cm / cm.sum(axis=1).reshape(-1,1)
    # Take the labels from the model that produced this matrix (not from the
    # unrelated baseline `model`), and put C in the title so the 20 figures
    # can be told apart.
    plot_confusion_matrix(cm_norm, classes = log_reg2.classes_, title='Confusion matrix (C = {})'.format(c))
    plt.show()
  

In [None]:
# Refit with one hand-picked C and report its test metrics.
# NOTE(review): 0.00011288 appears to be the third value of
# np.geomspace(1e-5, 1e5, num=20), rounded — confirm it was taken from the sweep.
log_reg3 = LogisticRegression(C=0.00011288, solver='lbfgs', random_state=10).fit(X_train, y_train)

pred_proba_t = log_reg3.predict_proba(X_test)
score = log_reg3.score(X_test, y_test)
log_loss2 = log_loss(y_test, pred_proba_t)

print("Testing Acc:", score)
print("Log Loss:", log_loss2)

***Случайный лес (Random Forest)***

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 20 entropy-criterion trees. random_state is fixed so the
# reported accuracies are reproducible across runs (same seed as the
# max_features / max_depth grid search later in the notebook).
rf = RandomForestClassifier(criterion='entropy', n_estimators=20, random_state=120)

In [None]:
# Fit the forest on the training split only.
rf.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out test split.
rf_prediction_test = rf.predict(X_test)

In [None]:
# Mean accuracy of the forest on each split; a large gap between the two
# indicates overfitting.
rf_train_accuracy = rf.score(X_train, y_train)
rf_test_accuracy = rf.score(X_test, y_test)
print('The training accuracy is ', rf_train_accuracy)
print('The testing accuracy is ', rf_test_accuracy)

In [None]:
# Test-set confusion matrix of the forest, normalised per true class.
cm = confusion_matrix(y_test, rf_prediction_test)
cm_norm = cm / cm.sum(axis=1, keepdims=True)

plt.figure()
plot_confusion_matrix(cm_norm, classes=rf.classes_)

In [None]:
from itertools import product

# Manual grid search over max_features x max_depth for a 100-tree forest.
# For every combination: fit on the training split, print the test accuracy
# and draw the row-normalised test-set confusion matrix in its own figure.
n_estimators = 100
max_features = [1, 'sqrt', 'log2']
max_depth = [None, 2, 3, 4, 5]
for f, d in product(max_features, max_depth):
    rf = RandomForestClassifier(
        criterion='entropy',
        n_estimators=n_estimators,
        max_features=f,
        max_depth=d,
        random_state=120,
        n_jobs=2,
    )
    rf.fit(X_train, y_train)
    rf_prediction_test = rf.predict(X_test)

    # Computed once and reused for both the printed line and the plot title.
    test_accuracy = accuracy_score(y_test, rf_prediction_test)
    print('Classification accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(f, d, test_accuracy))

    cm = confusion_matrix(y_test, rf_prediction_test)
    cm_norm = cm / cm.sum(axis=1, keepdims=True)
    plt.figure(figsize=(12, 9))
    plot_confusion_matrix(
        cm_norm,
        classes=rf.classes_,
        title='Confusion matrix accuracy on test set with max features = {} and max_depth = {}: {:.3f}'.format(f, d, test_accuracy),
    )


In [None]:
 #%pip install https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/master/xgboost-1.6.0.dev0%2B1d468e20a4fff83f3149e99371b67e6b31f64152-py3-none-manylinux2014_x86_64.whl


In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [None]:
# Base XGBoost classifier for the hyperparameter search.
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled
# XGBoost build and a CUDA device at fit time — confirm the runtime has one.
classifier = xgboost.XGBClassifier(tree_method='gpu_hist')

# Candidate values for each hyperparameter; the randomized search samples
# combinations from these lists.
params = {
          "learning_rate":[0.05,0.10,0.15,0.20,0.25,0.30],
          "max_depth":[2,3,4,5,6,8,10,12,15],
          "min_child_weight":[1,3,5,7],
          "gamma":[0.0,0.1,0.2,0.3,0.4],
          "colsample_bytree":[0.3,0.4,0.5,0.7]
}

In [None]:
# Randomized search: 5 sampled parameter combinations, each scored by 5-fold
# cross-validated ROC AUC. random_state fixes which combinations are sampled
# so best_params_ is reproducible from run to run.
clf = RandomizedSearchCV(classifier, param_distributions = params, n_iter = 5, cv = 5, verbose=2, scoring = 'roc_auc', random_state = 42)

In [None]:
# Run the search on the training split only, so the held-out test rows play
# no part in choosing the hyperparameters.
clf.fit(X_train, y_train)

In [None]:
# Estimator refitted with the best parameter combination found by the search.
clf.best_estimator_

In [None]:
# The winning hyperparameter combination as a dict.
clf.best_params_

In [None]:
# Final XGBoost model with hard-coded hyperparameters.
# NOTE(review): these values were presumably copied from an earlier
# clf.best_params_ output; they are not read from the current search, so
# verify they still match (learning_rate is left at its default here).
final_model = xgboost.XGBClassifier(colsample_bytree=0.5, gamma=0.1, max_depth=12, min_child_weight=5,
              tree_method='gpu_hist')

In [None]:
# Train on the training split only; fitting on the full X / y would include
# the held-out test rows and leave nothing unseen to evaluate on.
final_model.fit(X_train, y_train)

In [None]:
# Predicted labels for the full feature matrix X, which contains both the
# training rows and the rows held out as X_test.
pred_xgboost = final_model.predict(X)

In [None]:
# Confusion matrix of the XGBoost model on the held-out test split,
# normalised per true class. Predictions are computed here so that labels and
# predictions are guaranteed to refer to the same rows.
pred_xgboost_test = final_model.predict(X_test)
cm = confusion_matrix(y_test, pred_xgboost_test)
cm_norm = cm/cm.sum(axis=1)[:, np.newaxis]
plt.figure()
# Axis labels come from the model being evaluated, not from the random forest.
plot_confusion_matrix(cm_norm, classes=final_model.classes_)

In [None]:
# Score on the held-out test split so the number matches its label; scoring
# on the full X / y mixes in the training rows.
print('The testing accuracy is ', final_model.score(X_test, y_test))