In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the data set into a DataFrame.
# NOTE(review): the path points into one user's Downloads folder; adjust it
# (or make it relative to the project) before running elsewhere.
DATA_PATH = '~/Downloads/mpa.csv'
mpa = pd.read_csv(DATA_PATH)

# Preview the first five observations.
mpa.head()

In [None]:
mpa.shape  # (number of rows, number of columns) of the loaded frame

In [None]:
mpa.info()  # per-column dtype and non-null count, plus memory usage

In [None]:
mpa.describe()  # summary statistics (count, mean, std, min, quartiles, max) per numeric column

In [None]:
# EXPLORATORY DATA ANALYSIS
# Histograms for a subset of the independent variables.
hist_columns = ['Cement', 'Water', 'Superplasticizer', 'CA', 'FA', 'Age']
bfsa = mpa[hist_columns]
bfsa.hist(figsize=(14, 14))
plt.autoscale()

In [None]:
# Scatter plots of concrete compressive strength (CCS) against Cement (left
# panel) and against Water (right panel).
# Original author's note on the Cement panel: shows a strong and positive
# relationship.
plt.figure(figsize=(14, 5))
for panel, feature in enumerate(['Cement', 'Water'], start=1):
    plt.subplot(1, 2, panel)
    plt.scatter(mpa[feature], mpa['CCS'])
    plt.xlabel(feature)
    plt.ylabel('CCS')

plt.show()

In [None]:
# Scatter plots: Coarse Aggregate (CA) against Fine Aggregate (FA) on the left,
# Age against concrete compressive strength (CCS) on the right.
plt.figure(figsize=(14, 5))
panels = [('CA', 'FA'), ('Age', 'CCS')]  # (x column, y column) per panel
for position, (x_col, y_col) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(mpa[x_col], mpa[y_col])
    plt.xlabel(x_col)
    plt.ylabel(y_col)

plt.show()

In [None]:
import seaborn as sns
# Age distribution: density-normalised histogram (100 bins) with a KDE overlay.
# `sns.distplot` is deprecated in seaborn; `histplot` with `kde=True` and
# `stat='density'` is the documented replacement for the same picture.
sns.histplot(x=mpa['Age'], bins=100, kde=True, stat='density')
plt.autoscale()

In [None]:
# Correlation map of the predictor columns (the CCS column is excluded).
corr = mpa.drop('CCS', axis=1).corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr, vmax=1.0, square=True, annot=True, cmap='viridis')
plt.title('CORRELATION BETWEEN FEATURES')
# The previous `mpa.autolayout : True` line was a bare annotation statement
# and had no effect on the figure; request the tight layout explicitly.
plt.tight_layout()

plt.autoscale()
# Also show the correlation matrix as a table below the heatmap.
display(corr)

In [None]:
# DATA PREPROCESSING
# Count the missing values in each column and print the totals.
missing_per_column = mpa.isnull().sum()
print(missing_per_column)

In [None]:
# Derive the binary label column HPCCS from Age and CCS.
# Each rule pairs an Age condition with a CCS threshold; any row that matches
# a rule is labelled "Yes".
age, ccs = mpa['Age'], mpa['CCS']
hpc_rules = [
    (age == 1) & (ccs > 10.4),    # NOTE(review): strict '>' here, '>=' in every other rule - confirm intended
    (age == 3) & (ccs >= 26),
    (age == 7) & (ccs >= 42.25),
    (age == 14) & (ccs >= 58.5),
    (age == 28) & (ccs >= 63.7),
    (age > 28) & (ccs >= 65),
]
for rule in hpc_rules:
    row_index1 = mpa[rule].index
    mpa.loc[row_index1, 'HPCCS'] = "Yes"

# Rows that matched no rule are still NaN at this point; label them "No".
mpa['HPCCS'] = mpa['HPCCS'].fillna("No")
mpa.head()

In [None]:
# Drop the column at position 8 so that only the first eight columns and the
# HPCCS label remain.
# NOTE(review): position 8 is presumably the raw CCS column - confirm against
# the CSV header.
# The guard makes the cell safe to re-run: after the first run HPCCS itself
# sits at position 8, and dropping it would silently remove the label that the
# next cell reads as the last column.
column_to_drop = mpa.columns[8]
if column_to_drop != 'HPCCS':
    mpa = mpa.drop(columns=column_to_drop)
mpa.dtypes

In [None]:
# Feature matrix: the first eight columns as a NumPy array.
X = mpa.iloc[:, :8].to_numpy()
# Target: the last column of the frame (expected to be HPCCS), kept as a Series.
Y = mpa.iloc[:, -1]

In [None]:
# Bar chart of the class counts in the original (un-resampled) labels.
# The data is passed by keyword: recent seaborn versions no longer accept the
# vector as a positional x argument.
plot_sns = sns.countplot(x=Y)
# Look the counts up by label. `value_counts()` orders by frequency, so
# positional unpacking would tie the names to whichever class is larger and
# fail outright if only one class were present.
class_counts = Y.value_counts()
Yes = class_counts.get('Yes', 0)
No = class_counts.get('No', 0)
plt.title('HPC COMPRESSIVE STRENGTH')

In [None]:
# Class counts per Age value.
# Each chart is drawn on the axes returned by `plt.subplots`. Previously an
# extra `plt.subplot(2,3,k)` was created on top of that figure, leaving an
# empty full-size axes behind the real plot. The size matches one cell of the
# old 2x3 grid on an 18x10 canvas.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(x='Age', data=mpa, hue='HPCCS', ax=ax)
ax.set_title("Impact of Age on HPC Compressive Strength")
plt.show()

# Class counts per Cement value.
# The former `ax.set_ylim(0,1)` only affected the empty background axes and is
# dropped; applying it to the count plot would clip every bar.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(x='Cement', data=mpa, hue='HPCCS', ax=ax)
ax.set_title("Impact of Cement on HPC Compressive Strength")

plt.show()

In [None]:
#HIGH DATA IMBALANCE. 
#SMOTE
from imblearn.over_sampling import SMOTE
# Oversample the minority class with SMOTE (seeded for reproducibility).
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias has been removed and raises AttributeError on recent releases.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, Y)
X_res.shape, y_res.shape


In [None]:
from collections import Counter
# Tally the class labels before and after resampling, then report both.
original_counts = Counter(Y)
resampled_counts = Counter(y_res)
print('Original dataset shape{}'.format(original_counts))
print('Resampled dataset shape{}'.format(resampled_counts))

In [None]:
#VIEW BALANCED SAMPLE
# Plot and report the class counts of the resampled labels. Previously the
# printed numbers were taken from the original `Y`, so this cell showed the
# imbalanced counts under a "balanced" heading.
sns.countplot(x=y_res)
# Wrapping in a Series works whether the resampler returned an array or a
# Series. Counts are looked up by label, not by frequency rank.
balanced_counts = pd.Series(y_res).value_counts()
No = balanced_counts.get('No', 0)
Yes = balanced_counts.get('Yes', 0)
plt.title('HPC COMPRESSIVE STRENGTH')
print('No: ',No)
print('Yes : ',Yes)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [None]:
# Both samples are split the same way: 30% held out, fixed seed.
split_options = dict(test_size=0.3, random_state=0)

# BALANCED SAMPLE (SMOTE output).
# NOTE(review): the resampling happened before this split, so synthetic rows
# derived from training neighbours can land in the test split - consider
# splitting first and resampling the training part only.
X_res_train, X_res_test, y_res_train, y_res_test = train_test_split(
    X_res, y_res, **split_options
)

# UNBALANCED SAMPLE (original data).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [None]:
#LOGISTIC REGRESSION
#BALANCED SAMPLE
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_res_train, y_res_train)
predicted = logmodel.predict(X_res_test)
print(f'Accuracy Score:\n{accuracy_score(y_res_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X_res would include the training rows and inflate it.
probs = logmodel.predict_proba(X_res_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_res_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#IMBALANCED SAMPLE
# Logistic regression trained on the original (un-resampled) training split.
logmodel = LogisticRegression()
logmodel.fit(X_train, Y_train)
predicted = logmodel.predict(X_test)
print(f'Accuracy Score:\n{accuracy_score(Y_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X would include the training rows and inflate it.
probs = logmodel.predict_proba(X_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#RANDOM FOREST
#BALANCED SAMPLE
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)
model.fit(X_res_train, y_res_train)
predicted = model.predict(X_res_test)
print(f'Accuracy Score:\n{accuracy_score(y_res_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X_res would include the training rows and inflate it.
probs = model.predict_proba(X_res_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_res_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#IMBALANCED SAMPLE
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=0)
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
print(f'Accuracy Score:\n{accuracy_score(Y_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X would include the training rows and inflate it.
probs = model.predict_proba(X_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#ADABOOST
#BALANCED SAMPLE
from sklearn.ensemble import AdaBoostClassifier
# Seeded like the random-forest cells so repeated runs give the same scores.
classifier = AdaBoostClassifier(random_state=0)
classifier.fit(X_res_train, y_res_train)
predicted = classifier.predict(X_res_test)
print(f'Accuracy Score:\n{accuracy_score(y_res_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X_res would include the training rows and inflate it.
probs = classifier.predict_proba(X_res_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_res_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#IMBALANCED SAMPLE
# AdaBoost trained on the original (un-resampled) training split.
# Seeded like the random-forest cells so repeated runs give the same scores.
classifier = AdaBoostClassifier(random_state=0)
classifier.fit(X_train, Y_train)
predicted = classifier.predict(X_test)
print(f'Accuracy Score:\n{accuracy_score(Y_test, predicted)}')
# Probabilities must come from `classifier` (the AdaBoost model fitted above).
# The earlier code called `model.predict_proba`, i.e. the random forest from a
# previous cell, so the reported AUC belonged to a different model.
# AUC is scored on the held-out split, the same rows used for accuracy.
probs = classifier.predict_proba(X_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#DECISION TREE
#BALANCED SAMPLE
from sklearn.tree import DecisionTreeClassifier
# Seeded like the random-forest cells so repeated runs give the same scores.
runmodel = DecisionTreeClassifier(random_state=0)
runmodel.fit(X_res_train, y_res_train)
predicted = runmodel.predict(X_res_test)
print(f'Accuracy Score:\n{accuracy_score(y_res_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X_res would include the training rows and inflate it.
probs = runmodel.predict_proba(X_res_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_res_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#IMBALANCED SAMPLE
# Decision tree trained on the original (un-resampled) training split.
# Seeded like the random-forest cells so repeated runs give the same scores.
runmodel = DecisionTreeClassifier(random_state=0)
runmodel.fit(X_train, Y_train)
predicted = runmodel.predict(X_test)
print(f'Accuracy Score:\n{accuracy_score(Y_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X would include the training rows and inflate it.
probs = runmodel.predict_proba(X_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#GRADIENT BOOST
#BALANCED SAMPLE
from sklearn.ensemble import GradientBoostingClassifier
# Seeded like the random-forest cells so repeated runs give the same scores.
gbmodel = GradientBoostingClassifier(random_state=0)
gbmodel.fit(X_res_train, y_res_train)
predicted = gbmodel.predict(X_res_test)
print(f'Accuracy Score:\n{accuracy_score(y_res_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X_res would include the training rows and inflate it.
probs = gbmodel.predict_proba(X_res_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_res_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#IMBALANCED SAMPLE
# Gradient boosting trained on the original (un-resampled) training split.
# Seeded like the random-forest cells so repeated runs give the same scores.
gbmodel = GradientBoostingClassifier(random_state=0)
gbmodel.fit(X_train, Y_train)
predicted = gbmodel.predict(X_test)
print(f'Accuracy Score:\n{accuracy_score(Y_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X would include the training rows and inflate it.
probs = gbmodel.predict_proba(X_test)
# scikit-learn sorts class labels, so the last column is P('Yes').
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#XGBOOST
#BALANCED SAMPLE
# RandomizedSearchCV is used in a later cell to tune the XGBoost
# hyper-parameters.
from sklearn.model_selection import RandomizedSearchCV
import xgboost
# XGBoost classifier built with library defaults; it is the base estimator
# handed to the randomised search.
xgclassifier=xgboost.XGBClassifier()

In [None]:
#Hyper parameter optimisation

# Candidate values sampled by the randomised hyper-parameter search; each key
# is an XGBClassifier keyword argument.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
# Randomised search over `params`: 5 sampled configurations, 10-fold CV,
# ranked by ROC AUC, run on all available cores.
# `random_state` pins which configurations are sampled so the search (and the
# best estimator it reports) is reproducible across runs.
random_search = RandomizedSearchCV(
    xgclassifier,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=10,
    verbose=3,
    random_state=0,
)

In [None]:
# Run the randomised search on the balanced (SMOTE-resampled) training split.
random_search.fit(X_res_train, y_res_train)

In [None]:
# Show the best estimator found by the search on the balanced split
# (last expression in the cell, so it is rendered as the cell output).
random_search.best_estimator_

In [None]:
# Fixed XGBoost configuration for the balanced sample.
# NOTE(review): these values look like a pasted `best_estimator_` repr from an
# earlier search run - confirm they still match the current search output.
xgb_balanced_kwargs = {
    'base_score': 0.5,
    'booster': 'gbtree',
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'colsample_bytree': 0.4,
    'gamma': 0.4,
    'learning_rate': 0.15,
    'max_delta_step': 0,
    'max_depth': 12,
    'min_child_weight': 1,
    'missing': None,
    'n_estimators': 100,
    'n_jobs': 1,
    'nthread': None,
    'objective': 'binary:logistic',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': None,
    'silent': None,
    'subsample': 1,
    'verbosity': 1,
}
xgclassifier = xgboost.XGBClassifier(**xgb_balanced_kwargs)

In [None]:
# Fit the configured XGBoost model on the balanced training split and score it.
xgclassifier.fit(X_res_train, y_res_train)
predicted = xgclassifier.predict(X_res_test)
print(f'Accuracy Score:\n{accuracy_score(y_res_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X_res would include the training rows and inflate it.
probs = xgclassifier.predict_proba(X_res_test)
# Last probability column is taken as P('Yes'), as in the other model cells.
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(y_res_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#IMBALANCED
# Re-run the same search object on the original (un-resampled) training split;
# this replaces the results of the earlier fit on the balanced split.
random_search.fit(X_train, Y_train)

In [None]:
# Show the best estimator found by the search on the imbalanced split.
random_search.best_estimator_

In [None]:
# Fixed XGBoost configuration for the imbalanced sample.
# NOTE(review): these values look like a pasted `best_estimator_` repr from an
# earlier search run - confirm they still match the current search output.
xgb_imbalanced_kwargs = {
    'base_score': 0.5,
    'booster': 'gbtree',
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'colsample_bytree': 0.7,
    'gamma': 0.3,
    'learning_rate': 0.1,
    'max_delta_step': 0,
    'max_depth': 10,
    'min_child_weight': 1,
    'missing': None,
    'n_estimators': 100,
    'n_jobs': 1,
    'nthread': None,
    'objective': 'binary:logistic',
    'random_state': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': None,
    'silent': None,
    'subsample': 1,
    'verbosity': 1,
}
xgclassifier = xgboost.XGBClassifier(**xgb_imbalanced_kwargs)

In [None]:
# Fit the configured XGBoost model on the original (un-resampled) training
# split and score it.
xgclassifier.fit(X_train, Y_train)
predicted = xgclassifier.predict(X_test)
print(f'Accuracy Score:\n{accuracy_score(Y_test, predicted)}')
# AUC is scored on the held-out split, the same rows used for accuracy.
# Scoring on all of X would include the training rows and inflate it.
probs = xgclassifier.predict_proba(X_test)
# Last probability column is taken as P('Yes'), as in the other model cells.
preds = probs[:, -1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, preds, pos_label='Yes')
roc_auc = metrics.auc(fpr, tpr)
print('AUC: \n%.4f' % roc_auc)

In [None]:
#TREE BASED FEATURE SELECTION

In [None]:
# Fit a seeded random forest on the imbalanced training split (all cores) and
# keep its per-feature importance scores.
rf = RandomForestClassifier(n_jobs=-1, random_state=0).fit(X_train, Y_train)
feat_imp = rf.feature_importances_

In [None]:
# Display the importance array; entries follow the column order of X
# (the first eight columns of mpa).
feat_imp