<a href="https://colab.research.google.com/github/crisbpadilla/CapstoneProject-1-M2M/blob/main/CapstoneProject_2_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
import numpy as np

concrete=pd.read_csv('concrete.csv')

In [20]:
from bokeh.plotting import figure, output_notebook, show, output_notebook
from bokeh.models import ColumnDataSource, LinearColorMapper,ColorBar

output_notebook()
material_corr=concrete.corr()

material_corr = material_corr.stack().reset_index()
material_corr.columns = ['x', 'y', 'value']

material_corr['value'] = material_corr['value'].round(2)

source = ColumnDataSource(material_corr)

# Define color mapper
color_mapper = LinearColorMapper(palette="Inferno256", low=-1, high=1)

# Create figure
p = figure(title="Correlation Heatmap", x_range=list(concrete.columns), y_range=list(concrete.columns),
           x_axis_location="above", width=600, height=600,
           tools="hover", tooltips="@x, @y: @value{0.2f}")

p.rect(x="x", y="y", width=1, height=1, source=source,
       fill_color={'field': 'value', 'transform': color_mapper}, line_color=None)

# Add text annotations
from bokeh.models import LabelSet
labels = LabelSet(x='x', y='y', text='value', source=source,text_align='center',
                  text_baseline='middle',text_font_size='8pt',text_color='white')
p.add_layout(labels)

# Add color bar
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')

p.xaxis.major_label_orientation = 0.75

# Show plot
show(p)


Reviewing the heatmap it is possible to observe the inverse correlation between water an superplastic, variable that could be flattened as they are both lightly correlated ( superplastic is used to speed up the compressive strength gain and the reduction of water in the mix).
 So we will check the performance of the model, reducing the demionsionality of this model , taking water and superplastic into one unique component.

# Preprocessing data

In [21]:
concrete_pluscat=concrete.copy()
# cuales son los escenarios? 9 (matriz 3x3) , la mitad menos 1 ~ 3
#a) slag-ash b)slag-superplastic c)ash-superplastic

concrete_pluscat['type_mix'] = 'all mix'
concrete_pluscat.loc[
    (concrete_pluscat['slag']>0) & (concrete_pluscat[['superplastic','ash']] == 0).all(axis=1),
    'type_mix',
] = 'slag'
concrete_pluscat.loc[
    (concrete_pluscat['ash']>0) & (concrete_pluscat[['superplastic','slag']] == 0).all(axis=1),
    'type_mix',
] = 'ash'
concrete_pluscat.loc[
    (concrete_pluscat['superplastic']>0) & (concrete_pluscat[['slag','ash']] == 0).all(axis=1),
    'type_mix',
] = 'superplastic'

concrete_pluscat.loc[
    (concrete_pluscat[['slag', 'ash']].gt(0).all(axis=1)) & (concrete_pluscat['superplastic'] == 0),
    'type_mix',
] = 'slag-ash'

concrete_pluscat.loc[
    (concrete_pluscat[['slag', 'superplastic']].gt(0).all(axis=1)) & (concrete_pluscat['ash'] == 0),
    'type_mix',
] = 'slag-superplastic'

concrete_pluscat.loc[
    (concrete_pluscat[['ash', 'superplastic']].gt(0).all(axis=1)) & (concrete_pluscat['slag'] == 0),
    'type_mix',
] = 'ash-superplastic'
concrete_pluscat.loc[
    (concrete_pluscat['cement']>0) & (concrete_pluscat[['superplastic','slag','ash']] == 0).all(axis=1),
    'type_mix',
] = 'just cement'

In [22]:
# Create the dataframe with all the numerical and categorical values
concrete_data=pd.get_dummies(data=concrete_pluscat,columns=['type_mix'],drop_first=True).astype(int)
concrete_data.head()


Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength,type_mix_ash,type_mix_ash-superplastic,type_mix_just cement,type_mix_slag,type_mix_slag-superplastic,type_mix_superplastic
0,141,212,0,203,0,971,748,28,29,0,0,0,1,0,0
1,168,42,124,158,10,1080,796,14,23,0,0,0,0,0,0
2,250,0,95,187,5,956,861,28,29,0,1,0,0,0,0
3,266,114,0,228,0,932,670,28,45,0,0,0,1,0,0
4,154,183,0,193,9,1047,696,28,18,0,0,0,0,1,0


In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X=concrete_data.drop(columns=['strength'])
y=concrete_data['strength']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)


In [24]:
# Seprating my numeric variables and nominal/categorical values
num_col=[a for a in concrete.columns]
num_col.remove('strength')
#numerical values
X_train_num=X_train[num_col]
X_test_num=X_test[num_col]
#nominal values
X_train_nom=X_train.drop(columns=num_col)
X_test_nom=X_test.drop(columns=num_col)

##Scalling the data

In [25]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scale_data=StandardScaler()
print(pd.DataFrame(scale_data.fit_transform(X_train_num)))

X_train_num_scale=scale_data.fit_transform(X_train_num)
X_test_num_scale=scale_data.fit_transform(X_test_num)


            0         1         2         3         4         5         6  \
0   -1.343182  1.627717 -0.837718  0.972573 -1.028085 -0.004291 -0.320256   
1   -0.274105 -0.871661  0.992631  0.297612 -0.181582  0.744919 -0.210622   
2    1.844611  0.507713 -0.837718 -1.997254  4.389537 -1.568430  0.800451   
3    0.328466  1.026452 -0.837718 -0.152362  0.664922 -0.714068 -0.186258   
4   -0.866957  1.368348 -0.837718  2.097507 -1.028085 -0.516908 -1.270420   
..        ...       ...       ...       ...       ...       ...       ...   
767  0.688064 -0.871661 -0.837718  0.207617 -1.028085  1.034088 -0.052261   
768 -0.021414 -0.871661 -0.837718  0.432604 -1.028085 -0.043723  0.995356   
769  0.688064 -0.871661 -0.837718  0.972573 -1.028085  0.035142  0.008647   
770  0.503405 -0.871661 -0.837718  0.477602 -1.028085  0.087718  0.617727   
771  1.621077 -0.588712  0.387685 -0.872320  0.664922 -0.056867 -0.758793   

            7  
0    0.653619  
1    0.134458  
2   -0.613744  
3   -0.2930

In [26]:
from sklearn import linear_model
lineal=linear_model.LinearRegression()
lineal.fit(X_train_num_scale,y_train)
print(lineal.score(X_train_num_scale,y_train))
print(lineal.score(X_test_num_scale,y_test))

0.6324271390119776
0.5682664251510813


In [49]:
#lineal regession gives a r2 too low and the function seems to fit to a log function
from sklearn.preprocessing import PolynomialFeatures,FunctionTransformer

from sklearn.pipeline import make_pipeline

cubic = PolynomialFeatures(degree=3, include_bias=False)
X_train_scale_cubic = cubic.fit_transform(X_train_num_scale)
X_test_scale_cubic = cubic.fit_transform(X_test_num_scale)

reg=linear_model.LinearRegression()
reg.fit(X_train_scale_cubic,y_train)
prediction_train=reg.predict(X_train_scale_cubic)
prediction_test=reg.predict(X_test_scale_cubic)

print(reg.score(X_train_scale_cubic,y_train))
print(reg.score(X_test_scale_cubic,y_test))


0.932563353375674
0.6485457260518314


In [29]:


X_train_ln_full = np.hstack([X_train_scale_cubic, X_train_nom])
X_test_ln_full = np.hstack([X_test_scale_cubic, X_test_nom])

X_train_ln_full.shape

(772, 170)

In [30]:
ln_model= linear_model.RidgeCV()
ln_model.fit(X_train_ln_full,y_train)
print(ln_model.score(X_train_ln_full,y_train))
print(ln_model.score(X_test_ln_full,y_test))

0.9293723109216393
0.7305025796580165


## applying PCA

In [54]:
X_train_WS=X_train[['water','superplastic']]
X_train_WS_scaled= scale_data.fit_transform(X_train_WS)
X_test_WS=X_test[['water','superplastic']]
X_test_WS_scaled= scale_data.fit_transform(X_test_WS)


pca=PCA(n_components=1)
X_train_pca=pca.fit_transform(X_train_WS_scaled)
X_test_pca=pca.fit_transform(X_test_WS_scaled)
# train and test data without water and superplastic

col=[a for a in concrete.columns]
items_to_remove = ['strength', 'water', 'superplastic']
col = [c for c in col if c not in items_to_remove]

X_train_non_PCA=X_train[col]
X_test_non_PCA=X_test[col]
#scaled data
X_train_non_PCA_scale=scale_data.fit_transform(X_train_non_PCA)
X_test_non_PCA_scale=scale_data.fit_transform(X_test_non_PCA)

X_train_PCA_scaled=np.hstack([X_train_pca,X_train_non_PCA_scale])
X_test_PCA_scaled=np.hstack([X_test_pca,X_test_non_PCA_scale])


In [55]:
cubic = PolynomialFeatures(degree=3, include_bias=False)
X_train_scale_cubic = cubic.fit_transform(X_train_PCA_scaled)
X_test_scale_cubic = cubic.fit_transform(X_test_PCA_scaled)

reg=linear_model.LinearRegression()
reg.fit(X_train_scale_cubic,y_train)
prediction_train=reg.predict(X_train_scale_cubic)
prediction_test=reg.predict(X_test_scale_cubic)

print(reg.score(X_train_scale_cubic,y_train))
print(reg.score(X_test_scale_cubic,y_test))

0.91064058115028
0.7029438261896024


In this case the R^2 for the train data was reduced but the R^2 for the test data increased in relation with the previous model.
 So, now left to add the nominal variabes to verify the score with this model.

In [56]:

X_train_ln_full = np.hstack([X_train_scale_cubic, X_train_nom])
X_test_ln_full = np.hstack([X_test_scale_cubic, X_test_nom])


In [57]:
ln_model= linear_model.RidgeCV()
ln_model.fit(X_train_ln_full,y_train)
print(ln_model.score(X_train_ln_full,y_train))
print(ln_model.score(X_test_ln_full,y_test))

0.8968277995373203
0.7804836779494999
