In [1]:
import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Load the IFT dataset from the Excel workbook (path is relative to the working directory).
ift_data = pd.read_excel('data/ift_data.xlsx')

In [None]:
# Preview the first rows to see the available columns and some sample values.
ift_data.head()

In [None]:
# Quick glimpse at the number of rows, the column dtypes and the non-null counts.
# info is a method: without the parentheses the cell only shows the bound-method repr.
ift_data.info()

In [None]:
# Scatter of IFT against water content, with labelled axes so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(ift_data['Water_content'], ift_data['IFT'])
ax.set(xlabel='Water_content', ylabel='IFT', title='IFT vs. Water_content');

In [None]:
# Scatter of IFT against elapsed time, with labelled axes so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(ift_data['time_minutes'], ift_data['IFT'])
ax.set(xlabel='time_minutes', ylabel='IFT', title='IFT vs. time_minutes');

In [None]:
# Interactive scatter of IFT against time, with one colour per value of the Gas column.
# NOTE(review): the title mentions water content, but no encoding uses that column — confirm intent.
(
    alt.Chart(ift_data, title='Change in IFT with water content over time for CH4 and CO2')
    .mark_circle(size=60)
    .encode(
        x=alt.X('time_minutes:Q', title='Time'),
        y=alt.Y('IFT:Q'),
        color=alt.Color('Gas:N'),
    )
    .interactive()
)

### Modeling

In [None]:
# Features are the first three columns; the target is the column at position 4.
# NOTE(review): the selection is positional — presumably position 4 is 'IFT'; confirm against the sheet.
x = ift_data.iloc[:, :3]
y = ift_data.iloc[:, 4]

# Shuffle and hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=123, shuffle=True
)

# Numeric and categorical columns need different preprocessing, so each kind gets its
# own sub-pipeline and a ColumnTransformer routes the columns to the right one.
numeric_features = ['Water_content', 'time_minutes']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # fill gaps with the column median
    ('scaler', StandardScaler()),
])

categorical_features = ['Gas']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),  # unseen categories do not raise
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing and the regressor live in one pipeline, so the grid search refits
# the preprocessing inside every cross-validation fold.
ridge_model = Ridge()
reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('Regressor', ridge_model),
])

# Search over the ridge regularisation strength with 5-fold cross-validation.
param_grid = {'Regressor__alpha': [0.1, 0.25, 0.4]}
search = GridSearchCV(reg_pipeline, param_grid, cv=5)
search.fit(X_train, y_train)

# Report the winning parameters and the best pipeline's score on the held-out test set.
print(search.best_params_)
print("model score: %.3f" % search.score(X_test, y_test))


In [None]:
# Intercept of the fitted regressor inside the best pipeline found by the grid search.
model_intercept = search.best_estimator_['Regressor'].intercept_
model_intercept

In [None]:
# Fitted coefficients of the regressor step of the best pipeline.
model_slopes = search.best_estimator_['Regressor'].coef_
model_slopes


In [None]:
# One-column table of the fitted regressor's coefficients.
# NOTE(review): rows carry no feature names; presumably they follow the preprocessor's
# output order (numeric columns first, then the one-hot Gas columns) — confirm.
coeff_parameter = pd.DataFrame({'Coefficient': search.best_estimator_['Regressor'].coef_})
coeff_parameter

In [None]:
# Let's evaluate the model's performance using MAE, MSE and RMSE

In [None]:
# Predict the target for the held-out test rows using the fitted grid search.
y_pred = search.predict(X_test)

In [None]:
# Test-set error metrics; the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

### Model Selection : Trying Gradient boosting

In [None]:
# Same preprocessing as the ridge pipeline, but with gradient-boosted trees as the regressor.
# random_state pins the estimator's internal randomness so reruns give the same model;
# the value matches the seed used for the train/test split.
gb_model = GradientBoostingRegressor(random_state=123)
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('Regressor', gb_model)])
# Tune only the learning rate of the boosting model.
param_grid = {
    'Regressor__learning_rate': [0.1, 0.25, 0.4],
}
# cv=5 is spelled out to match the ridge search; n_jobs=-1 runs the fits in parallel.
# NOTE: this rebinds `search`, so later cells operate on the gradient-boosting search.
search = GridSearchCV(reg_pipeline, param_grid, cv=5, n_jobs=-1)

search.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so print the result explicitly.
print(search.best_params_)
print("model score: %.3f" % search.score(X_test, y_test))

In [None]:
# Predict the target for the held-out test rows using the fitted grid search.
y_pred = search.predict(X_test)

In [None]:
# Test-set error metrics; the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))


In [None]:
# Preview the first held-out feature rows instead of dumping the whole frame.
X_test.head()

In [None]:
# Preview the first held-out target values instead of dumping the whole series.
y_test.head()

In [None]:
# Take the first held-out row as a one-row DataFrame (the list indexer keeps it 2-D).
# .copy() makes it an independent frame, so modifying it afterwards does not trigger
# pandas' SettingWithCopyWarning.
new_data = X_test.iloc[[0]].copy()
new_data

In [None]:
# Predict for the single row held in new_data.
search.predict(new_data)

In [None]:
# Overwrite the value at row 0, column position 1 with 0.7 (in-place edit of new_data).
# NOTE(review): presumably position 1 is 'Water_content' — confirm against the column order.
new_data.iloc[0,1] = 0.7

In [4]:
# Hand-built single observation; values follow the column order given to the DataFrame below.
l = ['CO2', 0.7, 195]
ll = [l]  # list of rows: one inner list per observation
print(ll)
new_data = pd.DataFrame(data=ll, columns=['Gas', 'Water_content', 'time_minutes'])

[['CO2', 0.7, 195]]


In [None]:
# Predict for the row(s) currently held in new_data.
search.predict(new_data)

In [None]:
## Saving the model

In [2]:
# Persist the refitted best pipeline (preprocessor + regressor) so it can be reloaded
# without refitting. `dump` comes from the joblib import in the notebook's import cell.
# Requires `search` to have been fitted earlier in the same session; on a fresh kernel
# this cell raises NameError.
dump(search.best_estimator_, 'model.pkl')

NameError: name 'search' is not defined

In [5]:
# Reload the pipeline saved to 'model.pkl' and predict for the rows in new_data.
# NOTE: joblib's load unpickles the file — only load files you created or trust.
model1 = load('model.pkl')

model1.predict(new_data)

array([19.93307007])

In [None]:
# Scratch check: int() drops the fractional part of the first entry (1.2 -> 1),
# replacing it in place while the remaining entries stay as they are.
l = [1.2, 1, 1]
l[0] = int(l[0])

In [None]:
# Display the list after the conversion in the previous cell.
l