<a href="https://colab.research.google.com/github/cisimon7/Machine-Learning-with-plotly/blob/main/regression/ML_Regression_with_Plotly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>ML Regression with Plotly</h>

In [57]:
import pandas as pd
import plotly.express as px

In [35]:
df = px.data.tips()
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


#### <b>Basic Linear Regression</b>

##### <i>Linear fit trendlines with Plotly Express<i/>

In [7]:
"""
A simple trendline is added using the parameter trendline. 
ols represents 'Ordinary Least Square'

Other available methods include: 
"""
fig = px.scatter(
    df, x='total_bill', y='tip', opacity=0.65, 
    trendline='ols', trendline_color_override='darkblue'
)
fig.show()


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



##### <i>Fitting multiple lines and retrieving the model parameters<i/>

In [23]:
"""
facet_col : creates different graphs for each possible values in the column
color : separets the data in a graph into two and analysis performed on each value type
"""
fig = px.scatter(
    df, x='total_bill', y='tip', facet_col='smoker', 
    color='sex', trendline='ols'
)
fig.show()

# results = px.get_trendline_results(fig)
# print(results)

# results.query("sex=='Male' and smoker=='Yes'")\
#        .px_fit_results.iloc[0]\
#        .summary()

##### <i>Non-Linear Trendlines<i/>

In [15]:
df = px.data.gapminder().query("year==2007")
fig=px.scatter(df, x="gdpPercap", y="lifeExp", 
               color="continent", trendline="lowess")
fig.show()

In [1]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

In [20]:
df = px.data.tips()
X = df.total_bill.values.reshape(-1,1)

model = LinearRegression()
model.fit(X, df.tip)

x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1,1))

fig = px.scatter(df, x="total_bill", y="tip", opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name="Regression Fit"))
fig.show()

In [22]:
df = px.data.tips()
X = df.total_bill[:, None]
X_train, X_test, y_train, y_test = train_test_split(X, df.tip, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)

x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

"""A line, in a way, is no different from a scattered plot"""
fig = go.Figure([
    go.Scatter(x=X_train.squeeze(), y=y_train, name='train', mode='markers'),
    go.Scatter(x=X_test.squeeze(), y=y_test, name='test', mode='markers'),
    go.Scatter(x=x_range, y=y_range, name='prediction')
])
fig.show()


Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.



### <b>Polynomial Regression<b/>

In [25]:
def format_coefs(coefs):
  equation_list = [f"{coef}x^{i}" for i, coef in enumerate(coefs)]
  equation = "$" + " + ".join(equation_list) + "$"

  replace_map = {"x^0":"", "x^1":"x", "+-":"-"}
  for old,new in replace_map.items():
    equation = equation.replace(old, new)

  return equation

In [34]:
df = px.data.tips()
X = df.total_bill.values.reshape(-1,1)
x_range = np.linspace(X.min(), X.max(), 100).reshape(-1,1)

fig = px.scatter(df, x='total_bill', y='tip', opacity=0.65)
for degree in [1,2,3,4]:
  poly = PolynomialFeatures(degree)
  poly.fit(X)
  X_poly = poly.transform(X)
  x_range_poly = poly.transform(x_range)

  model = LinearRegression(fit_intercept=False)
  model.fit(X_poly, df.tip)
  y_poly = model.predict(x_range_poly)

  equation = format_coefs(model.coef_.round(2))
  fig.add_traces(go.Scatter(x=x_range.squeeze(), y=y_poly, name=equation))

fig.show()

##### <i>Predicted Error Plots<i/>

In [35]:
df = px.data.iris()
X = df[['sepal_width', 'sepal_length']]
y = df['petal_width']

In [37]:
# Condition the model on sepal width and length, predict the petal width
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

In [39]:
fig = px.scatter(x=y, y=y_pred, labels={'x':'ground truth', 'y':'prediction'})
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=y.min(), y0=y.min(),
    x1=y.max(), y1=y.max()
)
fig.show()

In [97]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [124]:
from plotly.subplots import make_subplots

In [98]:
def true_fun(X):
    return np.cos(1.5 * np.pi * X)

In [99]:
x_max = 1

np.random.seed(0)

n_samples = 30
degrees = [1,4,15]

X = np.sort(np.random.rand(n_samples)) * x_max
y = true_fun(X) + np.random.randn(n_samples) * 0.1

In [139]:
for degree in degrees:
  polynomial_features = PolynomialFeatures(degree, include_bias=False)
  linear_regression = LinearRegression()
  pipeline = Pipeline([
      ('polynomial_features', polynomial_features),
      ('linear_regression',linear_regression)
  ])
  pipeline.fit(X.reshape(-1,1), y)

  X_test = np.linspace(0, x_max, 100)  
  fig = go.Figure()
  fig.add_scatter(x=X, y=true_fun(X), name="True function")
  fig.add_trace(go.Scatter(x=X_test, y=pipeline.predict(X_test.reshape(-1,1)), name="Model"))
  fig.add_scatter(x=X, y=y, name="Samples", mode='markers')
  fig.update_layout(yaxis=dict(range=[-2, 2]), width=700)
  fig.show()

### <b>Logistic Regression<b/>

In [73]:
def get_custom_fig(title="", width=1000):
  fig = go.Figure()
  fig.update_layout(title=title,width=width)
  fig.update_xaxes(showgrid=True) # gridcolor='rgba(1,1,1,0.3)'
  fig.update_yaxes(showgrid=True) # gridcolor='rgba(1,1,1,0.3)'

  return fig

##### <i>Sigmoid Function<i/>

In [30]:
x = np.arange(-10, 10, 0.01)

fig = get_custom_fig(title="Sigmoid Function") 

def plot(b0, b1):
    p = np.exp(b0 + b1 * x) / (1 + np.exp(b0 + b1 * x))
    label = "b0 = {}, b1 = {}".format(b0, b1)
    fig.add_traces(go.Scatter(x=x, y=p, name=label))

plot(0,1)
plot(0,2)
plot(0,3)
plot(5,1)

fig.show()

##### <i>Logistic Regression Lost Function<i/>

In [31]:
x = np.arange(0.001, 1, 0.001)
y1 = -np.log(x)
y0 = -np.log(1-x)

fig = get_custom_fig(title="Logistic Regression Loss Function", width=700)
fig.add_traces([
    go.Scatter(x=x,y=y1,name="if y = 1"),
    go.Scatter(x=x,y=y0,name="if y = 0")
])
fig.show()

In [36]:
df = px.data.tips()
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [41]:
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [60]:
encoder = OneHotEncoder(sparse=False, drop='first')
f_names = ['sex']
encoder.fit(df[f_names])
new_feats = encoder.transform(df[f_names])

# create dataframe from encoded features with named columns
new_cols = pd.DataFrame(new_feats, dtype=int, columns=encoder.get_feature_names(f_names))
new_df = pd.concat([df, new_cols], axis=1)    
new_df.drop(f_names, axis=1, inplace=True)
new_df

Unnamed: 0,total_bill,tip,smoker,day,time,size,sex_Male
0,16.99,1.01,No,Sun,Dinner,2,0
1,10.34,1.66,No,Sun,Dinner,3,1
2,21.01,3.50,No,Sun,Dinner,3,1
3,23.68,3.31,No,Sun,Dinner,2,1
4,24.59,3.61,No,Sun,Dinner,4,0
...,...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3,1
240,27.18,2.00,Yes,Sat,Dinner,2,0
241,22.67,2.00,Yes,Sat,Dinner,2,1
242,17.82,1.75,No,Sat,Dinner,2,1


In [63]:
# split data
x_train, x_test, y_train, y_test = train_test_split(new_df.loc[:, :'tip'], new_df['sex_Male'],
                                                    test_size=0.2, stratify=new_df['sex_Male'])

In [64]:
# feature scaling
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

In [67]:
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [69]:
y_test_pred = model.predict(x_test)
print('Testing accuracy = {}'.format(metrics.accuracy_score(y_test, y_test_pred)))
print('Testing precision = {}'.format(metrics.precision_score(y_test, y_test_pred)))
print('Testing recall = {}'.format(metrics.recall_score(y_test, y_test_pred)))

Testing accuracy = 0.6530612244897959
Testing precision = 0.6530612244897959
Testing recall = 1.0


In [70]:
model.coef_

array([[ 0.81956078, -0.08999856]])

In [93]:
thresholds = np.linspace(0,1,20)
# calculate metrics for each threshold above and plot the result as below.

pred_proba = model.predict_proba(x_test)
results = [[],[],[]]
for i in thresholds:
    y_test_pred_thr = np.where(pred_proba[:, 1] > i, 1, 0)
    results[0].append(metrics.accuracy_score(y_test, y_test_pred_thr))
    results[1].append(metrics.precision_score(y_test, y_test_pred_thr))
    results[2].append(metrics.recall_score(y_test, y_test_pred_thr))

fig = go.Figure()
fig.add_traces([
    go.Scatter(x=thresholds, y=results[0], name="Accuracy", marker=dict(size=0)),
    go.Scatter(x=thresholds, y=results[1], name="Precision", marker=dict(size=0)),
    go.Scatter(x=thresholds, y=results[2], name="Recall", marker=dict(size=0))
])
fig.update_layout(title="Threshold Comparison") # plot_bgcolor='rgba(0,0,0,0)
fig.update_xaxes(title="Thresholds")
fig.update_yaxes(title="Metrics")
fig.show()


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [96]:
import plotly.figure_factory as ff

z = [[0.1, 0.3, 0.5, 0.2],
     [1.0, 0.8, 0.6, 0.1],
     [0.1, 0.3, 0.6, 0.9],
     [0.6, 0.4, 0.2, 0.2]]

x = ['healthy', 'multiple diseases', 'rust', 'scab']
y =  ['healthy', 'multiple diseases', 'rust', 'scab']

# change each element of z to type string for annotations
z_text = [[str(y) for y in x] for x in z]

# set up figure 
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='thermal')

# add title
fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                  #xaxis = dict(title='x'),
                  #yaxis = dict(title='x')
)
fig.show()