# Regression Techniques

Developed by: David

---

A set of tools for analyzing and predicting continuous data

---

# Import the Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Data Preprocessing

## Import the dataset

In [None]:
# Read the raw worker-productivity records from the CSV file.
dataset = pd.read_csv('data/worker_productivity.csv')

# Target vector: the last column of the table.
y = dataset.iloc[:, -1].values
# Feature matrix: every column between the first one (the Date column, which
# is ignored) and the target.
X = dataset.iloc[:, 1:-1].values

In [None]:
# Inspect the feature matrix as loaded, before any preprocessing.
print(X)

In [None]:
# Inspect the target vector as loaded.
print(y)

## Handle the missing data in the WIP column/feature

In [None]:
from sklearn.impute import SimpleImputer

# Learn the mean of the WIP column (index 6 of X) from its non-missing
# entries. The 6:7 slice keeps the column 2-D, as the imputer requires.
# NOTE(review): the mean is computed over every row, including rows that end
# up in the test split later on — consider imputing after the split.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X[:, 6:7])

# Overwrite the column in place, with NaN entries replaced by that mean.
X[:, 6:7] = imputer.transform(X[:, 6:7])

In [None]:
# Inspect the feature matrix after the WIP column has been imputed.
print(X)

## Encode the categorical variables quarter, department and day

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the three categorical columns at the front of X and pass
# every remaining column through unchanged. The encoded columns come first in
# the output, in the order the transformers are listed below.
ct = ColumnTransformer(
    transformers=[
        ('quarter_encoder', OneHotEncoder(), [0]),  # quarter (index 0)
        ('department_encoder', OneHotEncoder(), [1]),  # department (index 1)
        ('day_encoder', OneHotEncoder(), [2]),  # day (index 2)
    ],
    remainder='passthrough',  # Keep the rest of the columns as they are
    # OneHotEncoder produces sparse output by default, and ColumnTransformer
    # returns a sparse matrix whenever the stacked result's density falls
    # below sparse_threshold (0.3 by default). np.array() cannot turn a sparse
    # matrix into a 2-D array, so force a dense result explicitly.
    sparse_threshold=0,
)

# Fit the encoders on X and replace X with the encoded, dense matrix.
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect the feature matrix after one-hot encoding.
print(X)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Inspect the training features produced by the split.
print(X_train)

In [None]:
# Inspect the test features produced by the split.
print(X_test)

In [None]:
# Inspect the training targets produced by the split.
print(y_train)

In [None]:
# Inspect the test targets produced by the split.
print(y_test)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-column mean and standard deviation from the training rows only,
# then standardise both splits with those same statistics so no information
# from the test rows reaches the scaler.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspect the standardised training features.
print(X_train)

In [None]:
# Inspect the standardised test features.
print(X_test)

# Multiple Linear Regression:

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model fitted on the standardised training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out rows with the fitted linear model.
y_pred = regressor.predict(X_test)

# Print predictions (left column) beside the true targets (right column),
# rounded to two decimals for readability.
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(-1, 1), y_test.reshape(-1, 1)),
        axis=1,
    )
)

In [None]:
from sklearn.metrics import r2_score

# R-squared of the multiple linear regression on the test split.
mlr_score = r2_score(y_true=y_test, y_pred=y_pred)

# Polynomial Regression:

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into all polynomial terms up to degree 4, then
# fit an ordinary linear model on the expanded matrix.
# NOTE(review): the number of generated terms grows rapidly with the column
# count of X_train; a degree-4 expansion may far exceed the number of training
# rows and overfit — check poly_score and consider a lower degree.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)

regressor = LinearRegression()
regressor.fit(X=X_poly, y=y_train)

In [None]:
# The test rows must go through the same polynomial expansion that was fitted
# on the training rows before the model can score them.
y_pred = regressor.predict(poly_reg.transform(X_test))

# Print predictions (left column) beside the true targets (right column).
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(-1, 1), y_test.reshape(-1, 1)),
        axis=1,
    )
)

In [None]:
from sklearn.metrics import r2_score

# R-squared of the polynomial regression on the test split.
poly_score = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree Regression: a single regression tree fitted on the training
# split. The fixed seed makes the fitted tree reproducible.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out rows with the fitted decision tree.
y_pred = regressor.predict(X_test)

# Print predictions (left column) beside the true targets (right column).
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(-1, 1), y_test.reshape(-1, 1)),
        axis=1,
    )
)

In [None]:
from sklearn.metrics import r2_score

# R-squared of the decision tree regression on the test split.
dtr_score = r2_score(y_true=y_test, y_pred=y_pred)

# Random Forest Regression:

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 10 regression trees fitted on the training split; the fixed
# seed makes the fitted forest reproducible.
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out rows with the fitted random forest.
y_pred = regressor.predict(X_test)

# Print predictions (left column) beside the true targets (right column).
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(-1, 1), y_test.reshape(-1, 1)),
        axis=1,
    )
)

In [None]:
from sklearn.metrics import r2_score

# R-squared of the random forest regression on the test split.
rfr_score = r2_score(y_true=y_test, y_pred=y_pred)

# Support Vector Regression (SVR)

In [None]:
# Turn the target into a single-column 2-D array; the StandardScaler applied
# to the target further down only accepts 2-D input.
y = y.reshape(-1, 1)

In [None]:
from sklearn.model_selection import train_test_split

# Redo the 80/20 split from the encoded X and the column-shaped y, with the
# same test size and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Separate scalers for features and target: the target scaler is needed later
# to map predictions back to the original units.
sc_X, sc_y = StandardScaler(), StandardScaler()

# Only the training split is transformed here; X_test and y_test stay in
# their original scale.
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

In [None]:
from sklearn.svm import SVR

# Support vector regressor with an RBF kernel, fitted on the standardised
# training split.
regressor = SVR(kernel='rbf')

# y_train is an (n_samples, 1) column at this point because the target scaler
# needs 2-D input, but SVR.fit expects a 1-D target and emits a
# DataConversionWarning otherwise. ravel() flattens it without changing the
# values and is a no-op on an already 1-D array.
regressor.fit(X_train, y_train.ravel())

In [None]:
# Scale the test features with the scaler fitted on the training features,
# then predict in the standardised target space.
y_pred = regressor.predict(sc_X.transform(X_test))
# Map the predictions back to the original target units; the target scaler
# needs a 2-D column, hence the reshape.
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

# Print predictions (left column) beside the true targets (right column).
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(-1, 1), y_test.reshape(-1, 1)),
        axis=1,
    )
)

In [None]:
from sklearn.metrics import r2_score

# R-squared of the support vector regression on the test split, computed in
# the original target units.
svr_score = r2_score(y_true=y_test, y_pred=y_pred)

# Model Selection via the R-Squared Score

In [None]:
# Side-by-side report of the test-set R-squared for every model, in the order
# the models were trained.
for label, score in (
    ('Multiple Linear Regression : ', mlr_score),
    ('Polynomial Regression : ', poly_score),
    ('Decision Tree Regression : ', dtr_score),
    ('Random Forest Regression : ', rfr_score),
    ('Support Vector Regression : ', svr_score),
):
    print(label, score)