# **ML Regression: 1st Project**

# **Loading Data from an External Source (CSV file on GitHub)**

In [None]:
import pandas as pd

# Location of the sample sales CSV and the subset of columns to read from it.
csv_url = 'https://raw.githubusercontent.com/clebervisconti/datasets/main/sales_data_sample.csv'
wanted_columns = ['QUANTITYORDERED', 'PRICEEACH', 'SALES', 'MONTH_ID', 'YEAR_ID']

# usecols restricts the resulting frame to the listed columns only.
dtframe = pd.read_csv(csv_url, usecols=wanted_columns)
dtframe

# **Data Transformation**

## Determining Y and X

In [None]:
# Regression target: the QUANTITYORDERED column of the loaded frame.
y = dtframe.loc[:, 'QUANTITYORDERED']
y

In [None]:
# Feature matrix: every loaded column except the target.
# NOTE(review): SALES is presumably derived from QUANTITYORDERED and PRICEEACH —
# confirm; if so, keeping it as a feature leaks the target into X.
X = dtframe.drop(columns=['QUANTITYORDERED'])
X

## Data Transformation and Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=100,
)

In [None]:
# Display the feature rows assigned to the training partition.
X_train

In [None]:
# Display the feature rows assigned to the held-out test partition.
X_test

# **Building Model: Linear Regression**

### **Training the model**

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model fitted on the training partition only.
lr = LinearRegression()
# fit() is left as the last expression so the fitted estimator is the cell output.
lr.fit(X=X_train, y=y_train)

### **Applying the model to make a prediction**

In [None]:
# Predict with the fitted linear model on both partitions, in (train, test) order.
y_lr_train_pred, y_lr_test_pred = (
    lr.predict(features) for features in (X_train, X_test)
)

In [None]:
# Display the linear model's predictions for the training rows.
y_lr_train_pred

In [None]:
# Display the linear model's predictions for the test rows.
y_lr_test_pred

### **Evaluate model performance**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Training partition: mean squared error and R^2 of the linear model.
lr_train_mse, lr_train_r2 = (
    mean_squared_error(y_train, y_lr_train_pred),
    r2_score(y_train, y_lr_train_pred),
)

# Test partition: the same two metrics on the held-out rows.
lr_test_mse, lr_test_r2 = (
    mean_squared_error(y_test, y_lr_test_pred),
    r2_score(y_test, y_lr_test_pred),
)

In [None]:
# Print each linear-regression metric next to its label, one per line.
for label, value in (
    ('LR MSE (Train): ', lr_train_mse),
    ('LR R2 (Train): ', lr_train_r2),
    ('LR MSE (Test): ', lr_test_mse),
    ('LR R2 (Test): ', lr_test_r2),
):
    print(label, value)

In [None]:
# Collect the linear-regression metrics as a single labelled row.
# The frame is built from one row with explicit column names (rather than a
# flat list followed by a transpose) so that pandas infers a dtype per column:
# the metric columns stay numeric instead of all being coerced to object.
lr_results = pd.DataFrame(
    [['Linear regression', lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
)

In [None]:
# Display the one-row metrics summary for the linear regression model.
lr_results

## **Building Model: Random Forest**

### **Training the model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest limited to depth-2 trees; random_state fixes the seed.
rf = RandomForestRegressor(
    max_depth=2,
    random_state=100,
)
# fit() is left as the last expression so the fitted estimator is the cell output.
rf.fit(X=X_train, y=y_train)

### **Applying the model to make a prediction**

In [None]:
# Predict with the fitted random forest on both partitions, in (train, test) order.
y_rf_train_pred, y_rf_test_pred = (
    rf.predict(features) for features in (X_train, X_test)
)

### **Evaluate model performance**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Training partition: mean squared error and R^2 of the random forest.
rf_train_mse, rf_train_r2 = (
    mean_squared_error(y_train, y_rf_train_pred),
    r2_score(y_train, y_rf_train_pred),
)

# Test partition: the same two metrics on the held-out rows.
rf_test_mse, rf_test_r2 = (
    mean_squared_error(y_test, y_rf_test_pred),
    r2_score(y_test, y_rf_test_pred),
)

In [None]:
# Collect the random-forest metrics as a single labelled row.
# The frame is built from one row with explicit column names (rather than a
# flat list followed by a transpose) so that pandas infers a dtype per column:
# the metric columns stay numeric instead of all being coerced to object.
rf_results = pd.DataFrame(
    [['Random forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
)
rf_results

## **Models comparison**

In [None]:
# Stack the per-model summaries into one comparison table.
# ignore_index=True renumbers the rows 0..n-1; without it each one-row frame
# contributes its own label 0 and the combined index contains duplicates.
df_models = pd.concat([lr_results, rf_results], axis=0, ignore_index=True)

In [None]:
# reset_index returns a new frame, so assign it back; otherwise df_models keeps
# its old index and only the displayed copy is renumbered. Re-running this cell
# is harmless because resetting an already clean index is a no-op.
df_models = df_models.reset_index(drop=True)
df_models

# **Matplotlib: Data visualization of prediction results**

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Actual vs. predicted target values for the linear model on the training set.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(x=y_train, y=y_lr_train_pred, c="#7CAE00", alpha=0.3)

# Degree-1 least-squares fit through the (actual, predicted) points, drawn as a
# trend line on top of the scatter.
z = np.polyfit(y_train, y_lr_train_pred, 1)
p = np.poly1d(z)
ax.plot(y_train, p(y_train), '#F8766D')

# The target modelled in this notebook is QUANTITYORDERED, so the axes are
# labelled with that column name (the previous "LogS" labels did not match
# the data being plotted).
ax.set_xlabel('Actual QUANTITYORDERED')
ax.set_ylabel('Predicted QUANTITYORDERED')
ax.set_title('Linear regression: actual vs. predicted (training set)')

# Render the figure without echoing the repr of the last matplotlib call.
plt.show()