# Introduction to Machine Learning

In [3]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [4]:
# Matplotlib settings
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

In [6]:
# Read the transactions CSV into a DataFrame and preview its first five rows.
df = pd.read_csv('transactions.csv')
df.head()

Unnamed: 0,customer_id,name,region,quarter,number_of_purchases,total_transaction_amount
0,CUST00001,David Schwartz,Kelantan,Q1-2025,24.0,14762.51
1,CUST00001,David Schwartz,Kelantan,Q2-2025,21.0,5744.74
2,CUST00001,David Schwartz,Kelantan,Q3-2025,14.0,14654.86
3,CUST00001,,Kelantan,Q4-2025,23.0,991.57
4,CUST00002,Michelle Perez,Negeri Sembilan,Q1-2025,11.0,1558.18


In [7]:
# Encode categorical features
#
# Missing region/quarter values are replaced with the placeholder 'Unknown'
# before encoding, so they receive their own integer code instead of NaN.
# NOTE(review): LabelEncoder is documented for target labels and yields
# arbitrary integer codes; a linear model will read those codes as an ordered
# numeric scale. Consider one-hot encoding `region` if no ordering is intended.

le_region = LabelEncoder()
le_quarter = LabelEncoder()
df['region_encoded'] = le_region.fit_transform(df['region'].fillna('Unknown'))
df['quarter_encoded'] = le_quarter.fit_transform(df['quarter'].fillna('Unknown'))

# Column names used to build the model matrix and the regression target.
target = 'total_transaction_amount'
features = ['region_encoded', 'quarter_encoded', 'number_of_purchases']

# Keep only rows where every model column is present. The encoded columns
# cannot be NaN, so rows are dropped only for a missing purchase count or
# a missing transaction amount.
data = df[features + [target]].dropna()

X = data[features]
y = data[target]

print(f"Dataset loaded. Samples: {len(data)}")

Dateset loaded. Samples: 1088


## 1. Feature Scaling
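We use `StandardScaler` to standardize each feature to zero mean and unit variance. The scaler is fit on the training split only and then applied to the test split, so no information from the test data leaks into the scaling.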

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then reuse
# them unchanged for the test split.
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

# Wrap the scaled array in a DataFrame so the preview carries column names.
print("Scaled Data Example:")
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=features)
X_train_scaled_df.head(n=3)

Scaled Data Example:


Unnamed: 0,region_encoded,quarter_encoded,number_of_purchases
0,1.063195,-1.353472,-1.055985
1,-0.107171,0.447018,-0.530108
2,0.868134,1.347263,0.653115


## 2. Baseline Model: Linear Regression

In [10]:
# Fit an ordinary least-squares baseline on the scaled training features
# and evaluate it on the held-out test split.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Test-set error metrics for the baseline.
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

# Report every metric with the same 4-decimal formatting.
print("----------Baseline Linear Regression Results----------")
print(f"\nMean Squared Error: {mse_lr:.4f}")
print(f"\nR^2 Score: {r2_lr:.4f}")
print(f"\nMean Absolute Error: {mae_lr:.4f}")

----------Baseline Linear Regression Results----------

Mean Squared Error: 17062002.1386

R^2 Score: 0.0019

Mean Absolute Error: 3475.89070074514
