# Introduction to Machine Learning

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Matplotlib settings
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

In [3]:
# Read the cleaned transactions table and preview its first five rows.
df = pd.read_csv("transactions_clean.csv")
df.head()

Unnamed: 0,customer_id,name,region,quarter,number_of_purchases,total_transaction_amount
0,CUST00001,Alice Johnson,North,Q1-2025,11,10500
1,CUST00001,Alice Johnson,North,Q2-2025,13,12000
2,CUST00002,Bob Smith,South,Q1-2025,7,6500
3,CUST00002,Bob Smith,South,Q2-2025,9,8000
4,CUST00003,Carol White,North,Q1-2025,13,11500


In [4]:
# Encode categorical features and assemble the modelling table.
#
# Missing region/quarter values are replaced by an explicit 'Unknown' label
# before encoding, so they receive their own integer code instead of NaN.
# NOTE(review): LabelEncoder maps labels to integer codes, which a linear
# model treats as ordered numbers. Consider one-hot encoding if these columns
# hold more than two categories — confirm against the data.

le_region = LabelEncoder()
le_quarter = LabelEncoder()
df['region_encoded'] = le_region.fit_transform(df['region'].fillna('Unknown'))
df['quarter_encoded'] = le_quarter.fit_transform(df['quarter'].fillna('Unknown'))

target = 'total_transaction_amount'
features = ['region_encoded', 'quarter_encoded', 'number_of_purchases']

# Keep only the model columns and drop rows with a missing value in any of them.
data = df[features + [target]].dropna()

X = data[features]
y = data[target]

print(f"Dataset loaded. Samples: {len(data)}")

Dateset loaded. Samples: 600


## 1. Feature Scaling
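We use `StandardScaler` to standardize each feature to zero mean and unit variance. The scaler is fitted on the training split only and then applied to the test split.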

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scalar = StandardScaler()
scalar.fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

# Wrap the scaled training array in a DataFrame so the preview shows column names.
print("Scaled Data Example:")
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=features)
X_train_scaled_df.head(3)

Scaled Data Example:


Unnamed: 0,region_encoded,quarter_encoded,number_of_purchases
0,-1.008368,0.98347,1.36781
1,-1.008368,0.98347,0.880575
2,0.991701,0.98347,-1.311981


## 2. Baseline Model: Linear Regression

In [6]:
# Baseline: fit a linear regression on the standardized training features
# and score it on the held-out test split.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

# NOTE(review): if this reports R^2 = 1.0 with near-zero error, the target is
# probably an exact linear function of the features (synthetic data or
# leakage) — verify before treating it as a meaningful baseline.
print("----------Baseline Linear Regression Results----------")
print(f"\nMean Squared Error: {mse_lr:.4f}")
print(f"\nR^2 Score: {r2_lr:.4f}")
# All three metrics use the same 4-decimal format so they can be compared at a glance.
print(f"\nMean Absolute Error: {mae_lr:.4f}")

----------Baseline Linear Regression Results----------

Mean Squared Error: 0.0000

R^2 Score: 1.0000

Mean Absolute Error: 8.715990891990563e-13
