## Lesson 6
# Scikit Learn

## For Scikit Learn Model Selection:

https://scikit-learn.org/stable/api/sklearn.model_selection.html

## For Scikit Learn Feature Extraction:

https://scikit-learn.org/stable/modules/feature_extraction.html

## For Scikit Learn Different Vector Algorithms:

https://scikit-learn.org/stable/supervised_learning.html

In [3]:
# Import libraries

import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import RandomOverSampler

In [4]:
# Load the dataset

data_path = "./assets/railway_maintenance.json"
df = pd.read_json(data_path)
df.head(4)

Unnamed: 0,id,Date,Description,Price,Duration of maintenance
0,1,2023-09-09,Signal system inspection and repair,3130,3
1,2,2015-02-10,Overhead electrical line maintenance,20183,5
2,3,2015-06-24,Rail fastening and welding,29828,4
3,4,2022-01-26,Emergency repair due to track failure,5178,2


In [5]:
# Price categorization

def categorized_price(price):
    if price < 10000:
        return "cheap"
    elif 10001<= price <= 30000:
        return "ok"
    else:
        return "expensive"
df["Category"]=df["Price"].apply(categorized_price)
df.head(15)

Unnamed: 0,id,Date,Description,Price,Duration of maintenance,Category
0,1,2023-09-09,Signal system inspection and repair,3130,3,cheap
1,2,2015-02-10,Overhead electrical line maintenance,20183,5,ok
2,3,2015-06-24,Rail fastening and welding,29828,4,ok
3,4,2022-01-26,Emergency repair due to track failure,5178,2,cheap
4,5,2015-04-27,Grade crossing renewal,1281,1,cheap
5,6,2015-06-14,Grade crossing renewal,7978,3,cheap
6,7,2019-09-19,Railcar maintenance and refurbishing,27346,4,ok
7,8,2017-10-27,Track realignment and resurfacing,15818,5,ok
8,9,2018-07-27,Switch and turnout servicing,78489,17,expensive
9,10,2023-11-03,Signal system inspection and repair,7405,3,cheap


In [6]:
# Spliting data
X_train, X_test, y_train, y_test = train_test_split(df["Description"], df["Category"], test_size=0.2, random_state=42)

In [7]:
# Balance the dataset
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train.values.reshape(-1,1), y_train)
X_train_resampled = X_train_resampled.ravel()

In [8]:
# Vectrorize and create a pipeline:

vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_resampled)
X_test_vectorized = vectorizer.transform(X_test)

model = SVC(kernel='linear', random_state=42)
model.fit(X_train_vectorized, y_train_resampled)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [11]:
# Prediction and Evaluation

y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Mean accuracy: {accuracy:.4f}")
print("===============================================")
print(f"F1 Score: {f1:.4f}")
print("===============================================")
print(classification_report(y_test, y_pred))

Mean accuracy: 0.2792
F1 Score: 0.2857
              precision    recall  f1-score   support

       cheap       0.47      0.19      0.27       603
   expensive       0.15      0.53      0.24       170
          ok       0.35      0.31      0.33       427

    accuracy                           0.28      1200
   macro avg       0.32      0.34      0.28      1200
weighted avg       0.38      0.28      0.29      1200



In [12]:
# Test the system
with open("./assets/model.pk1", "wb") as model_file:
    pickle.dump(model, model_file)
with open("./assets/vectorizer.pk1", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
    

In [13]:
# Check an example:

def predict_category(description):
    with open("./assets/vectorizer.pk1", 'rb') as vec_file:
        vectorizer = pickle.load(vec_file)
    with open("./assets/model.pk1", "rb") as model_file:
        model = pickle.load(model_file)
    description_vectorization = vectorizer.transform([description])
    return model.predict(description_vectorization)[0]

In [16]:
# Test the model

test_prediction = "Emergency repair"
print(f"Predicted category: {predict_category(test_prediction)}")

Predicted category: expensive


## Excercise

Generate a dataset use AI or download a real world dataset from any source, then train, predict and test