In [None]:
from pathlib import Path
import csv

p = Path("../../data/irish.csv")

rows = []
data_section = False

with open(p, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()

        
        if not line:
            continue

        
        if line.startswith("%"):
            continue

        
        if line.lower().startswith("@data"):
            data_section = True
            continue

        
        if not data_section:
            continue

        
        rows.append(line)

print("Total data rows:", len(rows))
print("Example rows:", rows[:5])


Total data rows: 500
Example rows: ['male,113,Junior_cycle_incomplete-secondary_school,not_taken,28,secondary', 'male,101,Primary_terminal_leaver,not_taken,28,primary_terminal_leaver', 'male,110,Senior_cycle_terminal_leaver-secondary_school,taken,69,secondary', 'male,121,Junior_cycle_terminal_leaver-secondary_school,not_taken,57,secondary', 'male,82,Junior_cycle_terminal_leaver-vocational_school,not_taken,18,vocational']


In [21]:
parsed = [row.split(",") for row in rows]

print("Columns per row:", len(parsed[0]))
print("First parsed row:", parsed[0])


Columns per row: 6
First parsed row: ['male', '113', 'Junior_cycle_incomplete-secondary_school', 'not_taken', '28', 'secondary']


In [22]:
import pandas as pd

n_cols = len(parsed[0])
colnames = [f"col_{i}" for i in range(n_cols)]

df = pd.DataFrame(parsed, columns=colnames)

print(df.shape)
df.head()


(500, 6)


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5
0,male,113,Junior_cycle_incomplete-secondary_school,not_taken,28,secondary
1,male,101,Primary_terminal_leaver,not_taken,28,primary_terminal_leaver
2,male,110,Senior_cycle_terminal_leaver-secondary_school,taken,69,secondary
3,male,121,Junior_cycle_terminal_leaver-secondary_school,not_taken,57,secondary
4,male,82,Junior_cycle_terminal_leaver-vocational_school,not_taken,18,vocational


In [23]:
df.columns = [
    "Sex",          # col_0
    "DVRT",         # col_1
    "Education",    # col_2
    "Course",       # col_3
    "Score",        # col_4
    "Outcome"       # if there is a col_5
]
print(df.head())


    Sex DVRT                                       Education     Course Score  \
0  male  113        Junior_cycle_incomplete-secondary_school  not_taken    28   
1  male  101                         Primary_terminal_leaver  not_taken    28   
2  male  110   Senior_cycle_terminal_leaver-secondary_school      taken    69   
3  male  121   Junior_cycle_terminal_leaver-secondary_school  not_taken    57   
4  male   82  Junior_cycle_terminal_leaver-vocational_school  not_taken    18   

                   Outcome  
0                secondary  
1  primary_terminal_leaver  
2                secondary  
3                secondary  
4               vocational  


In [24]:
target = "Course"
X = df.drop(columns=[target])
y = df[target]

print(X.shape, y.shape)
print(y.value_counts())


(500, 5) (500,)
Course
not_taken    278
taken        222
Name: count, dtype: int64


In [25]:
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include=["int64","float64"]).columns

cat_cols, num_cols


(Index(['Sex', 'DVRT', 'Education', 'Score', 'Outcome'], dtype='object'),
 Index([], dtype='object'))

In [26]:
# convert numeric columns
df["DVRT"] = pd.to_numeric(df["DVRT"], errors="coerce")
df["Score"] = pd.to_numeric(df["Score"], errors="coerce")

# if Outcome exists and is numeric:
if "Outcome" in df.columns:
    df["Outcome"] = pd.to_numeric(df["Outcome"], errors="coerce")


In [27]:
df.dtypes


Sex           object
DVRT           int64
Education     object
Course        object
Score        float64
Outcome      float64
dtype: object

In [28]:
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include=["int64","float64"]).columns

cat_cols, num_cols


(Index(['Sex', 'DVRT', 'Education', 'Score', 'Outcome'], dtype='object'),
 Index([], dtype='object'))

In [None]:

X = df.drop("Course", axis=1)   
y = df["Course"]                


In [30]:
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include=["int64","float64"]).columns

cat_cols, num_cols


(Index(['Sex', 'Education'], dtype='object'),
 Index(['DVRT', 'Score', 'Outcome'], dtype='object'))

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])


categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])


preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])


In [32]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

logreg = Pipeline([
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=2000))
])

logreg.fit(X_train, y_train)
log_preds = logreg.predict(X_test)

print("LogReg Accuracy:", accuracy_score(y_test, log_preds))
print(classification_report(y_test, log_preds))


LogReg Accuracy: 1.0
              precision    recall  f1-score   support

   not_taken       1.00      1.00      1.00        56
       taken       1.00      1.00      1.00        44

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100





In [34]:
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))




Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

   not_taken       1.00      1.00      1.00        56
       taken       1.00      1.00      1.00        44

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100





In [35]:
from sklearn.neighbors import KNeighborsClassifier

for k in [3, 5, 11]:
    knn = Pipeline([
        ("preprocess", preprocessor),
        ("model", KNeighborsClassifier(n_neighbors=k))
    ])
    
    knn.fit(X_train, y_train)
    preds = knn.predict(X_test)
    
    print(f"\n=== KNN (k={k}) ===")
    print("Accuracy:", accuracy_score(y_test, preds))
    print(classification_report(y_test, preds))



=== KNN (k=3) ===
Accuracy: 0.97
              precision    recall  f1-score   support

   not_taken       1.00      0.95      0.97        56
       taken       0.94      1.00      0.97        44

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100


=== KNN (k=5) ===
Accuracy: 0.95
              precision    recall  f1-score   support

   not_taken       1.00      0.91      0.95        56
       taken       0.90      1.00      0.95        44

    accuracy                           0.95       100
   macro avg       0.95      0.96      0.95       100
weighted avg       0.96      0.95      0.95       100


=== KNN (k=11) ===
Accuracy: 0.93
              precision    recall  f1-score   support

   not_taken       0.98      0.89      0.93        56
       taken       0.88      0.98      0.92        44

    accuracy                           0.93       100
   macro avg       0.93  

