In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

class BasicPipeline:
    """Baseline classification workflow for a tabular DataFrame.

    The workflow separates the target column from the features, builds a
    preprocessing ``ColumnTransformer`` (median imputation + standard scaling
    for numeric columns, constant imputation + one-hot encoding for
    categorical columns), fits a ``RandomForestClassifier`` on a training
    split and reports accuracy on the held-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the feature columns and the target column.
    target : str
        Name of the column in ``data`` to predict.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Seed used for both the train/test split and the random forest, so a
        re-run on the same data reproduces the same score.
    """

    def __init__(self, data, target, test_size=0.2, random_state=42):
        self.data = data
        self.target = target
        self.test_size = test_size
        self.random_state = random_state

    def load_data(self):
        """Split ``self.data`` into a feature frame and a target series.

        Returns
        -------
        (X, y)
            ``X`` is ``self.data`` without the target column; ``y`` is the
            target column.

        Raises
        ------
        KeyError
            If the target column is not present in ``self.data``.
        """
        if self.target not in self.data.columns:
            raise KeyError(f"Target column {self.target!r} not found in data")
        X = self.data.drop(columns=[self.target])
        y = self.data[self.target]
        return X, y

    def select_feature_columns(self, X):
        """Return ``(numeric_features, categorical_features)`` column-name lists.

        Numeric columns are every numeric dtype (not only ``int64`` /
        ``float64``); timedelta columns are excluded explicitly because pandas
        treats them as a numeric subtype. Categorical columns are object,
        pandas ``category`` and pandas string columns.

        Columns of any other dtype (for example bool or datetime) appear in
        neither list.
        """
        numeric_features = X.select_dtypes(
            include=['number'], exclude=['timedelta']).columns.tolist()
        # 'string' is listed next to 'object' so text columns are picked up
        # whether pandas stores them as object or as a dedicated string dtype.
        categorical_features = X.select_dtypes(
            include=['object', 'category', 'string']).columns.tolist()
        return numeric_features, categorical_features

    def preprocess_data(self, X):
        """Build (but do not fit) the preprocessing ``ColumnTransformer`` for ``X``.

        Only the column names/dtypes of ``X`` are inspected here; imputation
        and scaling statistics are learned later, when the surrounding
        pipeline is fitted on the training split.

        NOTE: columns that ``select_feature_columns`` places in neither group
        are not passed to any transformer; with ``ColumnTransformer``'s
        default ``remainder`` they are dropped from the model input.
        """
        numeric_features, categorical_features = self.select_feature_columns(X)

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)])

        return preprocessor

    def split_data(self, X, y):
        """Split features and target into train and test parts.

        Returns ``X_train, X_test, y_train, y_test`` using the instance's
        ``test_size`` and ``random_state``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)
        return X_train, X_test, y_train, y_test

    def train_model(self, X_train, y_train, preprocessor):
        """Fit preprocessing + random forest on the training split.

        Returns the fitted ``Pipeline`` whose steps are ``'preprocessor'`` and
        ``'classifier'``.
        """
        # Seed the forest as well as the split; otherwise the reported
        # accuracy varies from run to run on identical data.
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', RandomForestClassifier(
                                  random_state=self.random_state))])
        clf.fit(X_train, y_train)
        return clf

    def evaluate_model(self, clf, X_test, y_test):
        """Return the accuracy of ``clf``'s predictions on the test split."""
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        return score

    def run(self):
        """Execute load -> preprocess -> split -> train -> evaluate.

        Returns the test-split accuracy as computed by ``evaluate_model``.
        """
        X, y = self.load_data()
        preprocessor = self.preprocess_data(X)
        X_train, X_test, y_train, y_test = self.split_data(X, y)
        clf = self.train_model(X_train, y_train, preprocessor)
        score = self.evaluate_model(clf, X_test, y_test)
        return score

In [17]:
# Expects the training table to be available as 'train.csv' in the working directory.
train_df = pd.read_csv("train.csv")

# Build the pipeline with 'defects' as the label column, then fit and score it.
pipeline = BasicPipeline(train_df, 'defects')
score = pipeline.run()

print(f"Model accuracy: {score}")


Model accuracy: 0.810691298580062
