In [36]:
# Basic data handling and numerical processing
import pandas as pd                 # For data manipulation and analysis
import time                         # For timing code execution

# Scikit-learn - Data preprocessing and feature selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # For feature scaling
from sklearn.feature_selection import SelectKBest, f_classif    # For feature selection

# Scikit-learn - Model selection and evaluation
from sklearn.model_selection import train_test_split            # For splitting data into training and testing sets
from sklearn.metrics import confusion_matrix, accuracy_score    # For model evaluation

# Scikit-learn - Machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier  # Ensemble methods
from sklearn.svm import SVC                                          # Support Vector Machine classifier

# Scikit-learn - Pipeline utilities
from sklearn.pipeline import Pipeline          # For creating machine learning pipelines
from sklearn.compose import ColumnTransformer  # For applying transformers to columns of Pandas dataframes

# Scikit-learn - Dataset generation
from sklearn.datasets import make_classification  # For generating a random n-class classification problem


In [27]:
# Make a fake an complex dataset
X, y = make_classification(n_samples=100000,# Large number of samples
                           n_classes=5,
                           n_features=100,    # Large number of features
                           n_informative=50,  # Number of informative features
                           n_redundant=10,    # Number of redundant features
                           n_clusters_per_class=3,  # Complexity in data
                           random_state=42)

In [37]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.249074,-1.662979,3.620188,-2.026560,1.444258,-1.604939,-7.536499,-3.347722,-0.671021,-0.165953,...,2.274136,-7.732157,19.526272,-3.670708,0.689832,8.363257,2.605347,46.894223,-5.705433,0.007832
1,-1.487609,1.086423,-7.084398,1.701420,-3.667561,1.396777,-1.884257,6.509376,-0.008781,1.748435,...,-1.294777,2.874897,-20.886530,-2.582731,2.119685,-4.562733,2.455793,-20.112751,1.963437,0.727768
2,1.748358,2.259034,2.756386,-0.550596,6.249146,2.389290,-3.277544,1.760280,0.169276,0.365308,...,-0.243318,7.236897,0.866962,-3.191125,1.413882,-1.946430,1.856227,0.799218,3.892423,1.440825
3,0.619607,0.492218,-5.005010,-3.630852,0.573500,-2.928691,-2.063202,1.430140,-0.013358,0.353652,...,0.730538,-4.858785,15.227895,7.877092,0.515281,4.949889,2.680786,14.099886,7.892850,0.220872
4,0.290803,-3.608448,5.608295,-0.046357,4.531516,-0.187625,1.207627,0.766376,-0.864373,-1.172826,...,-0.709476,-0.595337,7.506311,5.039526,0.462610,-4.940043,-2.834191,26.225681,0.928784,1.398901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-0.010659,1.777973,-0.312340,1.357472,4.885210,-0.498488,1.305339,-2.630185,0.181239,0.708548,...,0.418166,1.670129,0.053675,2.274703,-0.910542,-6.214790,-2.025914,-6.356494,3.496883,-0.631609
99996,-1.378788,3.989405,-2.016169,0.713512,2.141328,0.051475,1.988437,2.865841,0.578019,-0.334300,...,0.095726,-2.281722,-24.816425,-4.251613,1.120953,-2.698653,-2.879288,-14.415476,-0.074965,-0.597848
99997,-0.013823,0.224935,4.193758,0.330235,8.345118,0.370281,2.696467,6.244126,-0.283730,0.084403,...,0.184015,2.677937,-10.257050,-3.806622,-0.555677,-3.534904,-5.756449,-3.938316,-6.555128,1.286027
99998,-0.155785,2.437995,7.001220,0.670684,2.294471,-1.061331,-1.080551,8.998209,0.645305,1.179597,...,2.487604,1.204669,-34.725457,-3.118910,-0.503067,-5.817525,2.007334,17.565355,0.602449,0.069982


In [38]:
# Split your Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
X_train.shape

(80000, 100)

In [40]:
y_train

array([3, 0, 0, ..., 4, 2, 3])

# Chat GPT

In [None]:
# Create a pipeline with a fast solver and limited iterations
logistic_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(solver='saga', max_iter=100, n_jobs=-1))
])

# Fit the pipeline on your training data
logistic_pipeline.fit(X_train, y_train)

# Make predictions
predictions = logistic_pipeline.predict(X_test)

In [None]:
# yen Usage Limits

![image.png](attachment:51a2d1ed-0f2d-46c6-879c-4112847aa13f.png)

# Why Cores Matter

## 10 Cores

In [41]:
# Create a pipeline
time_start=time.time()

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('classifier', RandomForestClassifier(random_state=42,n_jobs=10))  # Classifier
])
# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

time_end=time.time()
total_time=(time_end-time_start)
print(f'Total Time: {round(total_time,2)} seconds',)
pipeline

Total Time: 12.71 seconds


## 40 Cores

In [42]:
# Create a pipeline
time_start=time.time()

pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('classifier', RandomForestClassifier(random_state=42,n_jobs=40))  # Classifier
])
# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

time_end=time.time()
total_time=(time_end-time_start)
print(f'Total Time: {round(total_time,2)} seconds')
pipeline

Total Time: 4.08 seconds


In [None]:
# More Advanced

In [50]:
time_start=time.time()
column_transformer = ColumnTransformer(n_jobs=48,
    transformers=[
        ('standard_scaler', StandardScaler(), list(range(0, 50))),  # Apply StandardScaler to the first two features
        ('min_max_scaler', MinMaxScaler(), list(range(51, 100)))     # Apply MinMaxScaler to the last two features
    ])

feature_selection = SelectKBest(score_func=f_classif, k=50)
pipeline_complex = Pipeline([
    ('transformer', column_transformer),
    ('feature_selection', feature_selection),
    #('classifier', RandomForestClassifier(random_state=42,n_jobs=48)),
    ('logistic', LogisticRegression(solver='sag',max_iter=500, n_jobs=48))
])

pipeline_complex.fit(X_train,y_train)

time_end=time.time()
total_time=(time_end-time_start)
print(f'Total Time: {round(total_time,2)} seconds')
pipeline_complex

Total Time: 19.6 seconds


In [51]:
pipeline_complex['logistic']

# Predictions

In [46]:
predictions_simple = pipeline.predict(X_test)

In [52]:
predictions_complex= pipeline_complex.predict(X_test)

# Scoring

In [48]:
accuracy_score(y_test,predictions_simple)

0.74825

In [53]:
accuracy_score(y_test,predictions_complex)

0.47095