<a href="https://colab.research.google.com/github/gmadhuri10/colab-python-tasks/blob/main/ICP_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Create pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC())
])

# 3. Define parameter grid
param_grid = {
    'pca__n_components': [2, 3],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# 4. GridSearchCV
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)

# 5. Results
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


Check for 3 fold, 5 fold and 7 fold cross validation

Replace classifier, SVC with RandomForestClassifier and LogisticRegression, Perceptron, knn .

Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.)

Also replace Gridsearch with randomnsearch function.

Relplace with with your own csv dataset using code below:

In [4]:
from google.colab import files
uploaded = files.upload()


Saving pd_speech_features.csv to pd_speech_features.csv


In [7]:
import pandas as pd

data = pd.read_csv("pd_speech_features.csv")
print(data.head())


   id  gender      PPE      DFA     RPDE  numPulses  numPeriodsPulses  \
0   0       1  0.85247  0.71826  0.57227        240               239   
1   0       1  0.76686  0.69481  0.53966        234               233   
2   0       1  0.85083  0.67604  0.58982        232               231   
3   1       0  0.41121  0.79672  0.59257        178               177   
4   1       0  0.32790  0.79782  0.53028        236               235   

   meanPeriodPulses  stdDevPeriodPulses  locPctJitter  ...  \
0          0.008064            0.000087       0.00218  ...   
1          0.008258            0.000073       0.00195  ...   
2          0.008340            0.000060       0.00176  ...   
3          0.010858            0.000183       0.00419  ...   
4          0.008162            0.002669       0.00535  ...   

   tqwt_kurtosisValue_dec_28  tqwt_kurtosisValue_dec_29  \
0                     1.5620                     2.6445   
1                     1.5589                     3.6107   
2          

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint, uniform

# 1. Load dataset
data = pd.read_csv("pd_speech_features.csv")

# 2. Set features and target
X = data.drop(columns=['id', 'gender'])  # drop non-feature columns
y = data['gender']  # CHANGE this if gender isn't your target variable

# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Classifier configurations
classifiers = {
    "RandomForest": (
        RandomForestClassifier(),
        {
            'classifier__n_estimators': randint(50, 200),
            'classifier__max_depth': randint(3, 20),
            'classifier__min_samples_split': randint(2, 10)
        }
    ),
    "LogisticRegression": (
        LogisticRegression(max_iter=1000),
        {
            'classifier__C': uniform(0.01, 10),
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs', 'saga']
        }
    ),
    "Perceptron": (
        Perceptron(max_iter=1000),
        {
            'classifier__penalty': ['l2', 'elasticnet', None],
            'classifier__alpha': uniform(0.0001, 0.01),
        }
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            'classifier__n_neighbors': randint(3, 15),
            'classifier__weights': ['uniform', 'distance'],
            'classifier__p': [1, 2]
        }
    )
}

# 5. Run with 3, 5, 7-fold cross-validation
cv_values = [3, 5, 7]

for name, (clf, param_dist) in classifiers.items():
    print(f"\n===== {name} =====")
    for cv in cv_values:
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=5)),  # you can increase or tune this
            ('classifier', clf)
        ])

        search = RandomizedSearchCV(pipe, param_distributions=param_dist,
                                    n_iter=10, cv=cv, n_jobs=-1, random_state=42)
        search.fit(X_train, y_train)

        print(f"CV = {cv}")
        print("Best Params:", search.best_params_)
        print("Best CV Score: {:.2f}".format(search.best_score_))
        print("Test Score: {:.2f}".format(search.score(X_test, y_test)))



===== RandomForest =====
CV = 3
Best Params: {'classifier__max_depth': 12, 'classifier__min_samples_split': 9, 'classifier__n_estimators': 64}
Best CV Score: 0.84
Test Score: 0.84
CV = 5
Best Params: {'classifier__max_depth': 9, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 124}
Best CV Score: 0.85
Test Score: 0.84
CV = 7
Best Params: {'classifier__max_depth': 9, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 124}
Best CV Score: 0.85
Test Score: 0.86

===== LogisticRegression =====
CV = 3
Best Params: {'classifier__C': np.float64(1.844347898661638), 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Best CV Score: 0.82
Test Score: 0.83
CV = 5
Best Params: {'classifier__C': np.float64(0.5908361216819946), 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Best CV Score: 0.82
Test Score: 0.84
CV = 7
Best Params: {'classifier__C': np.float64(3.7554011884736247), 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best CV Score: 0