In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm


def load_and_prepare_data(file_path, sample_size, random_state=42):
    """Load the password dataset and return an encoded, sub-sampled train/test split.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains at least the columns "password" and "strength".
    sample_size : int
        Number of training rows to keep. If the training split holds fewer
        rows than this, every training row is kept instead of raising.
    random_state : int or None, default=42
        Seed for the training sub-sample so repeated runs pick the same rows.
        Pass None for a different sample on every call. The train/test split
        itself always uses the fixed seed 42.

    Returns
    -------
    X_train : pandas.Series
        Sub-sampled training passwords (as strings).
    X_test : pandas.Series
        Held-out passwords (20% of the cleaned data, not sub-sampled).
    y_train : numpy.ndarray
        Integer-encoded strength labels aligned with ``X_train``.
    y_test : numpy.ndarray
        Integer-encoded strength labels aligned with ``X_test``.
    label_encoder : LabelEncoder
        Fitted encoder; use ``inverse_transform`` to recover original labels.
    """
    # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines="skip"`
    # (pandas >= 1.3) is its replacement and likewise drops malformed rows.
    df = pd.read_csv(file_path, on_bad_lines="skip")
    df = df.dropna()

    # Force text: a password made only of digits may be parsed as a number,
    # which a text vectorizer cannot process.
    X = df["password"].astype(str)
    y = df["strength"]

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Sampling without replacement cannot draw more rows than exist, so cap
    # the request at the size of the training split.
    n_keep = min(sample_size, len(X_train))
    rng = np.random.default_rng(random_state)
    indices = rng.choice(len(X_train), size=n_keep, replace=False)

    # X_train is a Series (positional .iloc); y_train is a plain ndarray.
    X_train = X_train.iloc[indices]
    y_train = y_train[indices]

    return X_train, X_test, y_train, y_test, label_encoder


# Path of the labelled password CSV, relative to the notebook's working directory.
data_file = 'drive/MyDrive/ML_PROJECT_DATASET/data.csv'
# Number of training rows to sample from the training split.
desired_sample_size = 100_000

# Build the train/test split once; the later cells reuse these names.
(
    X_train,
    X_test,
    y_train,
    y_test,
    label_encoder,
) = load_and_prepare_data(file_path=data_file, sample_size=desired_sample_size)

In [None]:
# Vectorize the passwords as per-character counts.
# lowercase=False: CountVectorizer lowercases its input by default, which would
# merge "A" and "a" into one feature and hide upper/lower-case mixing from the
# model.
vectorizer = CountVectorizer(analyzer="char", lowercase=False)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# random_state pins the forest so the reported accuracy is reproducible;
# n_jobs=-1 builds the trees on all available cores.
# Remaining hyper-parameters keep their defaults:
#   n_estimators=100, max_depth=None, min_samples_split=2
model = RandomForestClassifier(random_state=42, n_jobs=-1)

# fit() offers no per-sample callback, so this bar cannot show real progress:
# it stays at 0% while training runs and jumps to 100% afterwards. It is kept
# as an elapsed-time display only.
with tqdm(total=X_train_vec.shape[0], desc="Training", unit="samples") as pbar:
    model.fit(X_train_vec, y_train)
    pbar.update(X_train_vec.shape[0])

y_pred = model.predict(X_test_vec)

# Original (non-encoded) labels for the test predictions.
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# y_test and y_pred are both integer-encoded, so they compare directly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Training: 100%|██████████| 100000/100000 [03:17<00:00, 505.09samples/s]


Accuracy: 0.8623066125082134


In [None]:
# Password to classify. Alternative samples to try (with their expected class):
#   "To#rkw1zxXzfY7$4^*89bHs7Xb5!#A!4Ve8xGb4jW9arQdU61k"  -> strong
#   "LIserEptInguISEd123"                                  -> medium
test_password = "hello"  # expected: weak

# Encode the single password with the vectorizer fitted during training;
# transform() takes an iterable of documents, hence the one-element list.
test_password_vec = vectorizer.transform([test_password])

# Predict the encoded class, then map it back to its original label.
predicted_class = model.predict(test_password_vec)
(predicted_strength,) = label_encoder.inverse_transform(predicted_class)

print(f"The strength of the test password '{test_password}' is classified as: {predicted_strength}")