In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

In [3]:
from google.colab import files
uploaded = files.upload()

Saving passwordDataset.csv to passwordDataset.csv


In [4]:
# read in the dataset

pwdf = pd.read_csv('passwordDataset.csv')

In [5]:
# print 1st twenty examples

pwdf.head(20)

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
5,AVYq1lDE4MgAZfNt,2
6,u6c8vhow,1
7,v1118714,1
8,universe2908,1
9,as326159,1


## exploratory data analysis

the first twenty examples in this dataset show passwords with a value of 1 in the `strength` column.

what values are possible?

In [27]:
# print first 50 values of the 'strength' column

strength_vals = pwdf['strength'].tolist()

print(strength_vals[:51])

[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 2, 1, 1, 0]


at first glance, it looks like there are 3 potential categories in the dataset: `[0,1,2]`

let's grab the *unique* values from the column to be sure:

In [8]:
uniq_strength_vals = pwdf.strength.unique()

print(uniq_strength_vals)

[1 2 0]


we do indeed have three classes of password strength here: 

`[0,1,2]`

indicating a *multiclass classification* task.

### random shuffle

i'll be shuffling the dataset randomly to (hopefully) make sure the examples are as evenly distributed as possible.

i prefer to use the `sklearn` shuffle utility, as it allows for taking advantage of sklearn's pipelines, which is useful for production ML & reproducibility.

In [9]:
shuffled_pwdf = shuffle(pwdf, random_state=29)

In [10]:
print(shuffled_pwdf.head(20))

                password  strength
173361       sadullah500         1
101200         ifuwaf809         1
440944  supPYtzM4NAiHoaq         2
183963         agodyl344         1
243213          nwgosqu8         1
622660            jatb9c         0
286099  cG7BNpDgzMQobTPG         2
555622          522002kc         1
420532  aCWDFgTQ2OQYxeb4         2
159954          uutu0y97         1
538696        ibyvifo493         1
40405          dinan2010         1
29391          aux0wh911         1
248196         sublime23         1
306188         qmpzalla1         1
533250  mhfG9STczOAbPHxb         2
121021      jiachenjun03         1
505453            erkor6         0
442935        yxyzelu992         1
479032  H8mvktjkxMABJuY3         2


so now the dataset has been shuffled, and we can verify that there are examples from all three classes in the first twenty rows.

### train test split

now to split the dataset into training & testing data.

i like to use the `train_test_split` module from the `sklearn` library, again for pipeline & reproducibility reasons:

In [11]:
X_train, X_test, y_train, y_test = train_test_split(shuffled_pwdf['password'], shuffled_pwdf['strength'], test_size=0.3, random_state=29)

create a function that splits each password string into individual characters:

In [12]:
def get_char_tokens(input_str):
    '''
    takes input string
    splits into individual characters
    returns characters
    '''
    ind_chars = [x for x in input_str]
    
    return ind_chars

### TF-IDF processing & model training pipeline

because passwords are fundamentally text data, we will need to split them into tokens & vectorize them.

TF-IDF is a useful vectorization for data of this type, so i'll be implementing it via the `sklearn.feature_extraction` module.

i'll create a `Pipeline` to perform remaining preprocessing & model training, including splitting passwords into individual characters & performing TF-IDF vectorization

**pipeline steps:**

* split passwords to character tokens using the `get_char_tokens` function i created above

* vectorize tokens using `TfidfVectorizer` from `sklearn`

* train the classifier

In [13]:
# create the pipeline

pwd_clf = Pipeline([("vect", TfidfVectorizer(tokenizer=get_char_tokens)), ("clf", XGBClassifier()),]
)

In [15]:
# train the classifier
# score performance

pwd_clf.fit(X_train, y_train)
pwd_clf.score(X_test, y_test)

0.9125948270712622

In [23]:
# test on a high entropy password & a weak password

weakest = 'qwerty'
weak = 'passw0rd'
hi_entropy = 'sD4cnj83kfkIA382Kd93DiJdf72'

In [24]:
# get predictions

pwd_clf.predict([weakest,weak,hi_entropy])

array([0, 1, 2])