# Importing the necessary Libraries:

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
sns.set_theme(style="whitegrid", palette="muted")

In [4]:
data1=pd.read_csv("passw_dataset.csv",error_bad_lines=False)

b'Skipping line 2810: expected 2 fields, saw 5\nSkipping line 4641: expected 2 fields, saw 5\nSkipping line 7171: expected 2 fields, saw 5\nSkipping line 11220: expected 2 fields, saw 5\nSkipping line 13809: expected 2 fields, saw 5\nSkipping line 14132: expected 2 fields, saw 5\nSkipping line 14293: expected 2 fields, saw 5\nSkipping line 14865: expected 2 fields, saw 5\nSkipping line 17419: expected 2 fields, saw 5\nSkipping line 22801: expected 2 fields, saw 5\nSkipping line 25001: expected 2 fields, saw 5\nSkipping line 26603: expected 2 fields, saw 5\nSkipping line 26742: expected 2 fields, saw 5\nSkipping line 29702: expected 2 fields, saw 5\nSkipping line 32767: expected 2 fields, saw 5\nSkipping line 32878: expected 2 fields, saw 5\nSkipping line 35643: expected 2 fields, saw 5\nSkipping line 36550: expected 2 fields, saw 5\nSkipping line 38732: expected 2 fields, saw 5\nSkipping line 40567: expected 2 fields, saw 5\nSkipping line 40576: expected 2 fields, saw 5\nSkipping line 

# Gathering some information about our data:

In [5]:
#Lets have a look at the features
data1.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


As we can see, the data has two features, namely 'password' and 'strength'.

In [44]:
data1.shape

(669640, 2)

The dataset is huge, with 669640 entries, and two features as described above.

In [45]:
#Checking the unique values in strength
data1["strength"].unique()

array([1, 2, 0], dtype=int64)

Here '0' indicates a weak password, '1' indicates a decent password of medium strength and '2' indicates a strong password.

In [6]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669640 entries, 0 to 669639
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   password  669639 non-null  object
 1   strength  669640 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.2+ MB


In [49]:
#Now we check for null values to drop
data1.isnull().sum()

password    1
strength    0
dtype: int64

There is one null value found in 'password'.

In [50]:
##Lets check the record where our value is null
data1[data1["password"].isnull()]

Unnamed: 0,password,strength
367579,,0


Now we simply drop this NaN value with the 'dropna' command.

In [None]:
data1.dropna(inplace=True)

In [None]:
plt.figure(figsize=(5,5))
sns.swarmplot(data1.strength, x="Strength", y="Count")

The number of medium strength values is far higher than those of weak and strong ones.

# Converting data to a numpy array:

In [55]:
pass1=np.array(data1)
pass1

array([['kzde5577', 1],
       ['kino3434', 1],
       ['visi7k1yr', 1],
       ...,
       ['184520socram', 1],
       ['marken22a', 1],
       ['fxx4pw4g', 1]], dtype=object)

In [56]:
pass1[0]

array(['kzde5577', 1], dtype=object)

This is the password stored at the 0th index.

In [57]:
import random
random.shuffle(pass1)
pass1


array([['kzde5577', 1],
       ['kzde5577', 1],
       ['kzde5577', 1],
       ...,
       ['megmel9597', 1],
       ['amrutham852', 1],
       ['rukol725', 1]], dtype=object)

In [58]:
X=[passwords[0] for passwords in pass1]
y=[passwords[1] for passwords in pass1]

Here we have created independent and dependent features.

In [65]:
##Convert words into characters
def charsplitter(inputs):
    characters=[]
    for letter in inputs:
        characters.append(letter)
    return characters

In [19]:
make_chars("Akhil Nair")

['A', 'k', 'h', 'i', 'l', ' ', 'N', 'a', 'i', 'r']

As you can see, the function has split my name into individual characters.

In [66]:
vectorizer=TfidfVectorizer(tokenizer=charsplitter)

In [67]:
X_=vectorizer.fit_transform(X)

In [68]:
X_.shape

(669639, 123)

As we can see, our column number has gone up from 2 to 123.

In [23]:
vectorizer.get_feature_names()

['\x04',
 '\x05',
 '\x08',
 '\x0e',
 '\x10',
 '\x16',
 '\x17',
 '\x19',
 '\x1b',
 '\x1c',
 '\x1e',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 '+',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '\x7f',
 '\x81',
 '\x8d',
 '\xa0',
 '¡',
 '¢',
 '¨',
 '«',
 '°',
 '±',
 '²',
 '³',
 '´',
 'µ',
 '·',
 'º',
 '¼',
 '¾',
 '¿',
 '×',
 'ß',
 'à',
 'á',
 'â',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'ê',
 'í',
 'ï',
 'ð',
 'ñ',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 '÷',
 'ù',
 'ú',
 'û',
 'ü',
 'ý',
 'þ',
 'ÿ',
 '‚',
 '™']

In [69]:
first_=X_[0].T.todense()

In [72]:
vec1=pd.DataFrame(first_,index=vectorizer.get_feature_names(),columns=['tfidf'])

'TF' and 'IDF' stand for Term Frequency and Inverse Document Frequency. The product of these two is called 'TFIDF'. This helps us remove repetitive words such as "the", "and", "a" and so on.

In [73]:
vec1

Unnamed: 0,tfidf
,0.0
,0.0
,0.0
,0.0
,0.0
...,...
ÿ,0.0
ƒ,0.0
—,0.0
‚,0.0


In [35]:
vec.sort_values(by=['tfidf'],ascending=False)

Unnamed: 0,tfidf
7,0.592341
5,0.566258
z,0.335973
k,0.291184
d,0.286524
...,...
?,0.000000
>,0.000000
=,0.000000
<,0.000000


This is our sparse matrix dataframe in decreasing order, with tfidf values for each character.

In [74]:
x_train,x_test,y_train,y_test=train_test_split(X_,y,test_size=0.20,random_state=42)

In [75]:
x_train.shape,x_test.shape

((535711, 123), (133928, 123))

# Model Creation:

In [None]:
#We choose these 4 models and then decide which of the 4 works the best:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score 
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [None]:
classifier=[]
classifier.append(xgb.XGBClassifier(n_jobs=-1))
classifier.append(MultinomialNB())
classifier.append(LogisticRegression(multi_class='ovr',n_jobs=-1)) 

Here 'n_jobs' utilizes all CPU cores, and 'OVR' means One versus the Rest. This classifier list is an instance of these models.

In [None]:
classifier.append(LogisticRegression(multi_class='multinomial',solver='newton-cg',n_jobs=-1))

Here 'multinomial means that the loss minimized is the loss fit across the entire probability distribution.

In [None]:
result=[]
for model in classifier:
    a=model.fit(x_train,y_train)
    result.append(a.score(x_test,y_test))

In [None]:
result1=pd.DataFrame({'score':result,
                      'algorithms':['logistic_regr_ovr',
                                    'logistic_regr_mutinomial',
                                    'xgboost','naive bayes']})

The result is now stored in a dataframe. The accuracy for each of the models can be see below:

In [None]:
result1

We shall hence go for the XGBoost algorithm as it does the best job

In [1]:
xgb_classifier=xgb.XGBClassifier(n_jobs=-1)
xgb_classifier.fit(x_train,y_train)

NameError: name 'xgb' is not defined

The model is now trained on the training data.

In [None]:
pred1=xgb_classifier.predict(x_test)
from sklearn.metrics import confusion_matrix,classification_report
confusion_matrix(y_test,pred1)
print(classification_report(y_test,pred1))

# Serialization using dill:

In [None]:
import dill
model_file=open("xgb_classifier.pkl","wb")
dill.dump(xgb_classifier,model_file)
model_file.close()

We dump the XGB model in dill and close the file.

In [None]:
dill.dump(vectorizer, open("vectorizer.pkl", "wb"))

In [None]:
password="Za_123%@#%B1"

We now enter a password to test our model.

In [None]:
password=vectorizer.transform([password])

In [None]:
xgb_classifier.predict(password)

This tells us that the password chosen is strong.

P.S.: Dataset has been taken from Kaggle - https://www.kaggle.com/bhavikbb/password-strength-classifier-dataset
