In [22]:
import pandas as pd
import numpy as np
import seaborn as sns

import warnings #to ignore warnings while run
warnings.filterwarnings('ignore')

In [23]:
data=pd.read_csv('data.csv')
data.head()

Unnamed: 0,password,strength
0,zxe870819,1
1,xw46454nr23l,1
2,soporte13,1
3,accounts6000webhost.com,2
4,c443balg,1


In [24]:
data['strength'].unique()

array([1, 2, 0], dtype=int64)

In [25]:
data.isna().sum()

password    0
strength    0
dtype: int64

In [26]:
data[data['password'].isnull()]

Unnamed: 0,password,strength


In [27]:
data.dropna(inplace=True)

In [28]:
data.isna().sum()

password    0
strength    0
dtype: int64

In [30]:
sns.countplot(data['strength'])

In [31]:
password_tuple=np.array(data)

In [32]:
import random
random.shuffle(password_tuple)

In [33]:
x=[labels[0] for labels in password_tuple]
y=[labels[1] for labels in password_tuple]

In [34]:
def word_divide_char(inputs):
    character=[]
    for i in inputs:
        character.append(i)
    return character

In [35]:
word_divide_char('j09000')

['j', '0', '9', '0', '0', '0']

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
vectorizer=TfidfVectorizer(tokenizer=word_divide_char)

In [40]:
X=vectorizer.fit_transform(x)

In [41]:
X.shape

(100000, 79)

In [43]:
vectorizer.get_feature_names_out()

array([' ', '!', '#', '$', '%', '&', '(', ')', '*', '+', '-', '.', '/',
       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ';', '<', '=',
       '>', '?', '@', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e',
       'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
       's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '~', '°', '±',
       '³', '´', 'µ', 'ß', 'ä', 'æ', 'ç', 'ñ', 'õ', 'ö', '÷', 'ú', 'ý',
       '›'], dtype=object)

In [44]:
first_document_vector=X[0]

In [45]:
first_document_vector.T.todense()

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.27305021],
        [0.20890319],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.31270211],
        [0.60811102],
        [0.28966443],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.23479928],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [47]:
df=pd.DataFrame(first_document_vector.T.todense(),index=vectorizer.get_feature_names_out(),columns=['TF-IDF'])
df.sort_values(by=['TF-IDF'],ascending=False)

Unnamed: 0,TF-IDF
8,0.608111
x,0.380606
z,0.361004
7,0.312702
9,0.289664
...,...
[,0.000000
@,0.000000
?,0.000000
>,0.000000


In [48]:
from sklearn.model_selection import train_test_split

In [49]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

In [50]:
X_train.shape

(80000, 79)

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
clf=LogisticRegression(random_state=0,multi_class='multinomial')

In [53]:
clf.fit(X_train,y_train)

In [54]:
dt=np.array(['!@23abc'])
pred=vectorizer.transform(dt)
clf.predict(pred)

array([2])

In [55]:
y_pred=clf.predict(X_test)
y_pred

array([1, 1, 1, ..., 2, 2, 1])

In [56]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [57]:
cm=confusion_matrix(y_test,y_pred)
print(cm)
print(accuracy_score(y_test,y_pred))

[[  755  1916     3]
 [  526 13909   392]
 [    9   839  1651]]
0.81575


In [58]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.59      0.28      0.38      2674
           1       0.83      0.94      0.88     14827
           2       0.81      0.66      0.73      2499

    accuracy                           0.82     20000
   macro avg       0.74      0.63      0.66     20000
weighted avg       0.80      0.82      0.80     20000



In [59]:
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: '+str(round(accuracy*100,2))+'%')

Accuracy: 81.58%


### Predictions

In [60]:
predict_data = np.array(['123@#ghhgj'])
prediction = vectorizer.transform(predict_data)
clf.predict(prediction)

array([1])

In [61]:
predict_data = np.array(['j'])
prediction = vectorizer.transform(predict_data)
clf.predict(prediction)

array([0])

In [62]:
predict_data = np.array(['123@j'])
prediction = vectorizer.transform(predict_data)
clf.predict(prediction)

array([1])