In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 350)
import os
from glob import glob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.utils import shuffle
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns

import re
from tqdm import tqdm

import pickle as pkl
from utils import *

In [2]:
# Folder holding the story text files. File names are appended to it by plain
# string concatenation below, so the trailing slash is required.
DATASET_PATH = "./Story text files/"

# English stop-word list from NLTK, handed to the preprocessing helpers.
stpwords = stopwords.words("english")

# File names of the test stories, stored as a pickle.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("./models/test_file_names.pkl", "rb") as fp:
    test_file_names = pkl.load(fp)

# Full path of every test file.
test_file_path = [DATASET_PATH + name for name in test_file_names]

In [50]:
def predictV6(textCorpus,
              tfidf=None,
              pca=None,
              file_names="input_file",
              _stopWords=None,
              model=None,
              output_dict=None):
    """Return the V6 level label for a single text corpus.

    The corpus is run through ``preprocessV6input``; every column of the
    result except the last one is passed to ``model.predict`` and the
    predicted cluster id is translated to a level through ``output_dict``.

    Args:
        textCorpus: Text to score. It is wrapped in a one-element list before
            preprocessing.
        tfidf: Object forwarded to ``preprocessV6input`` as ``tfidf``.
            Defaults to the notebook global ``tf_idf_v6``.
        pca: Object forwarded to ``preprocessV6input`` as ``pca``.
            Defaults to the notebook global ``pca_v6``.
        file_names: Label for the input, forwarded as a one-element list.
        _stopWords: Stop-word list forwarded to ``preprocessV6input``.
            Defaults to the notebook global ``stpwords``.
        model: Object whose ``predict`` method returns cluster ids.
            Defaults to the notebook global ``kmeans_v6``.
        output_dict: Mapping from cluster id to level number.
            Defaults to the notebook global ``pred_v6_dict``.

    Returns:
        str: ``"Level-<n> of 6 levels"`` where ``<n>`` is the mapped level.
    """
    # Defaults are resolved at call time rather than at definition time, so
    # this cell can be executed before the cells that load the V6 artifacts.
    if tfidf is None:
        tfidf = tf_idf_v6
    if pca is None:
        pca = pca_v6
    if _stopWords is None:
        _stopWords = stpwords
    if model is None:
        model = kmeans_v6
    if output_dict is None:
        output_dict = pred_v6_dict

    # Forward the arguments actually received; the previous version ignored
    # them and always used the notebook globals.
    df = preprocessV6input(corpusList=[textCorpus],
                           tfidf=tfidf,
                           pca=pca,
                           file_names=[file_names],
                           _stopWords=_stopWords)

    # The last column is not a model feature (presumably the file name --
    # TODO confirm against preprocessV6input), so it is excluded.
    cluster_id = model.predict(df.iloc[:, :-1])[0]
    return "Level-" + str(output_dict[cluster_id]) + " of 6 levels"

def predictV7(textCorpus,
              tfidf=None,
              pca=None,
              scaler_pos=None,
              scaler_length=None,
              file_names="input_file",
              _stopWords=None,
              model=None,
              output_dict=None):
    """Return the V7 level label for a single text corpus.

    The corpus is run through ``preprocessV7input``; every column of the
    result except the last one is passed to ``model.predict`` and the
    predicted cluster id is translated to a level through ``output_dict``.

    Args:
        textCorpus: Text to score. It is wrapped in a one-element list before
            preprocessing.
        tfidf: Object forwarded to ``preprocessV7input`` as ``tfidf``.
            Defaults to the notebook global ``tf_idf_v7``.
        pca: Object forwarded to ``preprocessV7input`` as ``pca``.
            Defaults to the notebook global ``pca_v7``.
        scaler_pos: Object forwarded as ``scaler_pos``.
            Defaults to the notebook global ``scaler_pos_v7``.
        scaler_length: Object forwarded as ``scaler_length``.
            Defaults to the notebook global ``scaler_length_v7``.
        file_names: Label for the input, forwarded as a one-element list.
        _stopWords: Stop-word list forwarded to ``preprocessV7input``.
            Defaults to the notebook global ``stpwords``.
        model: Object whose ``predict`` method returns cluster ids.
            Defaults to the notebook global ``kmeans_v7``.
        output_dict: Mapping from cluster id to level number.
            Defaults to the notebook global ``pred_v7_dict``.

    Returns:
        str: ``"Level-<n> of 6 levels"`` where ``<n>`` is the mapped level.
    """
    # Defaults are resolved at call time rather than at definition time, so
    # this cell can be executed before the cells that load the V7 artifacts.
    if tfidf is None:
        tfidf = tf_idf_v7
    if pca is None:
        pca = pca_v7
    if scaler_pos is None:
        scaler_pos = scaler_pos_v7
    if scaler_length is None:
        scaler_length = scaler_length_v7
    if _stopWords is None:
        _stopWords = stpwords
    if model is None:
        model = kmeans_v7
    if output_dict is None:
        output_dict = pred_v7_dict

    # Forward the arguments actually received; the previous version ignored
    # them and always used the notebook globals.
    df = preprocessV7input(corpusList=[textCorpus],
                           tfidf=tfidf,
                           pca=pca,
                           scaler_pos=scaler_pos,
                           scaler_length=scaler_length,
                           file_names=[file_names],
                           _stopWords=_stopWords)

    # The last column is not a model feature (presumably the file name --
    # TODO confirm against preprocessV7input), so it is excluded.
    cluster_id = model.predict(df.iloc[:, :-1])[0]
    return "Level-" + str(output_dict[cluster_id]) + " of 6 levels"
    

<h2>Model V6</h2>

In [3]:
# Load the pickled V6 artifacts from ./models/model_v6/.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("./models/model_v6/tf_idf.model", "rb") as fp:
    tf_idf_v6 = pkl.load( fp)
with open("./models/model_v6/pca.model", "rb") as fp:
    pca_v6 = pkl.load( fp)
# The two scalers are loaded but not referenced by any V6 cell visible in this notebook.
with open("./models/model_v6/scaler_pos.model", "rb") as fp:
    scaler_pos_v6 = pkl.load( fp)
with open("./models/model_v6/scaler_length.model", "rb") as fp:
    scaler_length_v6 = pkl.load( fp)
# Object whose .predict() supplies the cluster ids used below.
with open("./models/model_v6/kmeans.model", "rb") as fp:
    kmeans_v6 = pkl.load( fp)

In [4]:
# NOTE(review): this rebinds test_file_names, replacing the list loaded from
# test_file_names.pkl with whatever getFileNames derives from the full paths.
test_file_names = getFileNames(test_file_path)
# Contents of the test files, read via readFiles with its default options.
test_corpus_list = readFiles(test_file_path)

In [5]:
# Maps a cluster id returned by kmeans_v6.predict to the level number (1-6)
# that predictV6 reports.
pred_v6_dict = {3:1,1:2,4:3,0:4,2:5,5:6}
# Preprocess the whole test corpus for the V6 model.
df_v6 = preprocessV6input(corpusList=test_corpus_list, 
                          tfidf=tf_idf_v6, 
                          pca=pca_v6, 
                          file_names = test_file_names,
                          _stopWords=stpwords)


In [6]:
# Cluster id for each row of df_v6; the last column is excluded from the model input.
v6_preds = kmeans_v6.predict(df_v6.iloc[:,:-1])

In [7]:

# Names of the test files whose V6 prediction is cluster 1.
v6_preds_names = np.array(test_file_names)[np.where(v6_preds == 1)]

# Print each of those files with special-character removal and lower-casing
# switched off, followed by two blank lines.
for doc in v6_preds_names:
    print(
        readFiles([DATASET_PATH + doc], isRemoveSpecailchar=False, isToLower=False),
        end="\n\n\n",
    )

In [42]:
#predict your output
df_v6 = preprocessV6input(corpusList=[test_corpus_list[0]], 
                          tfidf=tf_idf_v6, 
                          pca=pca_v6, 
                          file_names = ["input_file"],
                          _stopWords=stpwords)



'Level-4 of 6 levels'

<h2>Model V7</h2>

In [9]:
# Load the pickled V7 artifacts from ./models/model_v7/.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("./models/model_v7/tf_idf.model", "rb") as fp:
    tf_idf_v7 = pkl.load( fp)
with open("./models/model_v7/pca.model", "rb") as fp:
    pca_v7 = pkl.load( fp)
# Both scalers are passed to preprocessV7input further down.
with open("./models/model_v7/scaler_pos.model", "rb") as fp:
    scaler_pos_v7 = pkl.load( fp)
with open("./models/model_v7/scaler_length.model", "rb") as fp:
    scaler_length_v7 = pkl.load( fp)
# Object whose .predict() supplies the cluster ids used below.
with open("./models/model_v7/kmeans.model", "rb") as fp:
    kmeans_v7 = pkl.load( fp)

In [48]:
# Maps a cluster id returned by kmeans_v7.predict to the level number (1-6)
# that predictV7 reports.
pred_v7_dict = {2:1,4:2,5:3,1:4,0:5,3:6}

# Preprocess the WHOLE test corpus, mirroring the V6 cell. The following cell
# indexes the full test_file_names list with the predictions made from df_v7,
# so a one-document slice here would leave predictions and file names
# misaligned.
df_v7 = preprocessV7input(corpusList=test_corpus_list,
                          tfidf=tf_idf_v7,
                          pca=pca_v7,
                          scaler_pos=scaler_pos_v7,
                          scaler_length=scaler_length_v7,
                          file_names=test_file_names,
                          _stopWords=stpwords)



100%|██████████| 1/1 [00:00<00:00, 144.72it/s]


In [12]:
# Cluster id for each row of df_v7; the last column is excluded from the model input.
v7_preds = kmeans_v7.predict(df_v7.iloc[:, :-1])

# Names of the test files whose V7 prediction is cluster 1.
v7_preds_names = np.array(test_file_names)[np.where(v7_preds == 1)]

# Print each of those files with special-character removal and lower-casing
# switched off, followed by two blank lines.
for doc in v7_preds_names:
    print(
        readFiles([DATASET_PATH + doc], isRemoveSpecailchar=False, isToLower=False),
        end="\n\n\n",
    )

array([4, 1, 5, 2, 1, 4, 1, 5, 0, 4, 2, 5, 2, 4, 4, 5, 2, 4, 4, 4, 3, 1,
       4, 4, 4, 2, 5, 1, 5, 5, 4, 2, 3, 0, 1, 3, 4, 2, 5, 4, 4, 5, 2, 2],
      dtype=int32)

<h2>Prediction for V7</h2>
<h3>Input text corpus to check the level</h3>

In [59]:
predictV7(test_corpus_list[0])
# To score your own text, pass the text corpus to the function:
# predictV7( INPUT_CORPUS )

100%|██████████| 1/1 [00:00<00:00, 147.38it/s]


'Level-2 of 6 levels'

<h2>Prediction for V6</h2>
<h3>Input text corpus to check the level</h3>

In [60]:
predictV6(test_corpus_list[3])
# To score your own text, pass the text corpus to the function:
# predictV6( INPUT_CORPUS )

'Level-4 of 6 levels'