In [None]:
%load_ext autoreload   
%autoreload 2

from src import *
import dill
import nltk
import os
# Download the complete NLTK data collection (the "all" identifier).
# NOTE(review): this is a large download; which NLTK resources `src` actually
# needs is not visible from this notebook — if only a few are required,
# list them explicitly here instead of "all".
nltk.download('all')

In [None]:
# Shared configuration object: it is passed to most pipeline classes below and
# also supplies file paths (e.g. settings.paths.ws_dataset_dump_file).
settings = Settings()

In [None]:
# Restore the pipeline artefacts from the dump file written by the last cell of
# this notebook, so the expensive cells below do not have to be re-run.
# Raises FileNotFoundError if no dump has been written yet, and KeyError if the
# dump was produced with a different set of keys.
#
# SECURITY: dill.load(), like pickle.load(), can execute arbitrary code embedded
# in the file. Only load dump files that you produced yourself with this notebook.
with open(settings.paths.ws_dataset_dump_file, 'rb') as file:
    dump_dict = dill.load(file)

# Unpack under the same names the computing cells below assign to.
dataset = dump_dict['dataset']
preprocessing_results = dump_dict['preprocessing_results']
metrics_analysis_results = dump_dict['metrics_analysis_results']
feature_extractor = dump_dict['feature_extractor']
entropy_analysis_results = dump_dict['entropy_analysis_results']
pca_analysis_results = dump_dict['pca_analysis_results']

In [None]:
# Build a fresh dataset from the settings and load it.
# NOTE: this rebinds `dataset`, replacing the object restored from the dump file
# if the dump-loading cell was run before this one.
dataset = WritingStyleDataset(settings)
dataset.load()

In [None]:
# Clean the loaded dataset. The return value is not assigned, so clean()
# presumably mutates `dataset` in place — TODO confirm; if so, re-running this
# cell operates on already-cleaned data.
dataset.clean()

In [None]:
# Report the percentage of text that the cleaning step removed from the dataset.
removed_text_percentage = WritingStyleMetadataAnalysis.get_percentage_of_removed_text(dataset)
print(f"Text removed while cleaning: {removed_text_percentage}%.")

In [None]:
# Run the preprocessing stage over the dataset, then show a summary of what it
# produced via the result object's own info() method.
ws_preprocessing = WritingStylePreprocessing(settings)
preprocessing_results = ws_preprocessing.preprocess(dataset)
preprocessing_results.info()

In [None]:
# Compute the metrics analysis from the preprocessing output. The result is an
# input to the feature extractor, the metrics visualization, the entropy
# analysis and the PCA analysis in the cells below.
metrics_analysis_results = WritingStyleMetricsAnalysis().analyze(preprocessing_results)

In [None]:
# Build the feature extractor from the settings and the metrics analysis; it is
# reused by the visualization, entropy and PCA cells below.
feature_extractor = FeatureExtractor(
    settings=settings, 
    metrics_analysis_results=metrics_analysis_results
)
# Last expression of the cell: the feature names are shown as the cell output.
feature_extractor.get_feature_names()

In [None]:
# Create the metrics visualization and render it. The object is kept in a
# variable because the next cell starts its `dash_app`.
metrics_analysis_visualization = WritingStyleMetricsAnalysisVisualization(
    settings=settings, 
    metrics_analysis_results=metrics_analysis_results,
    feature_extractor=feature_extractor
)
metrics_analysis_visualization.visualize()

In [None]:
# Serve the metrics visualization app on port 8000 (the PCA app below uses 8002).
# NOTE(review): `dash_app` is presumably a dash.Dash instance; depending on the
# Dash version/mode, run() may block the kernel until interrupted — confirm.
metrics_analysis_visualization.dash_app.run(port=8000)

In [None]:
# Configure the entropy analysis with the shared settings and feature extractor.
entropy_analysis = WritingStyleEntropyAnalysis(
    settings=settings,
    feature_extractor=feature_extractor,
)
# Run it on the preprocessing output and the metrics analysis; the result is
# consumed by the entropy visualization cell and stored in the final dump.
entropy_analysis_results = entropy_analysis.analyze(
    preprocessing_results=preprocessing_results,
    metrics_analysis_results=metrics_analysis_results
)

In [None]:
# Re-create the analysis object with the same arguments as the previous cell.
# NOTE(review): presumably duplicated so that this cell also works when
# `entropy_analysis_results` was restored from the dump and the previous cell
# was skipped — confirm, otherwise reuse the existing `entropy_analysis`.
entropy_analysis = WritingStyleEntropyAnalysis(
    settings=settings,
    feature_extractor=feature_extractor,
)
# Calls an underscore-prefixed (private) method directly; its return value is
# the last expression of the cell and is therefore shown as the cell output.
entropy_analysis._calculate_entropies_average_data(entropy_results=entropy_analysis_results)

In [15]:
# Build the entropy visualization from the analysis results and the feature
# extractor, then render it; the visualize() result stays the cell's output.
entropy_analysis_visualization = WritingStyleEntropyAnalysisVisualization(
    entropy_analysis_results=entropy_analysis_results,
    feature_extractor=feature_extractor,
)
entropy_analysis_visualization.visualize()

In [None]:
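# Disabled cell: would start the entropy visualization via run() on port 8001.
# NOTE(review): this call passes `preprocessing_results`, whereas the active
# visualization cell above passes `feature_extractor` — check the current
# constructor signature before re-enabling, or delete this cell.
# WritingStyleEntropyAnalysisVisualization(
#     preprocessing_results=preprocessing_results,
#     entropy_analysis_results=entropy_analysis_results
# ).run(port=8001)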

In [None]:
# Configure the PCA analysis with the shared settings and feature extractor,
# then run it on the metrics analysis results.
pca_analysis = WritingStylePCAAnalysis(
    settings=settings,
    feature_extractor=feature_extractor,
)
pca_analysis_results = pca_analysis.get_pca_analysis(
    metrics_analysis_results=metrics_analysis_results
)

In [None]:
# Create the PCA visualization; the object is kept because the next cell starts
# its `dash_app`.
pca_analysis_visualization = WritingStylePCAAnalysisVisualization(pca_analysis_results=pca_analysis_results)
# NOTE(review): the same results are passed again here although the constructor
# already received them — confirm whether the argument is required.
pca_analysis_visualization.visualize_top_features(pca_analysis_results)

In [None]:
# Serve the PCA visualization app on port 8002 (the metrics app above uses 8000).
# NOTE(review): `dash_app` is presumably a dash.Dash instance; depending on the
# Dash version/mode, run() may block the kernel until interrupted — confirm.
pca_analysis_visualization.dash_app.run(port=8002)

In [None]:
# Persist every pipeline artefact so that a later session can restore them with
# the dump-loading cell near the top of the notebook instead of recomputing.
#
# The dump is first written to a sibling temporary file and only moved over the
# real path once dill.dump() has succeeded. Deleting/truncating the existing
# dump up front would destroy the previous good cache whenever serialisation
# fails part-way (e.g. on an object dill cannot pickle).
dump_path = os.fspath(settings.paths.ws_dataset_dump_file)  # assumes a str or os.PathLike path
tmp_dump_path = dump_path + ".tmp"
try:
    with open(tmp_dump_path, 'wb') as file:
        dill.dump(
            obj={
                "dataset": dataset,
                "preprocessing_results": preprocessing_results,
                "metrics_analysis_results": metrics_analysis_results,
                "feature_extractor": feature_extractor,
                "entropy_analysis_results": entropy_analysis_results,
                "pca_analysis_results": pca_analysis_results,
            },
            file=file
        )
    # os.replace() overwrites an existing destination, so no separate
    # exists()/remove() step is needed.
    os.replace(tmp_dump_path, dump_path)
except BaseException:
    # Also covers KeyboardInterrupt: never leave a half-written temp file behind.
    if os.path.exists(tmp_dump_path):
        os.remove(tmp_dump_path)
    raise