In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import spacy
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_tidy_text(_txt):
    _doc = DBAS_NLP(_txt)
    _tidy_word = [token.text for token in _doc if not token.is_stop]
    return " ".join(_tidy_word)

def get_lemma_pos(_txt):
    _doc = DBAS_NLP(_txt)
    _d = {"_POS": [], "_LEMMA": []}
    for _token in _doc:
        _d["_POS"].append(_token.pos_)
        _d["_LEMMA"].append(_token.lemma_)
    return _d

def _generate_word_cloud(_txt):
    x, y = np.ogrid[:300, :300]
    mask = (x - 150) ** 2 + (y - 150) ** 2 > 130**2
    mask = 255 * mask.astype(int)
    wc = WordCloud(background_color="white", repeat=True, mask=mask)
    wc.generate(_txt)
    plt.axis("off")
    plt.imshow(wc, interpolation="bilinear")
    plt.show()

def get_word_rep(_word, _rep):
    repeated_word = f"{_word} " * _rep
    return repeated_word

In [3]:
df_challenges = pl.read_csv(
    r"/Users/malleshamyamulla/Desktop/SSBBA/MBB_PROJECT/data/dbas_sds_challenges.csv"
).with_row_count()

los_samples = []
for _ in range(10):
    _df = df_challenges.sample(10, with_replacement=False,seed=100)
    los_samples.append(_df)

df_conso = pl.concat(los_samples).to_pandas()

DBAS_NLP = spacy.load(
    "en_core_web_sm"
)

df_conso["_tidy_text_1"] = df_conso["#CHALLENGE"].apply(lambda x: get_tidy_text(x))

los_doc_dicts = []
for _sen in df_conso["_tidy_text_1"]:
    _d = get_lemma_pos(_sen)
    los_doc_dicts.append(_d)

los_dfs = []
for _d in los_doc_dicts:
    _DF = pd.DataFrame(_d)
    los_dfs.append(_DF)

DF_POS = pd.concat(los_dfs)

DF_POS_TIDY = DF_POS[DF_POS["_POS"] != "PUNCT"]

_LOS_NOUNS = (
    DF_POS_TIDY[DF_POS_TIDY["_POS"] == "NOUN"]
    .groupby("_LEMMA")
    .count()
    .reset_index()
    .sort_values("_POS", ascending=False)
)

_ADJCT = (
    DF_POS_TIDY[DF_POS_TIDY["_POS"] == "ADJ"]
    .groupby("_LEMMA")
    .count()
    .reset_index()
    .sort_values("_POS", ascending=False)
)

_VERBS = (
    DF_POS_TIDY[DF_POS_TIDY["_POS"] == "VERB"]
    .groupby("_LEMMA")
    .count()
    .reset_index()
    .sort_values("_POS", ascending=False)
)

LOS_VERBS = _VERBS[["_LEMMA", "_POS"]].apply(lambda x: get_word_rep(x[0], x[1]), axis=1)
LOS_NOUNS = _LOS_NOUNS[["_LEMMA", "_POS"]].apply(lambda x: get_word_rep(x[0], x[1]), axis=1)
LOS_ADJ = _ADJCT[["_LEMMA", "_POS"]].apply(lambda x: get_word_rep(x[0], x[1]), axis=1)

todos_verb_list = []
for _verb in LOS_VERBS:
    _vs = _verb.split(" ")
    todos_verb_list.extend(_vs)

todos_noun_list = []
for _verb in LOS_NOUNS:
    _vs = _verb.split(" ")
    todos_noun_list.extend(_vs)

todos_adj_list = []
for _verb in LOS_ADJ:
    _vs = _verb.split(" ")
    todos_adj_list.extend(_vs)

text_verb = " ".join([_word for _word in todos_verb_list if len(_word) != 0])
text_noun = " ".join([_word for _word in todos_noun_list if len(_word) != 0])
text_adj = " ".join([_word for _word in todos_adj_list if len(_word) != 0])

In [4]:
text_verb

'extract extract extract extract extract extract extract extract extract extract extract extract extract extract extract extract extract extract extract extract take take take take take take take take take take take take take take take take take take take take consume consume consume consume consume consume consume consume consume consume handle handle handle handle handle handle handle handle handle handle'

In [None]:
df_PII=pd.read_excel(r'/Users/malleshamyamulla/Desktop/SSBBA/MBB_PROJECT/data/class_pii_phi_tiny_v2.xlsx',sheet_name='PIIPHI')
df_NOPII=pd.read_excel(r'/Users/malleshamyamulla/Desktop/SSBBA/MBB_PROJECT/data/class_pii_phi_tiny_v2.xlsx',sheet_name='NOPIIPHI')

In [None]:
df_NOPII.head()

In [None]:
df_PII.head()