In [1]:
import pandas as pd
import re

class DataExploration():
    
    def __init__(self, data_path = "../../data/processed.parquet"):
        self.data_path = data_path

    def load_data(self):
        self.data = pd.read_parquet(self.data_path)
        return self
    

In [2]:
#dada = pd.read_parquet("../../data/processed.parquet")
#dada.head()

In [30]:
# %%
### prepare
import os
import pandas as pd
import re
from joblib import Parallel, delayed
import pycld2 as cld2
import spacy

class DataProcessing():
    
    def __init__(self,
                data_file = "../../data/ingested.parquet",
                data_folder = "../../data/") -> None:
        self.data_file = data_file
        self.data_folder = data_folder
        pass

    def load_data(self):
        self.data = pd.read_parquet(self.data_file)
        self.data = self.data.iloc[1000:1100,:]
        return self
    
    def _load_text(self, file_path):
        if file_path is None:
            return None
        with open(file_path, "r") as f:
            text = f.read()
        return text

    def _preprocess_text(self, text):
        text = text.lower()
        text = re.sub(r"<.*?>|</.*?>","", text)
        text = re.sub(r"(s?)(f|ht)tp(s?)://\S+\b","", text)
        text = re.sub(r"^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}$","", text) #email
        text = re.sub(r"\\-","", text)
        text = re.sub("[^a-z '.,?!:]"," ", text)
        text = re.sub(r"\b(\w+\s*)\1{1,}", " ", text) #dupli "\\1"
        return re.sub(r" +"," ", text)
    
    def _preprocess_row(self, ind):
        # preprocess
        row = self.data.loc[ind].copy()
        #row["raw_text"] = self._load_text(row["txt_file_destination"])
        row["text"] = self._preprocess_text(
            self._load_text(row["txt_file_destination"]))
        return row
        
    def preprocess_reports(self, n_jobs = 8):
        self.data = self.data.loc[(self.data.txt_file_destination.notnull()),]
        rows_ls = Parallel(n_jobs = n_jobs)(delayed(self._preprocess_row)\
            (ind) for ind in self.data.index)
        self.data = pd.DataFrame(rows_ls)
        return self
    
    def _check_path(self,path):
        if os.path.exists(path):
            return path
        else:
            return None    

    def _get_upos_path(self, file_path,
            dir_name="../../data/txt_files/"):
        base_name = os.path.basename(file_path)
        name, extension = os.path.splitext(base_name)
        return os.path.join(*[dir_name, name+".parquet"])
                
    def _deconstruct_upos_row(self, row, col = "text"):
        nlp = spacy.load("en_core_web_lg")
        nlp.max_length = 20000000
        parsed = nlp(row[col])
        parsed_ls = [(row.name, t.text, t.lemma_, t.pos_, t.tag_, t.dep_,
            t.shape_, t.is_alpha, t.is_stop) for t in parsed]
        return pd.DataFrame(parsed_ls,
            columns=["doc_id","text", "lemma", "pos", "tag",
                "dep", "shape", "is_alpha", "is_stopword"])
    
    def _desconstruct_save_upos_row(self, ind, dir_name, overwrite=False, col = "text"):
        row = self.data.loc[ind,:].copy()
        file_path = self._get_upos_path(row["txt_file_destination"], dir_name)
        if overwrite is False and self._check_path(file_path) is not None:
            row["upos_file_destination"] = file_path
        else:
            upos = self._deconstruct_upos_row(row, col)
            upos.to_parquet(file_path)
            row["upos_file_destination"] = self._check_path(file_path)
        return row
    
    def _load_upos_row(self, ind):
        if self._check_path(self.data.loc[ind,"upos_file_destination"]) is not None:
            return pd.read_parquet(self.data.loc[ind,"upos_file_destination"])
        else:
            return None
    
    def _filter_upos(self, upos):
        # univariate filter
        upos = upos.loc[upos.pos.isin(["NOUN", "ADJ", "VERB"]),:] 
        upos = upos.loc[~upos.is_stopword,:]
        upos = upos.loc[(upos.lemma.str.len()>2) & (upos.lemma.str.len()<19),:]
        # multivariate filter
        lemma_stats = upos.groupby("lemma", as_index=False).agg({"doc_id":["count", "nunique"]})
        pf = (lemma_stats[("doc_id","count")]>1000)&(lemma_stats[("doc_id","nunique")]>500) #500,250
        stopword_set = set([])
        lemma_set = set(lemma_stats.loc[pf,"lemma"].values).difference(stopword_set)
        return upos.loc[upos.lemma.isin(lemma_set),:]

    def _reconstruct_upos(self, upos, col = "reconstructed_text"):
        # reconstruct text
        reconstructed = pd.DataFrame(upos.groupby("doc_id")\
            .apply(lambda x:" ".join(x["lemma"])), columns=[col])
        # clean up
        reconstructed[col] = reconstructed[col].apply(\
            lambda x: re.sub(r'\b(\w+\s*)\1{1,}', '\\1', x))  
        return reconstructed

    def construct_upos(self, n_jobs = 4, dir_name = "../../data/upos_files2/", col = "text"):
        self.data = self.data.loc[(self.data.loc[:,col].notnull())\
            & (~self.data.loc[:,col].isin([""])),]
        # deconstruct and save
        rows_ls = Parallel(n_jobs = n_jobs)\
                (delayed(self._desconstruct_save_upos_row)\
            (ind, dir_name, False) for ind in self.data.index)
        self.data = pd.DataFrame(rows_ls) # NOTE: filter cols
        del rows_ls
        
        # load upos
        upos_ls = [self._load_upos_row(ind) for ind in self.data.index]
        upos = pd.concat(upos_ls)
        del upos_ls
        upos = self._filter_upos(upos)
        # reconstruct text and merge back
        self.data = self.data.merge(self._reconstruct_upos(upos),
            how="inner", left_index=True, right_index=True)   
        return self
    
    def _metadata_row(self, ind, col = "reconstructed_text"):
        row = self.data.loc[ind].copy()
        row["n_chars"] = len(row[col])
        row["n_words"] = len(re.split("\w+",row[col]))
        row["n_sentences"] = len(re.split(r"[.?!]", row[col]))

        lang_estimation = cld2.detect(row[col], returnVectors=True)[2]
        row["language"] = lang_estimation[0][1]
        row["language_score"] = lang_estimation[0][2]/100.0
        return row    
    
    def get_metadata(self, col = "reconstructed_text", n_jobs = 4):
        self.data = self.data.loc[(self.data.loc[:,col].notnull())\
            & (~self.data.loc[:,col].isin([""])),]
        rows_ls = Parallel(n_jobs = n_jobs)(delayed(self._metadata_row)\
            (ind, col) for ind in self.data.index)
        self.data = pd.DataFrame(rows_ls)
        return self
    
    def save_data(self, file_path = None):
        if file_path is None:
            file_path = self.data_folder+"processed.parquet"
        self.data.to_parquet(file_path)
        return self    

# %%
Processing = DataProcessing().load_data().preprocess_reports()#\
    #.construct_upos().get_metadata().save_data()

In [25]:
docs = Processing.data
nlp = spacy.load("en_core_web_lg")
nlp.max_length = 20000000
parsed = nlp.pipe(docs.loc[:,"text"].values, batch_size=10, n_process=8)
parsed_ls = [(docs.index[i], t.text, t.lemma_, t.pos_, t.tag_, t.dep_,
    t.shape_, t.is_alpha, t.is_stop) for i, parsed in enumerate(docs) for t in parsed]

AttributeError: 'str' object has no attribute 'text'

In [32]:
def _get_files(self, dir_name):
    for file in os.listdir(dir_name):
        yield pd.read_parquet(os.path.join(dir_name, file))        

def _get_batch(self, data, batch_size = 80):
    for i in range(0, len(df), batch_size):
        yield data.iloc[i:i+batch_size,:]

def _deconstruct_upos_batch(self, data, col = "text", n_process = 8, batch_size = 10):
    nlp = spacy.load("en_core_web_lg")
    nlp.max_length = 20000000
    docs = nlp.pipe(data.loc[:,col].values,
        n_process=n_process, batch_size=batch_size)
    parsed_ls = [(data.index[i], t.text, t.lemma_, t.pos_, t.tag_, t.dep_,
        t.shape_, t.is_alpha, t.is_stop) for i, parsed in enumerate(docs) for t in parsed]
    return pd.DataFrame(parsed_ls,
        columns=["doc_id","text", "lemma", "pos", "tag",
            "dep", "shape", "is_alpha", "is_stopword"])

def _deconstruct_save_upos_batch(self, data, dir_name, col = "text", n_process = 8, batch_size = 10):
    data_batch = self._get_batch_data(data)
    for i, v in enumerate(data_batch):
        upos = self._deconstruct_upos_batch(data, col, n_process, batch_size)
        upos.to_parquet(dir_name+str(i)+"_batch.parquet")
        del upos
    return None

def _recoconstruct_upos(self, dir_name, col = "reconstructed_text"):
    # reconstruct text
    upos = pd.concat(self._get_files(dir_name)) 
    reconstructed = pd.DataFrame(upos.groupby("doc_id")\
        .apply(lambda x:" ".join(x["lemma"])), columns=[col])
    # clean up
    reconstructed[col] = reconstructed[col].apply(\
        lambda x: re.sub(r'\b(\w+\s*)\1{1,}', '\\1', x))  
    return reconstructed

#df = _deconstruct_upos()

In [33]:

data = Processing.data
data_bach = get_batch(data)

ls = []
while True:
    try:
        ls.append(_deconstruct_upos(next(df)))
    except StopIteration:
        break
df0 = pd.concat(ls)

In [29]:
df0.doc_id.nunique()

100

In [37]:
def get_batch_data(df, batch_size = 80):
    for i in range(0, len(df), batch_size):
        yield df.iloc[i:i+batch_size,:]

def _deconstruct_upos(df, col = "text", n_process = 8, batch_size = 10):
    nlp = spacy.load("en_core_web_lg")
    nlp.max_length = 20000000
    docs = nlp.pipe(df.loc[:,col].values,
        n_process=n_process, batch_size=batch_size)
    parsed_ls = [(df.index[i], t.text, t.lemma_, t.pos_, t.tag_, t.dep_,
        t.shape_, t.is_alpha, t.is_stop) for i, parsed in enumerate(docs) for t in parsed]
    return pd.DataFrame(parsed_ls,
        columns=["doc_id","text", "lemma", "pos", "tag",
            "dep", "shape", "is_alpha", "is_stopword"])

data = Processing.data

data_batch = get_batch_data(data)
for i, v in enumerate(data_batch):
    upos = _deconstruct_upos(v)
    upos.to_parquet("../../data/upos_files/"+str(i)+"_batch.parquet")
    del upos
upos = pd.concat(get_batch_files())    

In [40]:
def get_batch_files(dir_name = "../../data/upos_files/"):
    for file in os.listdir(dir_name):
        yield pd.read_parquet(os.path.join(dir_name, file))

