In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Úprava surových dat: anonymizace

In [None]:
# Anonymise the raw respondent table: the name columns must be removed
# BEFORE the frame is written, otherwise the saved file still contains them.
df = pd.read_csv("data/input/respondents.csv", delimiter='|')
df = df.drop(columns=['first_name', 'last_name'])
# The index is still written as the first (unnamed) CSV column.
df.to_csv('data/respondents.csv', sep='|')

### Načtení anonymizovaných dat 

In [None]:
# Reload the respondent table from the pipe-separated file and preview it.
df = pd.read_csv("data/respondents.csv", sep='|')
df.head(n=5)

In [None]:
# 'Unnamed: 0' is the index column stored in the CSV. Rebinding `df` with
# errors='ignore' (instead of an in-place drop) keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-removed column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

In [None]:
# Descriptions that contain a hyphen but not the substring 'ex-'.
df.loc[
    df.descriptions.str.contains('-') & ~df.descriptions.str.contains('ex-'),
    'descriptions',
]

In [None]:
# Lift the column-width limit so long description texts are shown in full.
pd.options.display.max_colwidth = None
# Rows whose description contains the substring 'A-E'.
df.loc[df.descriptions.str.contains('A-E')]

In [None]:
# Rows whose description mentions 'ID' but not 'OM_ID'.
df.loc[
    df.descriptions.str.contains('ID') & ~df.descriptions.str.contains('OM_ID')
]

In [None]:
# Rows whose description contains the abbreviation prefix 'kand_'.
df.loc[df.descriptions.str.contains('kand_')]

In [None]:
# Rows whose description contains an apostrophe.
df.loc[df.descriptions.str.contains("'")]

In [None]:
import json
import subprocess
# Abbreviation -> expanded text; default mapping used by remove_shortened_words.
zkratky = {'zast_': 'zastupitel ', 'kand_': 'kandidát '}
# Punctuation characters; default set replaced by remove_interpunctions.
interpunkcni_z = '",.;:_!?'
# Path to the JSON stop-word file; default input of remove_stop_words.
stopword_file = 'data/stopwords-cs.json'

def has_numbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit."""
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False

def remove_shortened_words(column: pd.Series, translations: dict = zkratky) -> pd.Series:
    """Expand abbreviations in every string of ``column``.

    Args:
        column: Series of strings.
        translations: Mapping of abbreviation -> replacement text. Entries are
            applied one after another in the mapping's iteration order.

    Returns:
        A new Series with each abbreviation replaced by its expansion.
    """
    without_shortened = column
    for shorter, full in translations.items():
        # regex=False: abbreviations are literal text. Being explicit avoids
        # depending on the default of `regex`, which differs between pandas
        # versions.
        without_shortened = without_shortened.str.replace(shorter, full, regex=False)
    return without_shortened

def remove_interpunctions(column: pd.Series, interpunkcni_z: str = interpunkcni_z, sep: str =' ') -> pd.Series:
    """Replace every punctuation character in ``column`` with ``sep``.

    Args:
        column: Series of strings.
        interpunkcni_z: String whose individual characters are treated as
            punctuation to be replaced.
        sep: Replacement text inserted for each punctuation character.

    Returns:
        A new Series with all listed characters replaced by ``sep``.
    """
    without_interpunction = column
    for letter in interpunkcni_z:
        # regex=False: characters such as '.' and '?' are regex
        # metacharacters, so they must be matched literally regardless of the
        # pandas version's default for `regex`.
        without_interpunction = without_interpunction.str.replace(letter, sep, regex=False)
    return without_interpunction

def split_into_words(column: pd.Series, sep=' ') -> pd.Series:
    """Split each string of ``column`` on ``sep`` into a list of tokens.

    Consecutive separators yield empty-string tokens.
    """
    tokenized = column.str.split(pat=sep)
    return tokenized

def remove_stop_words(column: pd.Series, filename: str = stopword_file) -> pd.Series:
    """Drop stop words from every word list in ``column``.

    Args:
        column: Series whose elements are lists of words.
        filename: Path to a JSON file holding the stop words
            (assumed to be a flat JSON array of strings — TODO confirm).

    Returns:
        A Series of lists with every word found in the stop-word file removed.
    """
    # Explicit UTF-8: without it open() falls back to the platform's locale
    # encoding, which can mis-decode non-ASCII stop words.
    with open(filename, 'r', encoding='utf-8') as fd:
        # A set gives constant-time membership tests in the filter below.
        stop_words = set(json.load(fd))
    # The file is no longer needed here, so filter outside the `with` block.
    return column.apply(lambda words: [word for word in words if word not in stop_words])
def remove_empty(column: pd.Series) -> pd.Series:
    """Remove empty-string tokens from every word list in ``column``."""
    def _drop_blank(tokens):
        return list(filter(lambda token: token != '', tokens))

    return column.apply(_drop_blank)
def remove_numbers(column: pd.Series) -> pd.Series:
    """Remove every token that contains a digit from each word list in ``column``."""
    def _without_digit_tokens(tokens):
        kept = []
        for token in tokens:
            if has_numbers(token):
                continue
            kept.append(token)
        return kept

    return column.apply(_without_digit_tokens)

def lemmatizate_word(word: str) -> str:
    """Return the lemma of ``word`` as produced by the external ``majka`` tool.

    The UTF-8 encoded word is fed on stdin to the shell pipeline below; the
    first ``:``-separated field of its first output line is taken as the
    lemma. When the pipeline prints nothing, the word itself is returned.
    """
    encoded = bytes(word, encoding='utf-8')
    finished = subprocess.run(
        "./majka -f majka.w-lt| head -n1 | cut -d ':' -f1",
        shell=True,
        input=encoded,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    raw_lemma = finished.stdout.strip(b'\n')
    # Fall back to the original word when no lemma was produced.
    result = raw_lemma if raw_lemma else encoded
    return result.decode(encoding='utf-8')

def lemmatizate_words(words: list) -> list:
    """Lemmatise each word of ``words`` and return the lemmas in the same order."""
    return [lemmatizate_word(token) for token in words]

def lemmatizate(column: pd.Series) -> pd.Series:
    """Lemmatise every word list in ``column`` element by element."""
    lemmatized_column = column.apply(lemmatizate_words)
    return lemmatized_column

def apply_pipeline(column: pd.Series, pipeline: list) -> pd.Series:
    """Run ``column`` through each step of ``pipeline`` in order.

    Every step is called with the result of the previous one. The chain
    starts from a copy of ``column``, so an empty pipeline returns that copy.
    """
    result = column.copy()
    for step in pipeline:
        result = step(result)
    return result

def lower(column: pd.Series) -> pd.Series:
    """Return ``column`` with every string converted to lower case."""
    lowered = column.str.lower()
    return lowered

In [None]:
import time

from pandarallel import pandarallel

# Cleaning steps, applied in order. Abbreviations are expanded before
# punctuation is removed because the abbreviation keys contain '_', which is
# itself one of the punctuation characters.
pipeline = [lower, remove_shortened_words, remove_interpunctions, split_into_words, remove_empty, remove_numbers, remove_stop_words]

# perf_counter() measures elapsed wall-clock time. process_time() would count
# only the CPU time of this process and therefore miss the time spent in the
# external lemmatiser subprocesses started by the parallel step below.
start = time.perf_counter()

df['descr_procc'] = apply_pipeline(df.descriptions, pipeline)
print(f'{(time.perf_counter() - start)/60} min')

pandarallel.initialize()
df['descr_procc'] = df['descr_procc'].parallel_apply(lemmatizate_words)
# Cumulative time since `start` (cleaning + lemmatisation).
print(f'{(time.perf_counter() - start)/60} min')

In [None]:
# Persist the processed frame and preview its first rows.
df.to_csv(path_or_buf='data/proccessed.csv')
df.head(n=5)