In [None]:
import os
import sys

import numpy as np
import pandas as pd

# Make the repository root (two levels above this notebook) importable,
# adding it to sys.path only once even if the cell is re-run.
module_path = os.path.abspath('../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [None]:
from crimebb import *

In [None]:
from tqdm.notebook import tqdm

# Hook tqdm into pandas; per the tqdm documentation this registers the
# `progress_apply` family of methods on pandas objects.
# NOTE(review): nothing visible in this notebook calls those methods -- the
# detection loop further down wraps `iterrows()` with `tqdm(...)` directly.
tqdm.pandas()

In [None]:
# Dataset year; interpolated into the CSV directory and file names below.
YEAR = "2021"

In [None]:
# Root of the data directory, relative to this notebook.
DATA_PATH = "../../data/"
# Per-year CSV directories: summary files and processed files.
CSV_PATH = f"{DATA_PATH}csv/{YEAR}/summary/"
CSV_PROCESSED = f"{DATA_PATH}csv/{YEAR}/processed/"

In [None]:
# Runs before any cell reads from or writes to CSV_PROCESSED.
# NOTE(review): `verifyDir` comes from the `crimebb` star-import; presumably it
# creates the directory when it is missing -- confirm in crimebb.
verifyDir(CSV_PROCESSED)

### Loading and Analyzing Language

In [None]:
# Candidate languages handed to detect_language_and_words() in the detection loop.
language_to_eval = [
    "russian",
    "german",
    "english",
]

In [None]:
# Build `language_df`: one row per post with non-null content, plus empty
# (NaN) result columns for the language-detection step.
#
# The CSV is streamed in chunks instead of being loaded in one go. Compared to
# the previous version of this cell:
#   * the leftover debugging lines (`os.exit()` -- which does not exist and
#     raised AttributeError -- and the unconditional `break`) are gone, so the
#     loop body actually runs;
#   * the loop ends when the reader is exhausted, not when a de-duplicated
#     chunk happens to be shorter than `chunk_size` (which truncated the data
#     as soon as a full chunk contained a duplicate row);
#   * chunks are collected in a list and concatenated once, instead of
#     re-copying the growing frame on every iteration.

# Number of CSV rows parsed per chunk.
chunk_size = 1000000

# Identifier columns plus the raw post text.
content_columns = ["post_id", "site_id", "board_id", "thread_id", "content"]
# Columns filled in later by the detection loop; they start out as NaN.
result_columns = [
    "lang_ratio",
    "language_detected",
    "lang_correct_words",
    "len_correct_words",
    "lang_incorrect_words",
    "len_incorrect_words",
]

# With `chunksize` set, read_csv returns an iterator of DataFrames.
crime_reader = pd.read_csv(
    f"{CSV_PROCESSED}crimeBB_{YEAR}.csv",
    sep="\t",
    low_memory=False,
    chunksize=chunk_size,
)

content_chunks = []
for current_df in crime_reader:
    # Drop posts without text, then exact duplicates within this chunk.
    # Duplicates that straddle two chunks are kept, as before.
    content_df = (
        current_df[content_columns]
        .dropna(subset=["content"])
        .drop_duplicates()
    )
    content_chunks.append(content_df)

if content_chunks:
    language_df = pd.concat(content_chunks, ignore_index=True)
else:
    # Empty input file: keep the expected schema so later cells do not fail
    # on missing columns.
    language_df = pd.DataFrame(columns=content_columns)

for column in result_columns:
    language_df[column] = np.nan

In [None]:
# Load the complete processed dump (tab-separated) into memory in one go.
crimeBB_df = pd.read_csv(
    f"{CSV_PROCESSED}crimeBB_{YEAR}.csv",
    sep="\t",
    low_memory=False,
)

In [None]:
# Display the loaded frame as the cell output for a quick visual check.
crimeBB_df

#### Detecting languages

In [None]:
# Either resume from a previously saved checkpoint or start from the raw dump.
# (A regex-based punctuation strip + lowercase step used to sit in the
# "start fresh" branch; it is disabled.)
checkpoint_found = verifyFile(f"{CSV_PROCESSED}content_languages.csv")

if checkpoint_found:
    content_df = pd.read_csv(f"{CSV_PROCESSED}content_languages.csv", sep="\t", low_memory=False)
    # post_ids of rows that already carry a detected language.
    has_language = content_df["language_detected"].notna()
    list_post = content_df.loc[has_language, "post_id"].values
else:
    # Identifier columns + post text, without empty posts and exact duplicates.
    content_df = (
        crimeBB_df[["post_id", "site_id", "board_id", "thread_id", "content"]]
        .dropna(subset=["content"])
        .drop_duplicates()
    )
    # Result columns start empty; the detection loop fills them row by row.
    for empty_column in (
        "lang_ratio",
        "language_detected",
        "lang_correct_words",
        "len_correct_words",
        "lang_incorrect_words",
        "len_incorrect_words",
    ):
        content_df[empty_column] = np.nan
    list_post = []

In [None]:
# Display the frame that the detection loop will fill in.
content_df

In [None]:
# Run language detection post by post and store the results in content_df,
# writing a checkpoint CSV every 10k processed posts.
#
# Rows that already have a detected language (i.e. rows restored from the
# checkpoint file) are skipped, so an interrupted run resumes where it stopped
# instead of re-detecting everything. The check is done on the row itself
# rather than through `list_post`, so it does not rely on post_id being unique
# across sites.
count = 1
for index, row in tqdm(content_df.iterrows(), total=len(content_df)):

    if pd.notna(row["language_detected"]):
        continue

    lang_ratio, lang_detected, lang_badwords, lang_goodwords = detect_language_and_words(row["content"], language_to_eval=language_to_eval)

    # NOTE(review): the one-element list / dict wrappers below look like a
    # workaround for storing a container in a single cell -- confirm that the
    # stored value is what downstream code expects.
    content_df.loc[index, "lang_ratio"] = [lang_ratio]
    content_df.loc[index, "language_detected"] = lang_detected
    content_df.loc[index, "lang_correct_words"] = {"_":lang_goodwords}
    content_df.loc[index, "len_correct_words"] = len(lang_goodwords)
    content_df.loc[index, "lang_incorrect_words"] = {"_":lang_badwords}
    content_df.loc[index, "len_incorrect_words"] = len(lang_badwords)

    if count%10000==0: # save a checkpoint every 10k processed posts
        content_df.to_csv(f"{CSV_PROCESSED}content_languages.csv", sep='\t', index=False)

    count+=1
    

In [None]:
# Final save of the detection results (tab-separated, without the index column).
content_df.to_csv(
    f"{CSV_PROCESSED}content_languages.csv",
    sep='\t',
    index=False,
)

### Languages

In [None]:
# Number of posts per detected language, largest first.
lang_post_df = (
    content_df
    .pivot_table(columns=['language_detected'], aggfunc='size')
    .sort_values(ascending=False)
)
lang_post_df

In [None]:
import matplotlib.pyplot as plt

# Average number of documents per detected language, shown in the title.
mean_val = round(lang_post_df.mean(), 2)

languages = lang_post_df.index.values
doc_counts = lang_post_df.values

fig, ax = plt.subplots(figsize=(16,8))
ax.bar(languages, doc_counts)

# Write each count at the top of its bar.
for language, n_docs in zip(languages, doc_counts):
    ax.text(language, n_docs, n_docs)

ax.set_xlabel("Languages detected")
ax.set_ylabel("Number of documents")
ax.set_title(f"Number of documents per language, Avg: {mean_val}")
plt.show()

#### Russian

In [None]:
# Posts labelled as Russian; copied so later edits cannot touch content_df.
is_russian = content_df["language_detected"].eq("russian")
russian_df = content_df.loc[is_russian].copy()
# Cell output: a tuple of (info() return value, shape, first row).
russian_df.info(), russian_df.shape, russian_df.iloc[0]

In [None]:
# Concatenate every Russian post into one space-separated string.
russian_posts = russian_df["content"].tolist()
russian_content = ' '.join(russian_posts)

In [None]:
# NOTE(review): `get_text_frequency` comes from `crimebb`; presumably it returns
# a token -> count mapping for the given text -- confirm.
russian_dict = get_text_frequency(russian_content)
# Cell output: number of entries in the result.
len(russian_dict)

In [None]:
# max_words is set to the number of entries, i.e. no entry is excluded by the cap.
showWordCloud(russian_dict, "Russian-Wordcloud", max_words=len(russian_dict))

#### English

In [None]:
# Posts labelled as English; copied so later edits cannot touch content_df.
is_english = content_df["language_detected"].eq("english")
english_df = content_df.loc[is_english].copy()
# Cell output: a tuple of (info() return value, shape, first row).
english_df.info(), english_df.shape, english_df.iloc[0]

In [None]:
# Concatenate every English post into one space-separated string.
english_posts = english_df["content"].tolist()
english_content = ' '.join(english_posts)

In [None]:
# NOTE(review): `get_text_frequency` comes from `crimebb`; presumably it returns
# a token -> count mapping for the given text -- confirm.
english_dict = get_text_frequency(english_content)
# Cell output: number of entries in the result.
len(english_dict)

In [None]:
# max_words is set to the number of entries, i.e. no entry is excluded by the cap.
showWordCloud(english_dict, "English-Wordcloud", max_words=len(english_dict))

#### German

In [None]:
# Posts labelled as German; copied so later edits cannot touch content_df.
is_german = content_df["language_detected"].eq("german")
german_df = content_df.loc[is_german].copy()
# Cell output: a tuple of (info() return value, shape, first row).
german_df.info(), german_df.shape, german_df.iloc[0]

In [None]:
# Concatenate every German post into one space-separated string.
german_posts = german_df["content"].tolist()
german_content = ' '.join(german_posts)

In [None]:
# NOTE(review): `get_text_frequency` comes from `crimebb`; presumably it returns
# a token -> count mapping for the given text -- confirm.
german_dict = get_text_frequency(german_content)
# Cell output: number of entries in the result.
len(german_dict)

In [None]:
# max_words is set to the number of entries, i.e. no entry is excluded by the cap.
showWordCloud(german_dict, "German-Wordcloud", max_words=len(german_dict))