In [None]:
import pandas as pd
import sys, os

# Put the notebook's parent directory on the import path (presumably so the
# project-local `crimebb` package imported in the next cell can be found).
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
from crimebb import *

In [None]:
# !pip install pyenchant

In [None]:
# !pip install nltk

In [None]:
# !pip install wordcloud

In [None]:
# !pip install pycountry

In [None]:
# !pip install multidict

In [None]:
# Year of the data export to analyse; used to build the CSV directory path.
YEAR = "2021"

In [None]:
# Root data directory, relative to the notebook location.
DATA_PATH = "../data/"

# Directory holding the tab-separated exports for the selected YEAR.
CSV_PATH = f"{DATA_PATH}csv/{YEAR}/"

In [None]:
# Candidate languages passed to the language-detection and word-frequency helpers.
language_to_eval = [
    "russian",
    "english",
    "german",
]

### Loading data

#### Boards

In [None]:
# Load the board catalogue (tab-separated export) and derive one row per board
# with normalised URLs and the host name of the site it belongs to.
boards_df = (
    pd.read_csv(f"{CSV_PATH}boards.csv", sep="\t", low_memory=False)
    # Rewrite the antichat.com domain to forum.antichat.ru. The vectorised
    # `.str` accessor passes a missing URL through as NaN instead of raising
    # AttributeError the way `x.replace` on a float NaN does.
    .assign(url=lambda df: df["url"].str.replace("antichat.com", "forum.antichat.ru", regex=False))
    # Host part of the URL: strip only a *leading* scheme, then keep everything
    # up to the first "/". Testing `"https" in url` on the whole string made an
    # http:// URL that merely contains "https" later on resolve to "http:".
    .assign(
        site_name=lambda df: (
            df["url"]
            .str.replace(r"^https?://", "", regex=True)
            .str.split("/")
            .str[0]
        )
    )
    .loc[:, ["id", "site_id", "site_name", "name", "url"]]
    # One de-duplication on the selected columns keeps exactly the rows that a
    # full-frame pass followed by this one would keep.
    .drop_duplicates()
    .rename(columns={"id":"board_id", "name":"board_title", "url":"board_url"})
)
boards_df

#### Website

In [None]:
# Distinct (site_name, site_id) pairs taken from the board catalogue.
website_df = boards_df[["site_name", "site_id"]].drop_duplicates()
website_df

#### Threads

In [None]:
# Load the thread export (tab-separated) and keep one row per thread with
# normalised URLs and project-wide column names.
threads_df = (
    pd.read_csv(f"{CSV_PATH}threads.csv", sep="\t", low_memory=False)
    # Rewrite the antichat.com domain to forum.antichat.ru. `.str.replace`
    # leaves a missing URL as NaN rather than raising AttributeError like
    # `x.replace` on a float NaN.
    .assign(url=lambda df: df["url"].str.replace("antichat.com", "forum.antichat.ru", regex=False))
    .loc[:, ["id", "site_id", "board_id", "creator", "creator_id", "name", "url", "created_on"]]
    # A single pass on the selected columns yields the same rows as the
    # previous full-frame pass followed by this one.
    .drop_duplicates()
    .rename(columns={"creator":"username", "id":"thread_id", "creator_id":"user_id", "name":"thread_title", "url":"thread_url", "created_on": "thread_data_creation"})
)
# Disabled filter, kept for reference: drops threads whose username is missing or "none".
#threads_df = threads_df[ (threads_df["username"].str.lower()!="none") & (~threads_df["username"].isna()) ].copy().drop_duplicates()
threads_df

#### Posts

In [None]:
# Load the post export (tab-separated), keep the columns used downstream,
# rename them to the project-wide names and drop posts without any content.
posts_df = (
    pd.read_csv(f"{CSV_PATH}posts.csv", sep="\t", low_memory=False)
    .loc[:, ["id", "site_id", "board_id", "thread_id", "creator", "creator_id", "creator_n_posts", "creator_reputation", "content", "quoted_post_ids", "created_on"]]
    # One de-duplication on the selected columns is sufficient: a full-frame
    # pass beforehand and another pass after the null filter cannot remove any
    # additional row, they only add extra passes over the frame.
    .drop_duplicates()
    .rename(columns={"creator":"username", "id":"post_id", "creator_id":"user_id", "creator_n_posts":"user_num_posts", "creator_reputation":"user_reputation", "created_on": "post_data_creation"})
    # The text pre-processing further down needs a string in every row.
    .dropna(subset=["content"])
)
# Disabled filter, kept for reference: drops posts whose username is missing or "none".
#posts_df = posts_df[ (posts_df["username"].str.lower()!="none") & (~posts_df["username"].isna()) ].copy().drop_duplicates()
posts_df

#### Joining posts with website, board and thread metadata

In [None]:
# Attach the site name to every post; the left join keeps posts whose site_id
# has no match in website_df.
posts_website_df = posts_df.merge(website_df, on="site_id", how="left")
posts_website_df

In [None]:
# Attach the board title, matching on the (site_id, board_id) pair.
board_titles = boards_df[["site_id", "board_id", "board_title"]].drop_duplicates()
posts_boards_df = posts_website_df.merge(
    board_titles,
    how="left",
    on=["site_id", "board_id"],
)
posts_boards_df

In [None]:
# Attach the thread title, matching on (site_id, board_id, thread_id).
thread_titles = threads_df[["site_id", "board_id", "thread_id", "thread_title"]].drop_duplicates()
posts_threads_df = posts_boards_df.merge(
    thread_titles,
    how="left",
    on=["site_id", "board_id", "thread_id"],
)
posts_threads_df

#### Pre-processing content

In [None]:
# Characters replaced by a space before language detection. Written as a raw
# string so that `\[`, `\]` and `\n` reach the regex engine verbatim; the
# previous non-raw literal relied on invalid string escapes (`\|`, `\[`, `\]`),
# which newer Python versions warn about.
# NOTE(review): the backslash itself is NOT part of this class. The original
# `\\\'` reached the regex as an escaped quote, and that behaviour is kept.
# Confirm whether backslashes should be stripped as well.
PUNCTUATION_PATTERN = r"""['"+@_!#$%^&*,;()<>?/|\[\]}{~:=\n]"""

# Clean the post text with pandas' vectorised string methods. This no longer
# needs the `re` module, which this notebook never imports explicitly.
# The literal replacements stay sequential (not merged into one regex) so the
# result is identical to applying them one after another.
posts_threads_df["content_processed"] = (
    posts_threads_df["content"]
    .str.replace("https", "", regex=False)   # URL scheme remnants
    .str.replace("http", "", regex=False)
    .str.replace("...", "", regex=False)     # ellipses, longest form first
    .str.replace("..", "", regex=False)
    .str.replace(PUNCTUATION_PATTERN, " ", regex=True)
    .str.lower()
)

#### Detecting languages

In [None]:
def _score_languages(text):
    """Run `detect_language` on one cleaned post, restricted to `language_to_eval`.

    The result is consumed below through `.get` and `.values()`, so it is
    presumably a language -> score mapping (confirm in `crimebb`).
    """
    return detect_language(str(text), language_to_eval=language_to_eval)


posts_threads_df["language_content"] = posts_threads_df["content_processed"].apply(_score_languages)

In [None]:
def _top_language(scores):
    """Return the key of `scores` with the largest associated value."""
    return max(scores, key=scores.get)


posts_threads_df["language_detected"] = posts_threads_df["language_content"].apply(_top_language)

In [None]:
def _top_score(scores):
    """Return the largest value stored in `scores`."""
    return max(scores.values())


posts_threads_df["language_score"] = posts_threads_df["language_content"].apply(_top_score)

In [None]:
# Sanity check: list the distinct languages that were assigned to posts.
detected_languages = posts_threads_df["language_detected"].unique()
detected_languages

In [None]:
# Persist the enriched post table (tab-separated, without the index column)
# next to the input exports.
output_path = f"{CSV_PATH}post_thread_board.csv"
posts_threads_df.to_csv(output_path, sep="\t", index=False)

#### Languages

In [None]:
# Number of posts per detected language, largest group first.
lang_post_df = (
    posts_threads_df
    .pivot_table(columns=["language_detected"], aggfunc="size")
    .sort_values(ascending=False)
)
lang_post_df

In [None]:
import matplotlib.pyplot as plt

# Average number of documents per detected language, shown in the title.
mean_val = round(lang_post_df.mean(), 2)

languages = lang_post_df.index.values
doc_counts = lang_post_df.values

fig, ax = plt.subplots(figsize=(16,8))
ax.bar(languages, doc_counts)

# Label each bar with its document count, anchored at the bar's height.
for language, count in zip(languages, doc_counts):
    ax.text(language, count, count)

ax.set_xlabel("Languages detected")
ax.set_ylabel("Number of documents")
ax.set_title(f"Number of documents per language, Avg: {mean_val}")
plt.show()

#### Wordclouds

In [None]:
# NOTE(review): this looks like a deliberate stop so that "Run All" does not
# continue into the word-cloud cells below. Delete this cell if they should be
# part of a full run.
# `os.exit` does not exist, so the previous call halted the run only by
# accident, through an AttributeError. `sys.exit` raises SystemExit with an
# explicit message instead (presumably IPython reports it and keeps the kernel
# and its variables alive; confirm in your front end).
sys.exit("Stopping before the word-cloud section; run the cells below manually.")

In [None]:
# Posts detected as English, and their cleaned text joined into one string.
is_english = posts_threads_df["language_detected"] == "english"
english_df = posts_threads_df.loc[is_english].copy()
english_content = " ".join(english_df["content_processed"])

In [None]:
# Posts detected as German, and their cleaned text joined into one string.
is_german = posts_threads_df["language_detected"] == "german"
german_df = posts_threads_df.loc[is_german].copy()
german_content = " ".join(german_df["content_processed"])

In [None]:
# Posts detected as Russian, and their cleaned text joined into one string.
is_russian = posts_threads_df["language_detected"] == "russian"
russian_df = posts_threads_df.loc[is_russian].copy()
russian_content = " ".join(russian_df["content_processed"])

In [None]:
# Build the input for the English word cloud from the joined English text.
# `getFrequencyDictForText` is not defined in this notebook (presumably it comes
# from `crimebb` and returns word frequencies; confirm there).
en_dict = getFrequencyDictForText(english_content, language_to_eval=language_to_eval)

In [None]:
# Build the input for the German word cloud from the joined German text.
# `getFrequencyDictForText` is not defined in this notebook (presumably it comes
# from `crimebb` and returns word frequencies; confirm there).
german_dict = getFrequencyDictForText(german_content, language_to_eval=language_to_eval)

In [None]:
# Build the input for the Russian word cloud from the joined Russian text.
# `getFrequencyDictForText` is not defined in this notebook (presumably it comes
# from `crimebb` and returns word frequencies; confirm there).
ru_dict = getFrequencyDictForText(russian_content, language_to_eval=language_to_eval)

In [None]:
# Render the word cloud for the English posts (helper not defined in this
# notebook; presumably provided by `crimebb`).
showWordCloud(en_dict)

In [None]:
# Render the word cloud for the German posts (helper not defined in this
# notebook; presumably provided by `crimebb`).
showWordCloud(german_dict)

In [None]:
# Render the word cloud for the Russian posts (helper not defined in this
# notebook; presumably provided by `crimebb`).
showWordCloud(ru_dict)