In [None]:
import pandas as pd
import sys, os

# Put the directory two levels above the working directory on the import path
# (appended once only), so the local `crimebb` package imported in the next
# cell can be resolved.
module_path = os.path.abspath('../..')
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

In [None]:
from crimebb import *

In [None]:
from tqdm.notebook import tqdm

# Hook tqdm progress bars into pandas. No cell visible in this notebook calls
# the progress_* methods directly, so presumably the crimebb processing steps
# rely on them -- TODO confirm.
tqdm.pandas()

In [None]:
# Dataset year, kept as a string: it is interpolated into the CSV paths and
# passed to CrimeBBManager.
YEAR="2019"

In [None]:
# Root of the data tree, relative to the notebook's working directory.
DATA_PATH="../../data/"
# Per-year CSV folders. Both keep their trailing slash because later cells build
# file paths by plain f-string concatenation. CSV_PATH is not referenced by any
# cell visible in this notebook.
CSV_PATH = f"{DATA_PATH}csv/{YEAR}/summary/"
CSV_PROCESSED = f"{DATA_PATH}csv/{YEAR}/processed/"

In [None]:
# verifyDir comes from the `crimebb` star import. Presumably it makes sure the
# processed-output directory exists before anything is written there -- TODO confirm.
verifyDir(CSV_PROCESSED)

### Loading data

In [None]:
# CrimeBBManager is provided by the `crimebb` star import. It is built from the
# data root and the year, and every process_*_2019 step below goes through it.
crimeBB_data = CrimeBBManager(DATA_PATH, YEAR)

#### Members

In [None]:
# Run the 2019 members processing step. Presumably this produces the
# members.csv that the summary section reads from CSV_PROCESSED -- TODO confirm.
crimeBB_data.process_members_2019()

#### Website

In [None]:
# Run the 2019 sites processing step. Presumably this produces the sites.csv
# that the summary section reads from CSV_PROCESSED -- TODO confirm.
crimeBB_data.process_sites_2019()

#### Boards

In [None]:
# Run the 2019 boards processing step. Presumably this produces the boards.csv
# that the summary section reads from CSV_PROCESSED -- TODO confirm.
crimeBB_data.process_boards_2019()

#### Threads

In [None]:
# Run the 2019 threads processing step. Presumably this produces the
# threads.csv that the summary section reads from CSV_PROCESSED -- TODO confirm.
crimeBB_data.process_threads_2019()

#### Posts

In [None]:
# Run the 2019 posts processing step. Presumably this produces the posts.csv
# that the summary section streams in chunks from CSV_PROCESSED -- TODO confirm.
crimeBB_data.process_posts_2019()

### Summarizing CrimeBB

In [None]:
# Load the four small processed tables (tab-separated) in one pass. The file
# stems are listed in the same order as the frames they are unpacked into.
website_df, members_df, boards_df, threads_df = (
    pd.read_csv(f"{CSV_PROCESSED}{table_stem}.csv", sep="\t", low_memory=False)
    for table_stem in ("sites", "members", "boards", "threads")
)

In [None]:
# Number of rows parsed per chunk while streaming posts.csv.
chunk_size = 1000000

# Stream the posts table chunk by chunk. Iterating the reader stops cleanly at
# end of file. The previous `get_chunk` loop decided whether to continue from
# the chunk length measured *after* drop_duplicates, so it:
#   * stopped early and silently dropped the rest of the file as soon as a full
#     chunk contained a single duplicate row, and
#   * raised StopIteration when the row count was an exact multiple of
#     `chunk_size` (one extra get_chunk call on an exhausted reader).
posts_reader = pd.read_csv(f"{CSV_PROCESSED}posts.csv", sep="\t", low_memory=False, chunksize=chunk_size)

# Duplicates are removed per chunk, as before, so identical rows that fall into
# different chunks are still kept.
posts_chunks = [chunk.drop_duplicates() for chunk in posts_reader]

# Concatenate once instead of growing the frame inside the loop (which re-copied
# everything read so far on every iteration). Guard the no-chunk case because
# pd.concat([]) raises ValueError.
posts_df = pd.concat(posts_chunks, ignore_index=True) if posts_chunks else pd.DataFrame()

In [None]:
# Sanity check on the loaded posts table: column dtypes, non-null counts and
# memory footprint.
posts_df.info()

#### Merging

In [None]:
# Attach each post's thread title with a left join on the thread key.
# NOTE(review): the lookup is de-duplicated on all four columns, so a thread key
# that appears with two different titles would still fan out its posts here.
thread_key_cols = ["site_id", "board_id", "thread_id"]
thread_titles = threads_df[thread_key_cols + ["thread_title"]].drop_duplicates()
posts_threads_df = posts_df.merge(thread_titles, on=thread_key_cols, how="left")

In [None]:
# Attach the site name and board title with a left join on (site_id, board_id),
# using a de-duplicated projection of the boards table.
board_key_cols = ["site_id", "board_id"]
board_lookup = boards_df[["site_id", "site_name", "board_id", "board_title"]].drop_duplicates()
posts_threads_boards_df = posts_threads_df.merge(board_lookup, on=board_key_cols, how="left")

In [None]:
# Columns of the final flat table, in output order.
crimebb_columns = [
    'post_id', 'site_id', 'board_id', 'thread_id', 'user_id',
    'site_name', 'board_title', 'thread_title', 'username', 'content',
    'user_reputation', 'post_data_creation',
]

# Independent copy, so later edits never write through to the merged frame.
crimebb_df = posts_threads_boards_df.loc[:, crimebb_columns].copy()

In [None]:
# Per column: does it contain at least one missing value?
crimebb_df.isna().any()

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
# NOTE(review): consider crimebb_df.head() to keep the saved output small.
crimebb_df

In [None]:
# Persist the flat table next to the other processed files, tab-separated and
# without the positional index.
crimebb_out_path = f"{CSV_PROCESSED}crimeBB_{YEAR}.csv"
crimebb_df.to_csv(crimebb_out_path, sep='\t', index=False)

### Drawing Missing Values

#### username missing or equal to "None"

In [None]:
# Member rows without a usable username: either a real NaN or the literal text
# "none" in any letter case. Exact duplicate rows are collapsed.
members_username_missing = members_df["username"].isna() | members_df["username"].str.lower().eq("none")
none_members_users = members_df.loc[members_username_missing].drop_duplicates()

In [None]:
# Thread rows without a usable username: either a real NaN or the literal text
# "none" in any letter case. Exact duplicate rows are collapsed.
threads_username_missing = threads_df["username"].isna() | threads_df["username"].str.lower().eq("none")
none_threads_users = threads_df.loc[threads_username_missing].drop_duplicates()

In [None]:
# Post rows without a usable username: either a real NaN or the literal text
# "none" in any letter case. Exact duplicate rows are collapsed.
posts_username_missing = posts_df["username"].isna() | posts_df["username"].str.lower().eq("none")
none_posts_users = posts_df.loc[posts_username_missing].drop_duplicates()

#### thread title None

In [None]:
# Threads whose title is missing in the threads table itself.
thread_title_missing = threads_df["thread_title"].isna()
none_threads_title = threads_df.loc[thread_title_missing]

In [None]:
# Posts that ended up without a thread title after the left join: the thread
# either has a null title or did not match any row of the threads lookup.
post_thread_title_missing = posts_threads_df["thread_title"].isna()
none_posts_threads_title = posts_threads_df.loc[post_thread_title_missing]

#### posts content None

In [None]:
# Posts whose content field is missing.
post_content_missing = posts_threads_df["content"].isna()
none_posts_content = posts_threads_df.loc[post_content_missing]

#### Draw

In [None]:
import matplotlib.pyplot as plt

# One entry per bar group:
# (legend label, tables the counts were taken from, number of affected rows).
missing_value_groups = [
    ("Null Usernames",
     ["members", "threads", "posts"],
     [len(none_members_users), len(none_threads_users), len(none_posts_users)]),
    ("Null Thread_title",
     ["threads", "posts"],
     [len(none_threads_title), len(none_posts_threads_title)]),
    ("Null Post content",
     ["posts"],
     [len(none_posts_content)]),
]

# plt.subplots() keeps the default margins. The previous add_axes([0, 0, 1, 1])
# stretched the axes over the whole figure, which leaves the title, y-label and
# tick labels outside the canvas.
fig, ax = plt.subplots()

tick_positions = []
tick_labels = []
for group_label, tables, counts in missing_value_groups:
    # Explicit numeric x positions let the same table name appear in several
    # groups, instead of padding category strings with trailing spaces to keep
    # them unique.
    first_slot = len(tick_positions)
    positions = list(range(first_slot, first_slot + len(tables)))

    # One bar() call per group, so each group gets its own colour and legend entry.
    ax.bar(positions, counts, label=group_label)

    # Write the count just above each bar, centred on it.
    for x_pos, count in zip(positions, counts):
        ax.text(x_pos, count, str(count), ha="center", va="bottom")

    tick_positions.extend(positions)
    tick_labels.extend(tables)

ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)
ax.legend()
ax.set_title("Missing values per table")
ax.set_ylabel("Number of missing values")
plt.show()