In [1]:
import pandas as pd
import sys, os

# Put the parent directory on the import path (once) so that modules living
# one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path += [module_path]

In [2]:
from crimebb import *

In [3]:
# Root folder of the dataset and the sub-folder holding the CSV exports.
# Both keep their trailing slash because file names are appended directly.
DATA_PATH = "../data/"
CSV_PATH = DATA_PATH + "csv/"

### Exploring data

#### members

In [None]:
# Read the members export and discard rows that are exact duplicates.
members_df = (
    pd.read_csv(f"{CSV_PATH}members.csv", sep=",", low_memory=False)
    .drop_duplicates()
)
members_df

In [None]:
# Show the row labelled 1 vertically, to see every field of a single record.
members_df.loc[1]

In [None]:
# Keep the profile columns of interest, drop rows whose username is missing or
# is the literal text "none" (any casing), and expose the key as user_id.
members_df = (
    members_df[["id", "username", "site_id", "age", "location", "total_posts", "reputation", "prestige"]]
    .drop_duplicates()
    .loc[lambda frame: frame["username"].notna() & (frame["username"].str.lower() != "none")]
    .drop_duplicates()
    .rename(columns={"id": "user_id"})
)
members_df

#### boards

In [None]:
# Load the boards export, normalise the antichat domain and derive each
# board's host name from its URL.
boards_df = pd.read_csv(f"{CSV_PATH}boards.csv", sep=",", low_memory=False)

# Rewrite antichat.com URLs to forum.antichat.ru (plain substring replacement).
boards_df["url"] = boards_df["url"].str.replace("antichat.com", "forum.antichat.ru", regex=False)

# site_name is the host part of the URL: strip a *leading* http:// or https://
# and keep everything up to the first "/". Anchoring on the prefix, instead of
# testing whether "https" occurs anywhere in the URL, prevents an http:// URL
# whose path merely contains "https" from being reduced to "http:". The .str
# accessors also pass a missing URL through as NaN instead of raising.
boards_df["site_name"] = (
    boards_df["url"]
    .str.replace(r"^https?://", "", regex=True)
    .str.split("/")
    .str[0]
)
boards_df = boards_df.drop_duplicates()
boards_df

In [None]:
# Show the row labelled 1 vertically, to see every field of a single board.
boards_df.loc[1]

In [None]:
# Reduce boards to the identifying columns and give id/name board-specific names.
boards_df = (
    boards_df[["id", "site_id", "site_name", "name", "url"]]
    .drop_duplicates()
    .rename(columns={"id": "board_id", "name": "board_title"})
)
boards_df

#### contracts

In [None]:
# Read the contracts export and discard rows that are exact duplicates.
contracts_df = (
    pd.read_csv(f"{CSV_PATH}contracts.csv", sep=",", low_memory=False)
    .drop_duplicates()
)
contracts_df

#### Threads

In [None]:
# Read the threads export and discard rows that are exact duplicates.
threads_df = (
    pd.read_csv(f"{CSV_PATH}threads.csv", sep=",", low_memory=False)
    .drop_duplicates()
)
threads_df

In [None]:
# Show the row labelled 1 vertically, to see every field of a single thread.
threads_df.loc[1]

In [None]:
# Keep the identifying thread columns, rename them to the names shared with the
# other tables, then drop threads whose creator name is missing or the literal
# text "none" (any casing).
threads_df = (
    threads_df[["id", "site_id", "board_id", "creator", "creator_id", "name", "url"]]
    .drop_duplicates()
    .rename(columns={"creator": "username", "id": "thread_id", "creator_id": "user_id", "name": "thread_title"})
    .loc[lambda frame: frame["username"].notna() & (frame["username"].str.lower() != "none")]
    .drop_duplicates()
)
threads_df

#### Posts

In [None]:
# Read the posts export and discard rows that are exact duplicates.
posts_df = (
    pd.read_csv(f"{CSV_PATH}posts.csv", sep=",", low_memory=False)
    .drop_duplicates()
)
posts_df

In [None]:
# Show the row labelled 1 vertically, to see every field of a single post.
posts_df.loc[1]

#### votes

In [None]:
# Read the votes export and discard rows that are exact duplicates.
votes_df = (
    pd.read_csv(f"{CSV_PATH}votes.csv", sep=",", low_memory=False)
    .drop_duplicates()
)
votes_df

In [None]:
# Show the row labelled 1 vertically, to see every field of a single vote.
votes_df.loc[1]

### Boards

In [None]:
# Count the distinct (site_name, board_title) pairs of every site, largest
# first. The result is indexed by site_name with one column, board_title,
# holding the count.
topics_website_df = (
    boards_df[["site_name", "board_title"]]
    .drop_duplicates()
    .pivot_table(index=["site_name"], values=["board_title"], aggfunc={"board_title": len})
    .sort_values(by="board_title", ascending=False)
    .reset_index(level=0)
    .set_index("site_name")
)
topics_website_df

In [None]:
# Bar chart of the number of boards ("topicos") per site; the mean count
# across sites is shown in the title.
mean_val = topics_website_df["board_title"].mean()

ax = topics_website_df.plot(
    kind="bar",
    figsize=(16, 8),
    title=f"Número de topicos por website, Avg: {mean_val}",
    xlabel="Site name",
    # The bars count boards per site, so the y axis is labelled accordingly
    # (it previously read "usuarios").
    ylabel="Num topicos",
    rot=45,
    # Single series: skip the legend instead of creating and then removing it.
    legend=False,
)

### Boards x Members

In [None]:
# Attach the site name to every distinct (username, site_id) pair. The left
# join keeps members whose site_id has no matching row in boards_df.
member_website = (
    members_df[["username", "site_id"]]
    .drop_duplicates()
    .merge(
        boards_df[["site_id", "site_name"]].drop_duplicates(),
        on="site_id",
        how="left",
    )
    .drop_duplicates()
)
member_website

#### username x website

In [None]:
# For every username, count the member_website rows it appears in (one per
# site it is paired with), sorted from most to fewest, with username back as
# a regular column.
users_sites_df = (
    member_website
    .pivot_table(index=["username"], values=["site_name"], aggfunc={"site_name": len})
    .sort_values(by="site_name", ascending=False)
    .reset_index(level=0)
    .rename(columns={"site_name": "num_websites_cadastrado"})
)
users_sites_df

In [None]:
# Size of each num_websites_cadastrado group, i.e. how many usernames share
# the same number of registered websites.
users_sites_ = users_sites_df.pivot_table(
    columns=["num_websites_cadastrado"],
    aggfunc="size",
)
users_sites_

In [None]:
# Bar chart of how many users are registered on 1, 2, 3, ... websites.
# NOTE(review): the "Avg" in the title is users_sites_.mean(), which looks like
# the mean of the per-bucket user counts rather than the mean number of
# websites per user — confirm that this is the intended figure.
ax = users_sites_.plot(
    kind="bar",
    figsize=(16, 8),
    title=f"Número de usuarios cadastrado em website, Avg: {users_sites_.mean()}",
    xlabel="Quantidade de websites cadastrado",
    ylabel="Num de usuarios",
    rot=0,
)

#### site x username

In [None]:
# Count the member_website rows of every site (one per username paired with
# it), largest first. The result is indexed by site_name with one column,
# username, holding the count.
site_users_df = (
    member_website
    .pivot_table(index=["site_name"], values=["username"], aggfunc={"username": len})
    .sort_values(by="username", ascending=False)
    .reset_index(level=0)
    .set_index("site_name")
)
site_users_df

In [None]:
# Bar chart of the number of users per site; the mean count across sites is
# shown in the title.
mean_val = site_users_df["username"].mean()

ax = site_users_df.plot(
    kind="bar",
    figsize=(16, 8),
    title=f"Número de usuarios por website, Avg: {mean_val}",
    xlabel="Site name",
    ylabel="usuarios",
    rot=45,
)
# Only one series is plotted, so the legend is dropped.
ax.get_legend().remove()

### Threads

#### threads x website

In [None]:
# Attach the site name to every distinct (thread_id, site_id) pair. The left
# join keeps threads whose site_id has no matching row in boards_df.
thread_website = (
    threads_df[["thread_id", "site_id"]]
    .drop_duplicates()
    .merge(
        boards_df[["site_id", "site_name"]].drop_duplicates(),
        on="site_id",
        how="left",
    )
    .drop_duplicates()
)
thread_website

In [None]:
# Count the thread_website rows of every site, largest first. The result is
# indexed by site_name with one column, thread_id, holding the count.
site_threads_df = (
    thread_website
    .pivot_table(index=["site_name"], values=["thread_id"], aggfunc={"thread_id": len})
    .sort_values(by="thread_id", ascending=False)
    .reset_index(level=0)
    .set_index("site_name")
)
site_threads_df

In [None]:
# Bar chart of the number of threads per site; the mean count across sites is
# shown in the title.
mean_val = site_threads_df["thread_id"].mean()

ax = site_threads_df.plot(
    kind="bar",
    figsize=(16, 8),
    title=f"Número de threads por website, Avg: {mean_val}",
    xlabel="Site name",
    ylabel="Num threads",
    rot=45,
)
# Only one series is plotted, so the legend is dropped.
ax.get_legend().remove()

### Threads x Boards

In [None]:
# Deliberately halt a top-to-bottom run at this point. sys.exit() raises
# SystemExit; the os module provides no exit() function, so calling os.exit()
# fails with AttributeError instead of stopping cleanly.
sys.exit()

In [None]:
# Distinct (thread_id, site_id, board_id) combinations present in threads_df.
threads_df.loc[:, ["thread_id", "site_id", "board_id"]].drop_duplicates()

In [None]:
# For every (thread_id, site_id) pair, count how many distinct board_id rows
# it is associated with.
thread_board_df = (
    threads_df[["thread_id", "site_id", "board_id"]]
    .drop_duplicates()
    .pivot_table(index=["thread_id", "site_id"], values=["board_id"], aggfunc={"board_id": len})
)
thread_board_df

### Threads x Members

In [None]:
# Number of distinct threads created by each (username, site_id) pair, sorted
# from the most to the least prolific. The pair stays in the index and the
# count is stored in the thread_id column.
thread_member_df = (
    threads_df[["thread_id", "site_id", "username"]]
    .drop_duplicates()
    .pivot_table(index=["username", "site_id"], values=["thread_id"], aggfunc={"thread_id": len})
    .sort_values(by="thread_id", ascending=False)
)
thread_member_df

In [None]:
# Distribution of thread counts: how many (username, site_id) pairs created
# each number of threads. thread_member_df's only column is "thread_id" (the
# per-pair thread count) — it has no "user_id" column, so grouping on that name
# raises KeyError. Group on the count column, as the users-per-website
# distribution does with its own count column.
thread_member_ = thread_member_df.pivot_table(columns=["thread_id"], aggfunc="size")
thread_member_