In [1]:
import pandas as pd
import json
import os

# Directory (relative to the working directory) that holds the cluster JSON files read below.
GRAPH_DIR = "./Graphs"

In [2]:
# Load the clusters stored in tfidf_core_clusts.json.
# The encoding is passed explicitly so parsing does not depend on the platform's
# locale default (JSON text is UTF-8).
with open(os.path.join(GRAPH_DIR, "tfidf_core_clusts.json"), encoding="utf-8") as curr_f:
    tfidf_core_clusts = json.load(curr_f)

In [3]:
# Load the clusters stored in pos_tri_core_clusts.json.
# The encoding is passed explicitly so parsing does not depend on the platform's
# locale default (JSON text is UTF-8).
with open(os.path.join(GRAPH_DIR, "pos_tri_core_clusts.json"), encoding="utf-8") as curr_f:
    postri_core_clusts = json.load(curr_f)

In [4]:
# Load the clusters stored in tfidf_t20_clusts.json.
# The encoding is passed explicitly so parsing does not depend on the platform's
# locale default (JSON text is UTF-8).
with open(os.path.join(GRAPH_DIR, "tfidf_t20_clusts.json"), encoding="utf-8") as curr_f:
    tfidf_t20_clusts = json.load(curr_f)

In [5]:
# Load the clusters stored in pos_tri_t20_clusts.json.
# The encoding is passed explicitly so parsing does not depend on the platform's
# locale default (JSON text is UTF-8).
with open(os.path.join(GRAPH_DIR, "pos_tri_t20_clusts.json"), encoding="utf-8") as curr_f:
    postri_t20_clusts = json.load(curr_f)

In [6]:
# Every user listed in either of the two TF-IDF ("bow") core clusters.
core_users = tfidf_core_clusts["Cluster 0"] + tfidf_core_clusts["Cluster 1"]

# Sets give O(1) membership tests; scanning the raw lists for every user made
# the table construction quadratic in the number of users.
bow_core_cluster_1 = set(tfidf_core_clusts["Cluster 1"])
postri_core_cluster_1 = set(postri_core_clusts["Cluster 1"])

# De-duplicate while keeping first-seen order, so the rows match what a
# dict keyed by user id would produce.
core_user_index = list(dict.fromkeys(core_users))

# One row per user; each column is 1 when the user is in that clustering's
# "Cluster 1" and 0 otherwise.
# NOTE(review): a user missing from the PoS-trigram file altogether is also
# recorded as 0 -- confirm both JSON files cover the same users.
core_clusters = pd.DataFrame(
    {
        "bow": [int(u in bow_core_cluster_1) for u in core_user_index],
        "pos_tri": [int(u in postri_core_cluster_1) for u in core_user_index],
    },
    index=core_user_index,
)

In [7]:
# Show the per-user 0/1 cluster labels for both clusterings.
core_clusters

Unnamed: 0,bow,pos_tri
1.0,0,1
2.0,0,0
4.0,0,0
14.0,0,0
16.0,0,0
...,...,...
2022.0,1,1
2023.0,1,1
2105.0,1,0
2114.0,1,1


In [8]:
# Every user listed in either of the two TF-IDF ("bow") top-20 clusters.
t20_users = tfidf_t20_clusts["Cluster 0"] + tfidf_t20_clusts["Cluster 1"]

# Sets give O(1) membership tests instead of a list scan per user.
bow_t20_cluster_1 = set(tfidf_t20_clusts["Cluster 1"])
postri_t20_cluster_1 = set(postri_t20_clusts["Cluster 1"])

# De-duplicate while keeping first-seen order, so the rows match what a
# dict keyed by user id would produce.
t20_user_index = list(dict.fromkeys(t20_users))

# One row per user; each column is 1 when the user is in that clustering's
# "Cluster 1" and 0 otherwise.
# NOTE(review): a user missing from the PoS-trigram file altogether is also
# recorded as 0 -- confirm both JSON files cover the same users.
t20_clusters = pd.DataFrame(
    {
        "bow": [int(u in bow_t20_cluster_1) for u in t20_user_index],
        "pos_tri": [int(u in postri_t20_cluster_1) for u in t20_user_index],
    },
    index=t20_user_index,
)

In [9]:
# Show the per-user 0/1 cluster labels for the top-20 users.
t20_clusters

Unnamed: 0,bow,pos_tri
2.0,0,0
16.0,0,1
32.0,0,0
38.0,0,1
99.0,0,1
464.0,0,0
709.0,0,0
775.0,0,0
843.0,0,0
1206.0,0,0


In [10]:
# Contingency table for the core users: BoW cluster label on the rows,
# PoS-trigram cluster label on the columns.
clust_comparison = pd.crosstab(
    index=core_clusters["bow"],
    columns=core_clusters["pos_tri"],
    rownames=["BoW"],
    colnames=["PosTri"],
)

In [11]:
# Show the BoW vs. PoS-trigram contingency table.
clust_comparison

PosTri,0,1
BoW,Unnamed: 1_level_1,Unnamed: 2_level_1
0,57,112
1,85,43


In [12]:
# Number of core users carrying each BoW cluster label.
core_clusters.groupby("bow").size()

bow
0    169
1    128
dtype: int64

In [13]:
# Number of core users carrying each PoS-trigram cluster label.
core_clusters.groupby("pos_tri").size()

pos_tri
0    142
1    155
dtype: int64

## Looking at Moderators

In [14]:
import sys
sys.path.insert(1, "../")
from settings import TFES_FP as DB_FP

In [15]:
import sqlite3

# Read the `position` column of the users table as a Series indexed by `uid`.
conn = sqlite3.connect(DB_FP)
try:
    positions = pd.read_sql_query(
        "SELECT uid, position FROM users", conn, index_col="uid"
    )["position"]
finally:
    # Release the connection even when the query raises.
    conn.close()

In [16]:
# Keep only the users whose position is not missing.
has_position = positions.notna()
moderators = positions.loc[has_position]

In [43]:
# How many users have a non-missing position.
moderators.shape

(12,)

In [17]:
# List each user with a non-missing position next to the position text.
moderators

uid
1                Administrator
2             Planar Moderator
3             Planar Moderator
7                Administrator
16                   Purgatory
21            Planar Moderator
37            Planar Moderator
38      Zetetic Council Member
58            Planar Moderator
61      Zetetic Council Member
130                  Purgatory
1804          Planar Moderator
Name: position, dtype: object

In [18]:
# 1 for every user with a non-missing position, 0 otherwise. The vectorized
# cast replaces an element-wise apply; int64 is named explicitly so the dtype
# is the same on every platform.
# NOTE(review): any non-missing position counts as "moderator", including
# values such as 'Purgatory' -- confirm that this is intended.
is_moderator = positions.notna().astype("int64")

# The cluster tables are keyed by float user ids, so cast the index to float
# as well; label lookups and crosstab alignment then line up.
is_moderator.index = is_moderator.index.astype(float)

### For Core Clusters

In [48]:
# Moderator flag (rows) against BoW cluster label (columns) for the core users.
clust_comparison = pd.crosstab(
    index=is_moderator,
    columns=core_clusters["bow"],
    rownames=["is moderator"],
    colnames=["BoW Clust"],
)
clust_comparison

BoW Clust,0,1
is moderator,Unnamed: 1_level_1,Unnamed: 2_level_1
0,161,125
1,8,3


In [47]:
# Moderator flag (rows) against PoS-trigram cluster label (columns) for the core users.
clust_comparison = pd.crosstab(
    index=is_moderator,
    columns=core_clusters["pos_tri"],
    rownames=["is moderator"],
    colnames=["PoS-Tri Clust"],
)
clust_comparison

PoS-Tri Clust,0,1
is moderator,Unnamed: 1_level_1,Unnamed: 2_level_1
0,133,153
1,9,2


### For Top 20 Clusters

In [21]:
# Moderator flags restricted to (and ordered like) the rows of the top-20 cluster table.
t20_user_ids = t20_clusters.index
t20_is_mod = is_moderator.loc[t20_user_ids]

In [22]:
# Moderator flag (rows) against BoW cluster label (columns) for the top-20 users.
clust_comparison = pd.crosstab(
    index=t20_is_mod,
    columns=t20_clusters["bow"],
    rownames=["is moderator"],
    colnames=["BoW Clust"],
)
clust_comparison

BoW Clust,0,1
is moderator,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,9
1,4,0


In [23]:
# Moderator flag (rows) against PoS-trigram cluster label (columns) for the
# top-20 users. The column axis is labelled "PoS-Tri Clust" because the
# columns come from `pos_tri`; the previous "BoW Clust" label was a
# copy-paste slip from the BoW table.
clust_comparison = pd.crosstab(
    t20_is_mod,
    [t20_clusters["pos_tri"]],
    rownames=["is moderator"],
    colnames=["PoS-Tri Clust"],
)
clust_comparison

BoW Clust,0,1
is moderator,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,9
1,2,2


# Loading post data from the DB so users can be listed in order of number of posts

In [24]:
sys.path.insert(1, "../utilities")
from helpers import flat_earth_boards

In [25]:
# Echo the database path imported from settings.
# NOTE(review): the rendered output exposes an absolute local path; consider
# clearing this cell's output before sharing the notebook.
DB_FP

'C:/Users/Eddie/Documents/Thesis Code/fe_data/tfes_forum_anon.db'

In [26]:
# One row per post: post id, author id, timestamp, and the id of the board
# reached through the post's topic.
sql_query = (
    "SELECT p.uid as uid, p.user as user, p.time as time, b.uid as board\n"
    "FROM posts as p\n"
    "INNER JOIN topics as t\n"
    "ON t.uid = p.topic\n"
    "INNER JOIN boards as b\n"
    "ON b.uid= t.board;"
)

In [27]:
import sqlite3

# Run the post query; rows are indexed by post `uid` and the `time` column is
# parsed with the given strptime format.
conn = sqlite3.connect(DB_FP)
try:
    posts = pd.read_sql_query(
        sql_query,
        conn,
        index_col="uid",
        parse_dates={"time": "%Y/%m/%d %H:%M:%S"},
    )
finally:
    # Release the connection even when the query or date parsing raises.
    conn.close()

In [28]:
# Keep only the posts whose board id appears in `flat_earth_boards`.
on_fe_board = posts["board"].isin(flat_earth_boards)
fe_posts = posts[on_fe_board]

In [29]:
# Preview the posts kept by the board filter.
fe_posts

Unnamed: 0_level_0,user,time,board
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
40,20.0,2013-12-01 18:43:04,4
192,22.0,2013-12-02 01:43:59,3
195,21.0,2013-12-02 01:47:24,3
198,20.0,2013-12-02 01:59:15,4
203,20.0,2013-12-02 02:37:29,5
...,...,...,...
126194,1832.0,2020-10-08 17:09:56,4
126195,2283.0,2020-10-08 17:27:43,4
126196,2259.0,2020-10-08 19:11:04,4
126199,2283.0,2020-10-08 21:01:07,4


In [30]:
# Count the filtered posts per author, then order authors from most to fewest posts.
fe_post_counts = fe_posts.groupby("user").size()
posts_per_user = fe_post_counts.sort_values(ascending=False)

In [31]:
# User ids of the 20 authors with the most posts in `fe_posts`.
posts_per_user.head(20).index

Float64Index([  38.0,    2.0, 1206.0, 1342.0, 1804.0,   32.0,  315.0,  775.0,
              1584.0,   16.0, 1448.0,   62.0, 1500.0,  706.0, 1423.0,  709.0,
               843.0,  376.0,  464.0,   99.0],
             dtype='float64', name='user')

In [32]:
# Check, element by element, that each of those 20 authors is in `t20_users`.
posts_per_user.head(20).index.isin(t20_users)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [33]:
# Show the top-20 cluster labels again before building the summary table.
t20_clusters

Unnamed: 0,bow,pos_tri
2.0,0,0
16.0,0,1
32.0,0,0
38.0,0,1
99.0,0,1
464.0,0,0
709.0,0,0
775.0,0,0
843.0,0,0
1206.0,0,0


In [34]:
# Empty summary table whose rows are the 20 most active authors, most posts first.
top_20_user_ids = posts_per_user.index[:20]
clust_table = pd.DataFrame(index=top_20_user_ids)

In [35]:
# Copy both cluster labels into the summary table; assignment aligns on user id.
for column_label, source_column in (("PoS-Tri", "pos_tri"), ("BoW", "bow")):
    clust_table[column_label] = t20_clusters[source_column]

In [36]:
# Hand-entered label per user id for the 20 most active authors.
# NOTE(review): 'FE' / 'RE' / 'UNK' presumably mean flat-earth / round-earth /
# unknown -- confirm against the labelling notes.
FE_belief = pd.Series(
    {
        38: "FE",
        2: "FE",
        1206: "RE",
        1342: "RE",
        1804: "FE",
        32: "UNK",
        315: "RE",
        775: "RE",
        1584: "RE",
        16: "RE",
        1448: "RE",
        62: "RE",
        1500: "UNK",
        706: "RE",
        1423: "FE",
        709: "RE",
        843: "RE",
        376: "RE",
        464: "RE",
        99: "RE",
    }
)

# Attach the labels to the summary table; assignment aligns on user id.
clust_table["Belief"] = FE_belief

In [40]:
# Moderator flag for the top-20 users as booleans. The vectorized cast
# replaces an element-wise apply(bool).
clust_table["Moderator"] = is_moderator.loc[t20_users].astype(bool)

In [41]:
# Show the 0/1 moderator flag for each user in `t20_users`.
is_moderator.loc[t20_users]

uid
2.0       1
16.0      1
32.0      0
38.0      1
99.0      0
464.0     0
709.0     0
775.0     0
843.0     0
1206.0    0
1804.0    1
62.0      0
315.0     0
376.0     0
706.0     0
1342.0    0
1423.0    0
1448.0    0
1500.0    0
1584.0    0
Name: position, dtype: int64

In [42]:
# Show the summary table: cluster labels, belief label and moderator flag per user.
clust_table

Unnamed: 0_level_0,PoS-Tri,BoW,Belief,Moderator
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38.0,1,0,FE,True
2.0,0,0,FE,True
1206.0,0,0,RE,False
1342.0,1,1,RE,False
1804.0,0,0,FE,True
32.0,0,0,UNK,False
315.0,1,1,RE,False
775.0,0,0,RE,False
1584.0,1,1,RE,False
16.0,1,0,RE,True


In [45]:
# Percentage of the posts in `fe_posts` written by users in `t20_users`.
fe_by_t20 = fe_posts["user"].isin(t20_users)
len(fe_posts[fe_by_t20]) / len(fe_posts) * 100

41.830356501372265

In [46]:
# Percentage of all loaded posts written by users in `t20_users`.
all_by_t20 = posts["user"].isin(t20_users)
len(posts[all_by_t20]) / len(posts) * 100

31.513470681458006