In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Obtain data from Austin's S3 bucket
url = 'https://austin-schaffer.s3.amazonaws.com/virginia-town-hall/scraped-public-comments/2022+Virginia+Public+Schools+Model+Policy+Public+Comments.csv'
comments = pd.read_csv(url, delimiter='|')

In [3]:
# Append the titles and context of the comment together, and remove unicode symbols
comments['doc_total'] = [str(i).replace('\xa0','')+str(j).replace('\xa0','') for i, j in zip(comments['doc_title'], comments['doc_content'])]

In [4]:
# Convert each comment to a TF-IDF vector (see this page for definition of TF-IDF: https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/)
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(comments['doc_total']) 

In [5]:
# Calculate cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity) between every pairwise combo of comments' TF-IDF vectors. This gives us a symmetric similarity matrix:
sim = tfidf * tfidf.T

In [6]:
# Set a baseline minimum threshold similarity (min=0, max=1, so here let's use .9) to say that two comments are exact matches
# (In other words, if similarity <=.9, set it equal to 0)
sim.data[sim.data < .9] = 0

In [7]:
# Keep only the non-zero similarities, and convert to a dataframe of index-pairs
nonzeros = sim.nonzero()
nonzero_df = pd.DataFrame({'index1': nonzeros[0].tolist(),
                           'index2': nonzeros[1].tolist()})
nonzero_df = nonzero_df.query("index1 != index2")

In [83]:
# Kropko's makeshift community detector algorithm:
# Because while loops are awesome!
# 1) Get the frequency counts for index1, sorted from most to least frequent
# 2) Initialize groups, which will contain a list of lists of indexes of the same repeated comments; 
    # also initialize df to be nonzero_df, and length = # of unique values of index 1
# 3) While the length of the list in (1) is > 0:
# a) take the most frequent remaining value of index 1 (v)
# b) filter df to the rows where index1==v
# c) extract the values of index2 and create a list of these values plus v
# d) add this list to groups
# e) filter out the rows from (b) from df
# f) recalculate the frequencies of index1, and set length equal to the length of this list

vc = nonzero_df['index1'].value_counts()
length = len(vc)
df = nonzero_df
groups = []
while length > 0:
    print(length)
    v = vc.index[0]
    df2 = df.query(f"index1 == {v}")
    group = [v] + df2['index2'].tolist()
    groups = groups + [group]
    df = df.query("index1 not in @group")
    vc = df['index1'].value_counts()
    length = len(vc)

38436
34017
29790
27050
25510
24147
22994
21927
20862
19899
19074
18257
17448
16645
15845
15049
15045
15044
14272
13513
13056
12722
12487
12171
11981
11916
11914
11907
11781
11624
11486
11356
11239
11139
11041
10945
10853
10766
10682
10605
10535
10468
10401
10336
10273
10272
10216
10181
10176
10127
10081
10036
9992
9948
9904
9861
9817
9774
9732
9691
9651
9611
9572
9567
9532
9518
9485
9476
9443
9411
9379
9348
9317
9286
9255
9224
9194
9164
9134
9105
9076
9047
9018
8989
8961
8933
8905
8877
8849
8821
8794
8767
8740
8713
8686
8659
8632
8605
8578
8551
8542
8516
8490
8464
8438
8412
8386
8360
8334
8308
8282
8256
8255
8249
8223
8197
8172
8147
8122
8097
8072
8047
8022
7997
7972
7947
7922
7897
7895
7871
7847
7823
7799
7775
7751
7727
7703
7679
7655
7631
7607
7583
7559
7535
7511
7487
7463
7439
7416
7393
7370
7347
7324
7301
7278
7255
7232
7209
7186
7163
7140
7117
7094
7071
7048
7025
7017
6994
6991
6989
6966
6943
6921
6899
6877
6855
6833
6811
6789
6767
6745
6723
6701
6679
6657
6635
6613
6591
6569
654

In [9]:
# Demonstrating that the code works
comments['doc_total'][groups[0]]
comments['doc_total'][groups[1]]
comments['doc_total'][groups[2]]
comments['doc_total'][groups[3]]

63439    I fully oppose this.  I fully oppose this. 
69982    I fully oppose this.  I fully oppose this. 
69980    I oppose this fully.  I oppose this fully. 
69971    I fully oppose this.  I fully oppose this. 
69966    I oppose this fully.  I oppose this fully. 
                            ...                     
4087     I oppose this fully.  I oppose this fully. 
3769     I oppose this fully.  I oppose this fully. 
1887     I oppose this fully.  I oppose this fully. 
1717     I fully oppose this.  I fully oppose this. 
1548     I oppose this fully.  I oppose this fully. 
Name: doc_total, Length: 1540, dtype: object

In [21]:
comments['doc_total'][groups[1626]]

71174    Strongly Oppose  I strongly oppose Governor Yo...
71178    Strongly oppose  I strongly oppose Governor Yo...
Name: doc_total, dtype: object

In [28]:
print(comments['doc_total'][groups[0][0]])

Leave Trans Kids Alone  This will hurt kids. Don't be mean, Youngkin! 


In [29]:
print(comments['doc_total'][groups[1][0]])

Strongly Oppose  This will hurt kids. Don't be mean, Youngkin! 


[1, 2, 3, 4, 5, 6]


In [84]:
g = []
for h in groups:
    g.extend(h)

In [85]:
len(g)

41788

In [87]:
len(set(g))

38436

In [88]:
len(groups)

1627

1. Make a dataframe of group no. and index
2. Join with the comments
3. Calculate standard deviation of timestamps

In [101]:
i =0
group_df = pd.DataFrame()
for g in groups:
    df = pd.DataFrame({'group_number': i,
                      'index': g})
    group_df = pd.concat([group_df, df])
    i += 1

In [103]:
group_df.to_csv('group_df.csv')

In [111]:
comments['doc_date'] = pd.to_datetime(comments['doc_date'])

In [117]:
ind = group_df.query("group_number==0")['index']

In [131]:
deltas = comments.loc[ind,:].sort_values('doc_date')['doc_date'] - comments.loc[ind,:].sort_values('doc_date')['doc_date'].shift(1)

In [133]:
deltas.max()

Timedelta('0 days 00:03:00')