In [2]:
import os

import pandas as pd

## File Cleaning


[Output Data Model](https://dbdiagram.io/d/64594dd7dca9fb07c4b7c4db)


### Raw Comments


In [8]:
# Pipe-delimited CSV of the scraped public comments, hosted on S3.
SCRAPED_COMMENTS_S3_URL = (
    "https://austin-schaffer.s3.amazonaws.com/virginia-town-hall/scraped-public-comments/"
    "2022+Virginia+Public+Schools+Model+Policy+Public+Comments.csv"
)
comments_raw = pd.read_csv(SCRAPED_COMMENTS_S3_URL, sep="|")

# Report the row count, then preview the first rows.
print(f"Length: {len(comments_raw.index)}")
comments_raw.head()

Length: 71297


Unnamed: 0,doc_key,doc_label,doc_title,doc_content,doc_original,doc_date,doc_author
0,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Transgender,I am strongly opposed to any law that would ...,<!DOCTYPE html> <p>I am strongly opposed to ...,10/27/22 3:07 am,Frank McCullough
1,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"OPPOSE - this will cause actual, real, death o...","These kids are valid in their identities, ta...",<!DOCTYPE html> <p>These kids are valid in t...,10/27/22 1:55 am,an APS student
2,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"Consequences of Youngkin’s Policy Dire, Immediate",Gov. Youngkin's model policies regarding tra...,<!DOCTYPE html> <p>Gov. Youngkin's model pol...,10/27/22 12:06 am,"Martha Molinaro, George Mason University"
3,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Strongly Oppose,I strongly oppose the non-evidence based pol...,<!DOCTYPE html> <p>I <strong>strongly oppose...,10/27/22 12:04 am,Anonymous (203044)
4,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Opposed,"""Parents"" put a private equity sellout in ch...","<!DOCTYPE html> <p>""Parents"" put a private e...",10/27/22 12:02 am,Anonymous (203041)


In [9]:
# Raw column name -> column name used in the output data model.
name_map = {
    "doc_key": "url",
    "doc_label": "label",
    "doc_title": "title",
    "doc_content": "content",
    "doc_original": "html_raw",
    "doc_date": "created_at",
    "doc_author": "author",
}

# Rename the columns and parse the timestamp strings into datetimes in one chain.
comments = comments_raw.rename(columns=name_map).assign(
    created_at=lambda frame: pd.to_datetime(frame["created_at"])
)

comments.head()

Unnamed: 0,url,label,title,content,html_raw,created_at,author
0,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Transgender,I am strongly opposed to any law that would ...,<!DOCTYPE html> <p>I am strongly opposed to ...,2022-10-27 03:07:00,Frank McCullough
1,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"OPPOSE - this will cause actual, real, death o...","These kids are valid in their identities, ta...",<!DOCTYPE html> <p>These kids are valid in t...,2022-10-27 01:55:00,an APS student
2,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"Consequences of Youngkin’s Policy Dire, Immediate",Gov. Youngkin's model policies regarding tra...,<!DOCTYPE html> <p>Gov. Youngkin's model pol...,2022-10-27 00:06:00,"Martha Molinaro, George Mason University"
3,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Strongly Oppose,I strongly oppose the non-evidence based pol...,<!DOCTYPE html> <p>I <strong>strongly oppose...,2022-10-27 00:04:00,Anonymous (203044)
4,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Opposed,"""Parents"" put a private equity sellout in ch...","<!DOCTYPE html> <p>""Parents"" put a private e...",2022-10-27 00:02:00,Anonymous (203041)


Extract comment IDs from URLs

- Easier to work with than the URLs
- Helpful for de-duping the actual comments (the original file has duplicate rows)


In [10]:
# Captures the numeric comment ID at the end of a Town Hall comment URL.
comment_id_regex = r"^https://townhall.virginia.gov/L/viewcomments.cfm\?commentid=(\d+)$"

# expand=False returns the single capture group as a Series instead of a one-column frame.
comments["id"] = comments["url"].str.extract(comment_id_regex, expand=False)

comments.head()

Unnamed: 0,url,label,title,content,html_raw,created_at,author,id
0,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Transgender,I am strongly opposed to any law that would ...,<!DOCTYPE html> <p>I am strongly opposed to ...,2022-10-27 03:07:00,Frank McCullough,203047
1,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"OPPOSE - this will cause actual, real, death o...","These kids are valid in their identities, ta...",<!DOCTYPE html> <p>These kids are valid in t...,2022-10-27 01:55:00,an APS student,203046
2,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"Consequences of Youngkin’s Policy Dire, Immediate",Gov. Youngkin's model policies regarding tra...,<!DOCTYPE html> <p>Gov. Youngkin's model pol...,2022-10-27 00:06:00,"Martha Molinaro, George Mason University",203045
3,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Strongly Oppose,I strongly oppose the non-evidence based pol...,<!DOCTYPE html> <p>I <strong>strongly oppose...,2022-10-27 00:04:00,Anonymous (203044),203044
4,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Opposed,"""Parents"" put a private equity sellout in ch...","<!DOCTYPE html> <p>""Parents"" put a private e...",2022-10-27 00:02:00,Anonymous (203041),203041


In [11]:
# Combine title and body into a single text field.
#
# `title + " " + content` evaluates to NaN whenever either side is missing, so a
# plain `fillna("")` would blank the whole row and discard the part that is present.
# Fall back to the title alone, then the content alone, and only then to "".
comments["full_text"] = (
    (comments["title"] + " " + comments["content"])
    .fillna(comments["title"])
    .fillna(comments["content"])
    .fillna("")
    .str.replace("\xa0", "", regex=False)  # strip non-breaking spaces
)

comments.head()

Unnamed: 0,url,label,title,content,html_raw,created_at,author,id,full_text
0,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Transgender,I am strongly opposed to any law that would ...,<!DOCTYPE html> <p>I am strongly opposed to ...,2022-10-27 03:07:00,Frank McCullough,203047,Transgender I am strongly opposed to any law...
1,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"OPPOSE - this will cause actual, real, death o...","These kids are valid in their identities, ta...",<!DOCTYPE html> <p>These kids are valid in t...,2022-10-27 01:55:00,an APS student,203046,"OPPOSE - this will cause actual, real, death o..."
2,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"Consequences of Youngkin’s Policy Dire, Immediate",Gov. Youngkin's model policies regarding tra...,<!DOCTYPE html> <p>Gov. Youngkin's model pol...,2022-10-27 00:06:00,"Martha Molinaro, George Mason University",203045,"Consequences of Youngkin’s Policy Dire, Immedi..."
3,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Strongly Oppose,I strongly oppose the non-evidence based pol...,<!DOCTYPE html> <p>I <strong>strongly oppose...,2022-10-27 00:04:00,Anonymous (203044),203044,Strongly Oppose I strongly oppose the non-ev...
4,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Opposed,"""Parents"" put a private equity sellout in ch...","<!DOCTYPE html> <p>""Parents"" put a private e...",2022-10-27 00:02:00,Anonymous (203041),203041,"Opposed ""Parents"" put a private equity sello..."


### Duplicate Groups


Load group<->comment map


In [7]:
# Group <-> comment map; the first CSV column holds the saved frame index.
dupe_group_comments = pd.read_csv(
    "data/group_df.csv",
    index_col=0,
)

dupe_group_comments.head(5)

Unnamed: 0,group_number,index
0,0,34684
1,0,36840
2,0,36832
3,0,36831
4,0,36827


Replace the positional row index from the grouping output with the actual comment ID


In [8]:
# Translate each row position in the group map into the real comment ID.
# `comments` stores its IDs in the "id" column, so expose that column under the
# name "comment_id" for the merge; selecting a non-existent "comment_id" column
# from the merged frame would raise a KeyError.
dupe_group_comments = pd.merge(
    dupe_group_comments,
    comments[["id"]].rename(columns={"id": "comment_id"}),
    left_on="index",  # row position recorded by the grouping step
    right_index=True,  # matched against the row labels of `comments`
)[["group_number", "comment_id"]]

In [9]:
# Use the output data model's name for the group key.
dupe_group_comments = dupe_group_comments.rename(
    {"group_number": "group_id"},
    axis="columns",
)

dupe_group_comments.head()

Unnamed: 0,group_id,comment_id
0,0,165765
1,0,163606
2,0,163609
3,0,163613
4,0,163616


Double-check that the lookups work


In [10]:
# Sanity check: pull the comments belonging to group 3 by their comment IDs.
# `comments` keeps its IDs in the "id" column, so index on that column and label
# the index "comment_id" for display.
comments.set_index("id").rename_axis("comment_id").loc[
    dupe_group_comments.loc[dupe_group_comments["group_id"] == 3, "comment_id"]
].head(3)

Unnamed: 0_level_0,url,label,title,content,html_raw,created_at,author,full_text
comment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
136829,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,I fully oppose this.,I fully oppose this.,<!DOCTYPE html> <p>I fully oppose this.</p>,2022-09-26 16:55:00,Sidney Liesman,I fully oppose this. I fully oppose this.
130262,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,I fully oppose this.,I fully oppose this.,<!DOCTYPE html> <p>I fully oppose this.</p>,2022-09-26 12:41:00,Miah Leden,I fully oppose this. I fully oppose this.
130264,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,I oppose this fully.,I oppose this fully.,<!DOCTYPE html> <p>I oppose this fully.</p>,2022-09-26 12:42:00,Dayton Korach,I oppose this fully. I oppose this fully.


Attach a sample comment per group


In [11]:
# Take the first comment listed for each group as that group's representative.
dupe_groups = dupe_group_comments.groupby("group_id", as_index=False).first()

# `comments` still contains duplicate rows at this point and stores its IDs under
# "id". Join against a de-duplicated (id, full_text) lookup so that every group
# yields exactly one row; `validate` raises if that ever stops being true.
dupe_groups = pd.merge(
    dupe_groups,
    comments.drop_duplicates(subset="id")[["id", "full_text"]],
    left_on="comment_id",
    right_on="id",
    how="left",
    validate="many_to_one",
)

# Keep only the columns of the output data model.
dupe_groups = dupe_groups[["group_id", "full_text"]].rename(
    columns={
        "group_id": "id",
        "full_text": "sample_comment_text",
    }
)

dupe_groups.head()

Unnamed: 0,id,sample_comment_text
0,0,Leave Trans Kids Alone This will hurt kids. ...
1,1,Strongly Oppose This will hurt kids. Don't b...
2,2,Youngkin Hates This will hurt kids. Don't be...
3,3,I fully oppose this. I fully oppose this.
4,4,Glenn Youngkin Is A Bot This will hurt kids....


## Deduplicate the Comments

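Deduplication was deferred until now because the duplicate-group map references comments by their original row position, and some of those positions may point at the duplicate rows.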


In [13]:
# Keep only the first row seen for each comment ID.
comments = comments.drop_duplicates(subset="id", keep="first")

In [14]:
# Number of comments left after de-duplication.
comments.shape[0]

71277

### Sentiment Labels


Updated classifications


In [14]:
# Columns to read from the labeled-comments workbook.
columns = [
    "doc_key",
    "doc_title",
    "doc_date",
    "doc_author",
    "doc_total",
    "oppose_mean_similarity",
    "support_mean_similarity",
    "ambig_mean_similarity",
    "sentiment_mean_predict",
    "match_title",
    "match_key",
    "match_mean_sentiment",
]

labeled = pd.read_excel("data/labeledcomments_20230508.xlsx", usecols=columns)

# Report the row count, then preview the first rows.
print(f"Length: {len(labeled.index)}")
labeled.head()

Length: 71768


Unnamed: 0,doc_key,doc_title,doc_date,doc_author,doc_total,oppose_mean_similarity,support_mean_similarity,ambig_mean_similarity,sentiment_mean_predict,match_title,match_key,match_mean_sentiment
0,https://townhall.virginia.gov/L/viewcomments.c...,Transgender,10/27/22Â Â 3:07 am,Frank McCullough,Transgender I am strongly opposed to any law ...,0.433871,0.340971,0.225158,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
1,https://townhall.virginia.gov/L/viewcomments.c...,I strongly oppose this policy,10/26/22Â Â 10:35 pm,Anonymous (202557),I strongly oppose this policy This policy whi...,0.437536,0.307242,0.255222,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
2,https://townhall.virginia.gov/L/viewcomments.c...,Strongly Oppose,10/26/22Â Â 9:43 pm,Father of trans girl,Strongly Oppose Father of trans girl,0.761039,0.086021,0.15294,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
3,https://townhall.virginia.gov/L/viewcomments.c...,Strongly oppose,10/26/22Â Â 6:18 pm,Anonymous (201186),Strongly oppose Children should not be able t...,0.468895,0.338169,0.192935,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
4,https://townhall.virginia.gov/L/viewcomments.c...,SUPPORT 1000% It is NOT kind or LOVING to lie ...,10/26/22Â Â 4:53 pm,Honest parent for truth and biology,SUPPORT 1000% It is NOT kind or LOVING to lie ...,0.296143,0.354949,0.348908,Support,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed


In [15]:
# Pull the numeric comment ID out of each URL; rows whose URL does not match get NaN.
labeled["comment_id"] = labeled["doc_key"].str.extract(comment_id_regex, expand=False)

labeled.head()

Unnamed: 0,doc_key,doc_title,doc_date,doc_author,doc_total,oppose_mean_similarity,support_mean_similarity,ambig_mean_similarity,sentiment_mean_predict,match_title,match_key,match_mean_sentiment,comment_id
0,https://townhall.virginia.gov/L/viewcomments.c...,Transgender,10/27/22Â Â 3:07 am,Frank McCullough,Transgender I am strongly opposed to any law ...,0.433871,0.340971,0.225158,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed,203047
1,https://townhall.virginia.gov/L/viewcomments.c...,I strongly oppose this policy,10/26/22Â Â 10:35 pm,Anonymous (202557),I strongly oppose this policy This policy whi...,0.437536,0.307242,0.255222,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed,202557
2,https://townhall.virginia.gov/L/viewcomments.c...,Strongly Oppose,10/26/22Â Â 9:43 pm,Father of trans girl,Strongly Oppose Father of trans girl,0.761039,0.086021,0.15294,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed,202191
3,https://townhall.virginia.gov/L/viewcomments.c...,Strongly oppose,10/26/22Â Â 6:18 pm,Anonymous (201186),Strongly oppose Children should not be able t...,0.468895,0.338169,0.192935,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed,201186
4,https://townhall.virginia.gov/L/viewcomments.c...,SUPPORT 1000% It is NOT kind or LOVING to lie ...,10/26/22Â Â 4:53 pm,Honest parent for truth and biology,SUPPORT 1000% It is NOT kind or LOVING to lie ...,0.296143,0.354949,0.348908,Support,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed,200115


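**TODO:** Some `doc_key` values do not match the comment-URL pattern, so no `comment_id` could be extracted for them. Those rows are dropped below for now; this will need to be fixed.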


In [16]:
# Keep only the rows for which a comment ID was extracted.
labeled = labeled[labeled["comment_id"].notna()]

In [17]:
# Cast the extracted IDs from strings to integers.
# `labeled` is the result of a boolean filter, so assigning a column in place
# triggers pandas' SettingWithCopyWarning; `assign` builds a new frame instead.
labeled = labeled.assign(comment_id=labeled["comment_id"].astype(int))

In [18]:
# Row count after dropping rows without a comment ID.
labeled.shape[0]

71297

In [19]:
# Keep only the first classification row for each comment ID.
labeled = labeled.drop_duplicates(subset="comment_id", keep="first")

In [20]:
# Row count after de-duplication.
labeled.shape[0]

71277

In [21]:
# Columns kept for the classifications output.
needed_cols = [
    "comment_id",
    "oppose_mean_similarity",
    "support_mean_similarity",
    "ambig_mean_similarity",
    "sentiment_mean_predict",
    "match_title",
    "match_key",
    "match_mean_sentiment",
]

# Restrict the frame to those columns, in this order.
labeled = labeled[needed_cols]

labeled.head()

Unnamed: 0,comment_id,oppose_mean_similarity,support_mean_similarity,ambig_mean_similarity,sentiment_mean_predict,match_title,match_key,match_mean_sentiment
0,203047,0.433871,0.340971,0.225158,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
1,202557,0.437536,0.307242,0.255222,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
2,202191,0.761039,0.086021,0.15294,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
3,201186,0.468895,0.338169,0.192935,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
4,200115,0.296143,0.354949,0.348908,Support,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed


## Write Files


In [25]:
# Directory that receives the cleaned output CSVs.
path_prefix = "data/model"

# `DataFrame.to_csv` does not create missing directories, so make sure the
# output directory exists before any of the write cells run.
os.makedirs(path_prefix, exist_ok=True)

In [22]:
# Output column order: the ID first, then the renamed source columns, then the combined text.
comments = comments[["id"] + list(name_map.values()) + ["full_text"]]

In [23]:
# Preview the frame about to be written.
comments.head(n=2)

Unnamed: 0,id,url,label,title,content,html_raw,created_at,author,full_text
0,203047,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,Transgender,I am strongly opposed to any law that would ...,<!DOCTYPE html> <p>I am strongly opposed to ...,2022-10-27 03:07:00,Frank McCullough,Transgender I am strongly opposed to any law...
1,203046,https://townhall.virginia.gov/L/viewcomments.c...,not_applicable,"OPPOSE - this will cause actual, real, death o...","These kids are valid in their identities, ta...",<!DOCTYPE html> <p>These kids are valid in t...,2022-10-27 01:55:00,an APS student,"OPPOSE - this will cause actual, real, death o..."


In [26]:
# Write the cleaned comments without the frame index.
comments.to_csv(f"{path_prefix}/comments.csv", index=False)

In [44]:
# Preview the frame about to be written.
dupe_group_comments.head(n=2)

Unnamed: 0,group_id,comment_id
0,0,165765
1,0,163606


In [45]:
# Write the group <-> comment map without the frame index.
dupe_group_comments.to_csv(f"{path_prefix}/dupe_group_comments.csv", index=False)

In [24]:
# Preview the frame about to be written.
dupe_groups.head(n=2)

Unnamed: 0,id,sample_comment_text
0,0,Leave Trans Kids Alone This will hurt kids. ...
1,1,Strongly Oppose This will hurt kids. Don't b...


In [25]:
# Write one row per duplicate group without the frame index.
dupe_groups.to_csv(f"{path_prefix}/dupe_groups.csv", index=False)

In [48]:
# Preview the frame about to be written.
labeled.head(n=2)

Unnamed: 0,comment_id,oppose_mean_similarity,support_mean_similarity,ambig_mean_similarity,sentiment_mean_predict,match_title,match_key,match_mean_sentiment
0,203047,0.433871,0.340971,0.225158,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed
1,202557,0.437536,0.307242,0.255222,Opposed,Strongly Oppose,https://townhall.virginia.gov/L/viewcomments.c...,Opposed


In [49]:
# Write the sentiment classifications without the frame index.
labeled.to_csv(f"{path_prefix}/classifications.csv", index=False)