In [1]:
import pandas as pd
import plotly.express as px
import math
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from typing import Dict, List

In [93]:
TRAIN_PATH = "../data/raw/train2.tsv"
VAL_PATH = "../data/raw/val2.tsv"
TEST_PATH = "../data/raw/test2.tsv"

In [94]:
train_df = pd.read_csv(TRAIN_PATH, sep="\t", names=["id", "statement_json", "label", "statement", "subject", "speaker", "speaker_title", "state_info", "party_affiliation", "barely_true_count", "false_count", "half_true_count", "mostly_true_count", "pants_fire_count", "context", "justification"])
val_df = pd.read_csv(VAL_PATH, sep="\t", names=["id", "statement_json", "label", "statement", "subject", "speaker", "speaker_title", "state_info", "party_affiliation", "barely_true_count", "false_count", "half_true_count", "mostly_true_count", "pants_fire_count", "context", "justification"])
test_df = pd.read_csv(TEST_PATH, sep="\t", names=["id", "statement_json", "label", "statement", "subject", "speaker", "speaker_title", "state_info", "party_affiliation", "barely_true_count", "false_count", "half_true_count", "mostly_true_count", "pants_fire_count", "context", "justification"])

In [95]:
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 500

# Basic statistics

In [96]:
len(train_df)

10280

In [6]:
len(val_df)

1284

In [7]:
len(test_df)

1283

In [8]:
train_df.columns

Index(['id', 'statement_json', 'label', 'statement', 'subject', 'speaker',
       'speaker_title', 'state_info', 'party_affiliation', 'barely_true_count',
       'false_count', 'half_true_count', 'mostly_true_count',
       'pants_fire_count', 'context', 'justification'],
      dtype='object')

In [9]:
train_df.head()

Unnamed: 0,id,statement_json,label,statement,subject,speaker,speaker_title,state_info,party_affiliation,barely_true_count,false_count,half_true_count,mostly_true_count,pants_fire_count,context,justification
0,0.0,2635.json,false,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,"That's a premise that he fails to back up. Annie's List makes no bones about being comfortable with candidates who oppose further restrictions on late-term abortions. Then again, this year its backing two House candidates who voted for more limits."
1,1.0,10540.json,half-true,When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started when natural gas took off That started to begin in President (George W. ) Bushs administration. ""No doubt, natural gas has been gaining ground on coal in generating electricity. The trend started in the 1990s but clearly gained speed during the Bush administration when the production of natural gas -- a competitor of coal -- picked up. But analysts give little credit or blame to Bush for that trend. They note that other factors, such as technologic..."
2,2.0,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by voting to give George Bush the benefit of the doubt on Iran.""",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,"Obama said he would have voted against the amendment if he had been present. So though Clinton may have ""agreed"" with McCain on the issue, they did not technically vote the same way on it. To say that voting for Kyl-Lieberman is ""giving George Bush the benefit of the doubt on Iran"" remains a contentious issue. But Obama's main point is that Clinton and McCain were on the same side, and that is correct."
3,3.0,1123.json,false,Health care reform legislation is likely to mandate free sex change surgeries.,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,"The release may have a point that Mikulskis comment could open the door to ""medically necessary"" coverage which conceivably may include sex-change operations. But it's unclear whether her amendment will remain in the legislation, and there's nothing specific in the legislation on sex-change procedures and nothing else solid that indicates such coverage will be provided. The news release cherry-picked a few fleeting references to gender and sexual orientation in completely unrelated contexts ..."
4,4.0,9028.json,half-true,The economic turnaround started at the end of my term.,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround started at the end of my term. ""During Crists last year in office, Floridas economy experienced notable gains in personal income and industrial production, and more marginal improvements in the unemployment rate and in payroll employment. But GDP didnt grow again until Scott took office. Economists say Crist deserves some credit for the economic turnaround because he accepted federal stimulus dollars, but they add that any state is inevitably buffeted..."


In [10]:
# Drop rows with no label
train_df.dropna(subset=["label"], inplace=True)
len(train_df)

10268

In [11]:
# Normalized distribution of labels (roughly equal except for the flagrantly false statement "pants-fire")
train_df.label.value_counts(normalize=True)

half-true      0.206661
false          0.194585
mostly-true    0.191469
true           0.163907
barely-true    0.161375
pants-fire     0.082002
Name: label, dtype: float64

In [103]:
label_ratios = train_df.label.value_counts(normalize=True)
px.bar(label_ratios, x=label_ratios.index, y=label_ratios.values, labels={"index": "label", "y": "ratios"}, title="Label Distribution")

In [12]:
# Notice a huge number of speaker titles
train_df.speaker_title.nunique()

1187

In [13]:
train_df.speaker_title[train_df.speaker_title.notnull()]

0                                 State representative
1                                       State delegate
2                                            President
5                           Wisconsin Assembly speaker
7                                            President
                             ...                      
10268                                  President-Elect
10270                                          Senator
10271                      State Senator, 8th District
10272                      Senior editor, The Atlantic
10279    chairman of the Republican National Committee
Name: speaker_title, Length: 7367, dtype: object

In [14]:
# A lot of repetition in speaker_title - not canonicalized
train_df.speaker_title.value_counts()[:20]

President                        497
U.S. Senator                     480
Governor                         391
President-Elect                  274
U.S. senator                     263
Presidential candidate           254
Former governor                  180
U.S. Representative              172
Milwaukee County Executive       150
Senator                          148
State Senator                    108
U.S. representative              103
U.S. House of Representatives    102
Attorney                          81
Congressman                       80
Social media posting              78
Governor of New Jersey            78
Co-host on CNN's "Crossfire"      73
State Representative              73
State representative              66
Name: speaker_title, dtype: int64

In [15]:
train_df.speaker.value_counts()

barack-obama                   493
donald-trump                   274
hillary-clinton                239
mitt-romney                    180
scott-walker                   150
                              ... 
marilinda-garcia                 1
linda-finn                       1
freedom-religion-foundation      1
joe-sanfelippo                   1
dustin-inman-society             1
Name: speaker, Length: 2915, dtype: int64

In [16]:
train_df.speaker.nunique()

2915

In [99]:
affiliation_counts = train_df.party_affiliation.value_counts()
px.bar(affiliation_counts, x=affiliation_counts.index, y=affiliation_counts.values, labels={"index": "affiliation", "y": "counts"}, title="Counts Per Affiliation")

In [18]:
# Convert from 6-way scale to binary scale
def get_binary_label(label: str) -> bool:
    if label in {"pants-fire", "barely-true", "false"}:
        return False
    elif label in {"true", "half-true", "mostly-true"}:
        return True

In [19]:
train_df["binary_label"] = train_df.label.apply(get_binary_label)

In [20]:
party_groups = train_df.groupby(["party_affiliation"])

In [21]:
party_groups.get_group("republican").binary_label.value_counts(normalize=True)

True     0.502329
False    0.497671
Name: binary_label, dtype: float64

In [22]:
party_groups.get_group("democrat").binary_label.value_counts(normalize=True)

True     0.661584
False    0.338416
Name: binary_label, dtype: float64

In [23]:
train_df.binary_label.value_counts(normalize=True)

True     0.562037
False    0.437963
Name: binary_label, dtype: float64

In [24]:
unigram_lens = train_df.statement.str.split().str.len()

In [100]:
px.histogram(unigram_lens, x=unigram_lens.values, labels={"x": "unigram lens"}, title="Unigram Length Distribution")

In [26]:
unigram_lens.median()

17.0

In [27]:
unigram_lens.mean()

17.909329957148422

In [28]:
unigram_lens.max()

66

In [29]:
# Ran into some noisy labels for certain columns so have to remove it
train_df[train_df.pants_fire_count == "a television interview"]

Unnamed: 0,id,statement_json,label,statement,subject,speaker,speaker_title,state_info,party_affiliation,barely_true_count,false_count,half_true_count,mostly_true_count,pants_fire_count,context,justification,binary_label
6134,6132.0,1993.json,True,Georgia has the most restrictive ballot access laws in the country.\telections,mary-n,,,independent,0,0.0,0.0,0.0,0.0,a television interview,,,True


In [30]:
# Drop column with invalid type for `pants_fire`
train_df.drop(6134, inplace=True)

In [31]:
# Separate true samples from false ones
true_ex = train_df[train_df.binary_label == True]
false_ex = train_df[train_df.binary_label == False]

In [32]:
train_df.barely_true_count.describe()

count    10267.000000
mean        11.562287
std         19.007670
min          0.000000
25%          0.000000
50%          2.000000
75%         12.000000
max         70.000000
Name: barely_true_count, dtype: float64

In [33]:
# TODO (mihail): Include feature for credit history counts (binned)
barely_true_counts = train_df.barely_true_count.value_counts().sort_index()
px.bar(barely_true_counts, x=barely_true_counts.index, y=barely_true_counts.values, labels={"index": "credit", "y": "counts"}, title="Barely True Credit Distribution")

In [34]:
px.histogram(train_df, x="barely_true_count", labels={"x": "credit score"}, title="Barely True Credit Histogram", nbins=10)

In [35]:
barely_true_counts.values

array([3032, 1516,  817,  490,  236,  317,  190,  237,  171,  247,  104,
        289,  112,   50,  115,   70,   69,   63,  135,   56,  150,  115,
        142,  148,  117,  180,   93,  239,  274,  493])

In [36]:
train_df.false_count.describe()

count    10267.000000
mean        13.316353
std         24.142271
min          0.000000
25%          0.000000
50%          2.000000
75%         15.000000
max        114.000000
Name: false_count, dtype: float64

In [37]:
train_df.half_true_count.describe()

count    10267.000000
mean        17.198987
std         35.950911
min          0.000000
25%          0.000000
50%          3.000000
75%         13.000000
max        160.000000
Name: half_true_count, dtype: float64

In [38]:
train_df.mostly_true_count.describe()

count    10267.000000
mean        16.493718
std         36.254053
min          0.000000
25%          0.000000
50%          3.000000
75%         11.000000
max        163.000000
Name: mostly_true_count, dtype: float64

In [39]:
train_df.pants_fire_count.describe()

count     10267
unique       27
top         0.0
freq       4769
Name: pants_fire_count, dtype: object

In [40]:
train_df.pants_fire_count.astype(float).describe()

count    10267.000000
mean         6.203954
std         16.119038
min          0.000000
25%          0.000000
50%          1.000000
75%          5.000000
max        105.000000
Name: pants_fire_count, dtype: float64

In [41]:
true_ex.statement.str.split().str.len().describe()

count    5770.000000
mean       18.337782
std         7.798941
min         2.000000
25%        13.000000
50%        17.000000
75%        23.000000
max        66.000000
Name: statement, dtype: float64

In [42]:
false_ex.statement.str.split().str.len().describe()

count    4497.000000
mean       17.360907
std         7.667368
min         2.000000
25%        12.000000
50%        16.000000
75%        22.000000
max        60.000000
Name: statement, dtype: float64

In [43]:
# Sample true and false examples to observe characteristics
true_ex.sample(frac=0.2).head(25)

Unnamed: 0,id,statement_json,label,statement,subject,speaker,speaker_title,state_info,party_affiliation,barely_true_count,false_count,half_true_count,mostly_true_count,pants_fire_count,context,justification,binary_label
6894,6891.0,4168.json,mostly-true,"The average 401(k) in America of a person whos 60 years old is under $100,000.","economy,labor,retirement,state-finances",gina-raimondo,venture capital company founder,Rhode Island,democrat,2.0,1.0,2.0,5.0,1.0,an appearance on 10 News Conference,"Goodlatte said, ""Before taking executive action on immigration, President Obama stated 22 times that he does not have the authority to change immigration laws on his own. ""Records offered by Goodlatte and other Republicans show Obama repeatedly has made such statements. But the congressman goes a little off course in trying to quantify the times the president has said so. In a handful of the instances, the president was talking in general terms about executive authority that he did not relat...",True
366,366.0,11510.json,true,"Electric car sales in Georgia have dropped dramatically since a $5,000 tax credit was eliminated and a $200 annual registration fee was imposed July 1.","state-budget,taxes,transportation",don-francis,coordinator of the Clean Cities-Georgia Coalition,,none,0.0,0.0,0.0,0.0,0.0,an interview with Watchdog.org,"That statement is backed up by data available so far on new car registrations and new car sales. Keep in mind that car sales spiked as people rushed to buy before the credit expired July 1, and that makes the post-July 1 drop in sales look that much more shocking. Whether that trend holds remains to be seen. But everything we found so far indicates Francis is on the money.",True
8048,8037.0,8767.json,mostly-true,"If you make the average amount of people in Wisconsin, $50,000, you got $1.60 less a week in taxes under the state income-tax cut, but it didnt show up in your paycheck.","state-budget,taxes",kathleen-vinehout,,,democrat,1.0,1.0,1.0,1.0,0.0,a speech,"Sanders told Tapper that his campaign has released his past tax returns. Sanders said his 2015 return is forthcoming, and he has made other types of disclosures in the past, including congressional and presidential-candidate filings. But his releases of prior-year tax forms -- the specific documents Tapper asked about -- have been limited in several ways. Hes only released information for one year, which pales compared to most other recent presidential candidates, and even that years release...",True
3972,3970.0,8875.json,mostly-true,Says 85 percent of IDEA charter-school students come from low-income families and about 99 percent of its graduates go to college.,"children,education",greg-abbott,governor,Texas,republican,11.0,3.0,8.0,6.0,5.0,as quoted during a school visit,"According to the EPA, it is. While carbon dioxide is good for plants, increased amounts of it in our atmosphere will have auxiliary effects that are decidedly bad for ecosystems.",True
2376,2374.0,4746.json,true,"Every day in the United States, we are losing 15 factories.","economy,jobs,taxes",betty-sutton,U.S. representative from Ohio's 13th District,Ohio,democrat,0.0,2.0,1.0,2.0,1.0,a speech in the House of Representatives,"As his benchmark, Santorum chose defense spending as a percentage of the federal budget. He could have chosen other measures, such as defense spending as a percentage of GDP or annual spending in inflation-adjusted dollars, and they would have showed different patterns. But we think the measurement is credible, and his choice of it is reasonable.",True
10046,10035.0,12978.json,mostly-true,"Says Donald Trump himself contributed $100,000 to the Clinton Foundation.",candidates-biography,david-plouffe,Senior Adviser to the President,,democrat,1.0,0.0,0.0,2.0,0.0,an interview on Meet the Press,"Lynne Derbyshire, arguing against arming campus police in Rhode Island, said, ""If you are black or brown, you are nine times more likely to be stopped and frisked"" by police in New York City. The context in which she made her statement suggests that such racial profiling is a threat for students here. We don't believe that population-wide data from the nation's largest city, which has had an aggressive stop and frisk policy for more than a decade, are particularly relevant to a debate involv...",True
1153,1153.0,6442.json,true,Georgia spends fewer tourism advertising dollars than any other state in the South.,"state-budget,tourism",ron-stephens,State Representative,Georgia,republican,0.0,0.0,0.0,0.0,0.0,an article,"Our ruling: Stephens said that Georgias state government spends fewer tourism advertising dollars than any other in the South. He was right in the metrics that matter most: per capita spending, and spending as a percentage of state budget. In dollars, Stephens was close.",True
6744,6741.0,11209.json,mostly-true,Peachtree and Pine is one of the leading sites for tuberculosis in the nation.,public-health,kasim-reed,,,democrat,1.0,0.0,5.0,7.0,1.0,a speech at the Commerce Club,"Obama said todays deficit is ""below the average deficits over the past 40 years. ""Hes correct when measuring the deficit as a percentage of GDP, which is the yardstick most economists tend to use when making deficit comparisons over long periods of time. Still, Obama didnt specify that he was talking about deficits as a percentage of GDP, and the comparisons accuracy depends on the absence of major budgetary and economic changes between now and time the fiscal 2015 data are finalized.",True
9990,9979.0,1559.json,half-true,Texas is the fastest growing state with the most Fortune 500 companies.,job-accomplishments,rick-perry,Governor,Texas,republican,30.0,30.0,42.0,23.0,18.0,a Web ad,"Haridopolos said incumbent victories aren't necessarily tied to redistricting. The winners could simply have run better campaigns or enjoyed higher name recognition, he said. But our concern is the statement in question, which has to do with statistics, not cause and effect. Fair Districts Florida got it right.",True
9529,9518.0,1135.json,true,Health care reform will not affect veterans' benefits.,"health-care,veterans",david-axelrod,,,democrat,2.0,1.0,6.0,7.0,0.0,in a chain e-mail,"Cruz said that ""under Barack Obama and the big government economy, the median wage for women has dropped $733. ""Wages actually have risen for women since Obama took office. Cruz's spokesperson said Cruz intended to use a different phrase. But at PolitiFact, we say words matter.",True


In [44]:
false_ex.sample(frac=0.2).head(25)

Unnamed: 0,id,statement_json,label,statement,subject,speaker,speaker_title,state_info,party_affiliation,barely_true_count,false_count,half_true_count,mostly_true_count,pants_fire_count,context,justification,binary_label
7904,7893.0,7297.json,false,"Under President Barack Obama, 8.3 (million) fewer Americans are working today than there were four years ago.","economy,jobs,pundits",sean-hannity,Radio/TV host,,none,3.0,5.0,3.0,2.0,1.0,his Fox News show,"Is it possible that ACORN and Project Vote were going about the same business in Illinois in 1992?Both groups have the stated goal of getting people to register to vote, especially low-income, minority and young voters.",False
5859,5857.0,3747.json,false,"George Allen had 40,000 earmarks while he was a senator.",federal-budget,jamie-radtke,,Virginia,republican,1.0,1.0,1.0,2.0,0.0,a TV interview.,"Still, its inspector admittedly slipped up in this case. That could have prevented the 2009 illnesses and deaths.",False
9209,9198.0,7762.json,false,On an early date for Floridas presidential primary,elections,marco-rubio,U.S. Senator,Florida,republican,33.0,24.0,32.0,35.0,5.0,a position on legislation,"""Between 1999 and 2001, our human agent base against the terrorist target grew by over 50 percent,"" he said. Clinton did reduce the intelligence budget early in his tenure. But Tenet never placed the blame squarely on Clinton's shoulders, as Giuliani said he did.",False
1826,1826.0,5894.json,false,On support for gay marriage.,"civil-rights,families,gays-and-lesbians,marriage",barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,a TV interview,"Obama said the Justice Department has ""taken on more than 100 voting rights cases since 2009. ""Obamas statement was somewhat vague, because he didnt explain during his speech what he meant by ""taken on"" cases. Some listeners could have interpreted that to mean all cases in which the department was the plaintiff. That's not the case though; the Justice Department has a list of 102 cases that include instances in which the department is the plaintiff, the defendant or played some other role.",False
8770,8759.0,2423.json,barely-true,The size of our federal government has more than doubled since the year 2000.,federal-budget,scott-bruun,State representative,Oregon,republican,2.0,1.0,0.0,0.0,0.0,a statement on his website,"Austins school calendar has about 34 separate state-related testing dates, not 64. Also, many of the state tests are penciled in for the same dates. In addition, significantly, no students are taking tests on all 34 dates or, really, any number of dates close to that. According to the district calendar, state-mandated testing was a possibility on at least one day in six of the 10 months from mid-October 2011 through mid-July 2012.",False
1485,1485.0,4482.json,pants-fire,"Says Rick Perry is spending more money than the state takes in, covering his deficits with record borrowing.","deficit,state-budget",keep-conservatives-united,Super-PAC,North Carolina,republican,1.0,0.0,0.0,0.0,1.0,an online video ad.,"Cruz said that Ukraine agreed to give up the nuclear weapons on its territory because the United States promised to ""ensure its territorial integrity. ""A diplomat with detailed knowledge of the American position noted that the United States went to pains to avoid making a strong guarantee to protect Ukraines borders. The formal memorandum of security assurances lacks a promise to ensure that Ukraines borders remain unchanged. The United States only promised to respect those borders itself, a...",False
1514,1514.0,5798.json,barely-true,"The Wisconsin law repealed by Republicans, which allowed discriminated workers to sue in state court, was kind of a gravy train for lawyers.","civil-rights,disability,legal-issues,women,workers",scott-walker,Milwaukee County Executive,Wisconsin,republican,26.0,41.0,32.0,40.0,11.0,an interview,"Cain said the 9-9-9 plan ""does not raise taxes on those that are making the least. ""But it would raise income taxes on people who now have low tax burdens due to exemptions and deductions. The Tax Policy Center analysis adds more detail and found that high percentages of lower-income tax filers would see tax increases.",False
9108,9097.0,11003.json,false,Lincoln was fervently making plans to send all freed slaves to the jungles of Central America once the (Civil War) was over. The only thing that kept this from happening was his assassination.,"diversity,history",blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,posts on the Internet,"We acknowledge that there are a range of opinions on this matter, but it has become clearer since our prior attempts to tackle this question that the ratings agencies are willing to overlook any defaults beyond those involving bondholders when they measure the nation's creditworthiness. By virtue of their position, the ratings agencies carry significant weight on this question. Still, we feel Rubio wrongly downplayed the likelihood of technical defaults on other creditors, such as federal co...",False
9929,9918.0,10136.json,barely-true,"Rep. Carol Shea-Porter votes with Nancy Pelosis Democrats 95 percent of the time, but Frank Guinta will take on both parties and has independent New Hampshire values.",voting-record,national-republican-congressional-committee,,,republican,18.0,9.0,8.0,5.0,8.0,an ad,"Obamas own writings, independent biographies and news articles show he had a secular childhood. Sure, he went to an Indonesian school where most of the children were Muslim and there are accounts that he sometimes attended a mosque with his Islamic stepfather.",False
9943,9932.0,7953.json,barely-true,Says unsuccessful Texas abortion legislation was supported by a majority of Texans.,"abortion,polls",jerry-patterson,state land commissioner,Texas,republican,1.0,1.0,1.0,3.0,0.0,an email blast.,"President Obama, who often refers to being our brothers keepers, has a half-brother, George Obama, living in a Nairobi slum and, as of 2011, a Kenyan aunt who sold charcoal by the road. The president gave the aunt some money in 2006, according to Jacobs account. There appears to be no sign of his otherwise helping his aunt or half-brother. Finally, Zedler's statement implies the president is not addressing distant family needs. Notably, too, there appear to be no signs of the president revea...",False


In [45]:
stripped = false_ex.state_info.copy().str.strip()
false_ex.loc[:, "state_info"] = stripped

stripped = true_ex.state_info.copy().str.strip()
true_ex.loc[:, "state_info"] = stripped



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [46]:
# Clean up the variants of state info 
CANONICAL_TO_VARIANTS = {
    "Tennessee": {"Tennessee", "Tennesse"},
    "Washington D.C.": {"District of Columbia", "Washington D.C.", "Washington, D.C.", "Washington DC"},
    "Texas": {"Tex", "Texas"}, 
    "Washington": {"Washington", "Washington state"},
    "Virginia": {"Virginia", "Virgina", "Virgiia"},
    "Pennsylvania": {"Pennsylvania", "PA - Pennsylvania"},
    "Rhode Island": {"Rhode Island", "Rhode island"},
    "Ohio": {"Ohio", "ohio"}
}

def get_variant_to_canonical(can_to_var: Dict):
    variant_to_canonical = {}
    for canonical, variant in can_to_var.items():
        for var in variant:
            variant_to_canonical[var] = canonical
    return variant_to_canonical

variant_to_canonical = get_variant_to_canonical(CANONICAL_TO_VARIANTS)

In [47]:
def clean_variant(state_info, variant_to_canonical):
    if state_info in variant_to_canonical.keys():
        return variant_to_canonical[state_info]
    return state_info

true_ex.loc[:, "state_info"] = true_ex.state_info.apply(lambda x: clean_variant(x, variant_to_canonical))
false_ex.loc[:, "state_info"] = false_ex.state_info.apply(lambda x: clean_variant(x, variant_to_canonical))

# State Info Distribution
Takeaway from below seems to be that no state is considerably more inclined to "True" or "False" statements (top in each category are roughly the same)

In [48]:
state_true_counts = true_ex.state_info.value_counts()
px.bar(state_true_counts, x=state_true_counts.index, y=state_true_counts.values, labels={"index": "state", "y": "counts"}, title="True Statement State Distribution")

In [49]:
state_false_counts = false_ex.state_info.value_counts()
px.bar(state_false_counts, x=state_false_counts.index, y=state_false_counts.values, labels={"index": "state", "y": "counts"}, title="False Statement State Distribution")

In [50]:
def get_top_ngrams(corpus, ngram_len: int=1, num: int=None) -> List:
    vec = CountVectorizer(ngram_range=(ngram_len, ngram_len), stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:num]

# True Unigrams

In [51]:
top_unigrams_true = get_top_ngrams(true_ex.statement, 1, 30)
top_unigrams_true

[('says', 1217),
 ('percent', 840),
 ('state', 508),
 ('000', 467),
 ('year', 397),
 ('tax', 386),
 ('years', 382),
 ('states', 359),
 ('million', 356),
 ('people', 337),
 ('obama', 337),
 ('health', 301),
 ('jobs', 290),
 ('president', 287),
 ('new', 266),
 ('texas', 244),
 ('care', 230),
 ('taxes', 228),
 ('billion', 225),
 ('country', 223),
 ('federal', 204),
 ('united', 202),
 ('said', 194),
 ('rate', 186),
 ('budget', 186),
 ('10', 177),
 ('pay', 177),
 ('voted', 176),
 ('time', 171),
 ('government', 163)]

In [52]:
true_state_distr = pd.DataFrame(top_unigrams_true, columns=["unigram", "count"])

In [53]:
px.bar(true_state_distr, x="unigram", y="count", title="Top True Unigrams Frequency")


# False Unigrams

In [54]:
top_unigrams_false = get_top_ngrams(false_ex[false_ex.statement.notnull()].statement.str.lower(), num=30)
false_state_distr = pd.DataFrame(top_unigrams_false, columns=["unigram", "count"])
px.bar(false_state_distr, x="unigram", y="count", title="Top False Unigrams Frequency")

# False Bigrams

In [55]:
top_bigrams_false = get_top_ngrams(false_ex[false_ex.statement.notnull()].statement.str.lower(), ngram_len=2, num=30)
false_state_distr = pd.DataFrame(top_bigrams_false, columns=["bigram", "count"])
px.bar(false_state_distr, x="bigram", y="count", title="Top False Bigrams Frequency")

# True Bigrams

In [56]:
top_bigrams_true = get_top_ngrams(true_ex[true_ex.statement.notnull()].statement.str.lower(), ngram_len=2, num=30)
true_state_distr = pd.DataFrame(top_bigrams_true, columns=["bigram", "count"])
px.bar(true_state_distr, x="bigram", y="count", title="Top True Bigrams Frequency")

# Distribution of True vs. False Unigram Lens
- true examples are slightly longer
- bucketize length

In [57]:
true_ex.statement.str.split().str.len().describe()

count    5770.000000
mean       18.337782
std         7.798941
min         2.000000
25%        13.000000
50%        17.000000
75%        23.000000
max        66.000000
Name: statement, dtype: float64

In [58]:
false_ex.statement.str.split().str.len().describe()

count    4497.000000
mean       17.360907
std         7.667368
min         2.000000
25%        12.000000
50%        16.000000
75%        22.000000
max        60.000000
Name: statement, dtype: float64

In [59]:
train_df.statement.str.split().str.len().describe()

count    10267.000000
mean        17.909906
std          7.756370
min          2.000000
25%         12.000000
50%         17.000000
75%         22.000000
max         66.000000
Name: statement, dtype: float64

# Topic Modeling

- Using latent semantic analysis which performs a singular value decomposition on a document-term matrix

## Analysis

In True statements:
- Topic 2 deals with healthcare
- Topic 3 deals with jobs
- Topic 4 deals with taxes

## Limitations of LSA
- can't capture polysemy
- difficult to interpret
- bag-of-words representations are limited

In [60]:
def print_topics(model: TruncatedSVD, vectorizer: TfidfVectorizer, top_n: int=10) -> None:
    for idx, topic in enumerate(model.components_):
        print("Topic %d: " % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
        print("\n")
        
def run_lsa_and_print_topics(df: pd.DataFrame, num_topics: int=5, num_words: int=5) -> None:
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
    lsa_model = TruncatedSVD(n_components=num_topics)
    tfidf_transformed = tfidf_vectorizer.fit_transform(df.statement)
    lsa_transform = lsa_model.fit_transform(tfidf_transformed)
    print_topics(lsa_model, tfidf_vectorizer)

In [61]:
run_lsa_and_print_topics(true_ex)

Topic 0: 
[('percent', 0.34464524012281295), ('says', 0.30132880648218413), ('tax', 0.17585257561512502), ('000', 0.17534865441102257), ('state', 0.1743821531756155), ('years', 0.1638552485840526), ('year', 0.15978257526126505), ('health', 0.1521997932095661), ('jobs', 0.15111344244457206), ('obama', 0.1502565628492749)]


Topic 1: 
[('percent', 0.7147932277701732), ('income', 0.12292209559014365), ('rate', 0.11947746156087194), ('40', 0.07836459868284175), ('unemployment', 0.07249685594090727), ('highest', 0.06126899271791158), ('states', 0.06052914167731957), ('90', 0.055331297705095484), ('10', 0.054496853315023394), ('pay', 0.0528855047620674)]


Topic 2: 
[('health', 0.6095048304030245), ('care', 0.5181143381533385), ('insurance', 0.19926938355170798), ('percent', 0.1863990020367275), ('plan', 0.09857701298125203), ('americans', 0.08701353302014009), ('reform', 0.07815927428119611), ('law', 0.06629286306519296), ('coverage', 0.06455653402501327), ('people', 0.05571378367086508)]



In [62]:
run_lsa_and_print_topics(false_ex)

Topic 0: 
[('says', 0.3714031210752551), ('health', 0.25917374999504533), ('obama', 0.24357514528885899), ('care', 0.23890482796898854), ('president', 0.20407151781973248), ('percent', 0.1816533702375591), ('tax', 0.1811791626006288), ('barack', 0.180412359933636), ('state', 0.153691618323912), ('000', 0.13400312098799377)]


Topic 1: 
[('health', 0.536210716576261), ('care', 0.5094262003452869), ('law', 0.15795738233166845), ('insurance', 0.1189815077865269), ('government', 0.0948405139293412), ('reform', 0.07583208914639702), ('plan', 0.0571723780007409), ('affordable', 0.05232879396516892), ('takeover', 0.049964987377783224), ('federal', 0.04730984644222644)]


Topic 2: 
[('obama', 0.43365698510629047), ('barack', 0.3683637278155912), ('president', 0.3557462438849415), ('health', 0.2529532448451656), ('care', 0.24143428231060846), ('obamas', 0.0743852714467124), ('law', 0.06291036690344873), ('muslim', 0.04879033398556757), ('insurance', 0.03653688942513638), ('going', 0.03392811098

In [63]:
run_lsa_and_print_topics(true_ex, num_topics=10, num_words=8)

Topic 0: 
[('percent', 0.3446706904479291), ('says', 0.3012801716252003), ('tax', 0.17585723482694054), ('000', 0.175331706570276), ('state', 0.17435193024215284), ('years', 0.1638010626536184), ('year', 0.1596352386078747), ('health', 0.15220917937468056), ('jobs', 0.15121880694779619), ('obama', 0.15020245916893288)]


Topic 1: 
[('percent', 0.7144389829488037), ('rate', 0.1289961999758036), ('income', 0.12061196992370224), ('40', 0.07790485426876723), ('unemployment', 0.07687061294476774), ('highest', 0.06652335731549686), ('10', 0.056451000971364906), ('90', 0.05500323718795707), ('states', 0.053377502555588596), ('50', 0.0499145749890012)]


Topic 2: 
[('health', 0.6122457983306565), ('care', 0.5125264317116631), ('insurance', 0.2105758313924654), ('percent', 0.17731581566931748), ('americans', 0.09513090481981622), ('plan', 0.09200699626883704), ('reform', 0.07581240576182985), ('law', 0.06767144894333524), ('coverage', 0.06666569806115678), ('people', 0.062055756319223755)]


To

In [64]:
run_lsa_and_print_topics(false_ex, num_topics=10, num_words=8)

Topic 0: 
[('says', 0.37132038335314327), ('health', 0.25916635478647465), ('obama', 0.24367337085945728), ('care', 0.2389335236519988), ('president', 0.2039449098186809), ('percent', 0.18162043036200315), ('tax', 0.18122862875066104), ('barack', 0.18043899952230652), ('state', 0.15371986370200139), ('000', 0.13401947741903353)]


Topic 1: 
[('health', 0.536457977934852), ('care', 0.5094987564748457), ('law', 0.15921015775880884), ('insurance', 0.11926931525611946), ('government', 0.09624801524878407), ('reform', 0.07618726446444307), ('plan', 0.056013700434220055), ('affordable', 0.052284630247438936), ('takeover', 0.0500274385592933), ('federal', 0.04908144865211871)]


Topic 2: 
[('obama', 0.4312580214650035), ('barack', 0.36454493220384065), ('president', 0.35689651899329705), ('health', 0.2522787501045307), ('care', 0.24038910961271803), ('obamas', 0.07395172888089133), ('law', 0.06380600690980591), ('muslim', 0.04833696551446298), ('government', 0.037888079745267464), ('insurance

# Sentiment Analysis
- Utilizing Vader sentiment analysis: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf
- Not too much of a difference between true and false statements
- Perhaps a bit in the compound metric

In [65]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [66]:
analyzer = SentimentIntensityAnalyzer()

In [67]:
def extract_compound_sentiment(statement: str) -> float:
    return analyzer.polarity_scores(statement)["compound"]

def extract_pos_sentiment(statement: str) -> float:
    return analyzer.polarity_scores(statement)["pos"]

def extract_neg_sentiment(statement: str) -> float:
    return analyzer.polarity_scores(statement)["neg"]

def extract_neu_sentiment(statement: str) -> float:
    return analyzer.polarity_scores(statement)["neu"]

In [68]:
true_compound = true_ex.statement.apply(lambda x: extract_compound_sentiment(x))

In [69]:
px.histogram(true_compound, x=true_compound.values, labels={"x": "polarity"}, title="True Compound Polarity")

In [70]:
true_compound.describe()

count    5770.000000
mean       -0.019625
std         0.394974
min        -0.974400
25%        -0.296000
50%         0.000000
75%         0.250000
max         0.942300
Name: statement, dtype: float64

In [71]:
true_pos = true_ex.statement.apply(lambda x: extract_pos_sentiment(x))

In [72]:
true_neg = true_ex.statement.apply(lambda x: extract_neg_sentiment(x))

In [73]:
false_compound = false_ex.statement.apply(lambda x: extract_compound_sentiment(x))
false_pos = false_ex.statement.apply(lambda x: extract_pos_sentiment(x))
false_neg = false_ex.statement.apply(lambda x: extract_neg_sentiment(x))

In [74]:
px.histogram(false_compound, x=false_compound.values, labels={"x": "polarity"}, title="False Compound Polarity")

In [75]:
false_compound.describe()

count    4497.000000
mean       -0.002767
std         0.382078
min        -0.973500
25%        -0.273200
50%         0.000000
75%         0.273200
max         0.937100
Name: statement, dtype: float64

In [76]:
true_pos.describe()

count    5770.000000
mean        0.066705
std         0.095489
min         0.000000
25%         0.000000
50%         0.000000
75%         0.124000
max         0.598000
Name: statement, dtype: float64

In [77]:
false_pos.describe()

count    4497.000000
mean        0.073696
std         0.102106
min         0.000000
25%         0.000000
50%         0.000000
75%         0.137000
max         0.672000
Name: statement, dtype: float64

In [78]:
true_neg.describe()

count    5770.000000
mean        0.075679
std         0.110518
min         0.000000
25%         0.000000
50%         0.000000
75%         0.137000
max         0.796000
Name: statement, dtype: float64

In [79]:
false_neg.describe()

count    4497.000000
mean        0.074169
std         0.110151
min         0.000000
25%         0.000000
50%         0.000000
75%         0.138000
max         0.783000
Name: statement, dtype: float64