In [40]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score


In [25]:
# Load the dataset and clean it

df = pd.read_csv("train.tsv", sep="\t")

df.drop(columns=["2635.json", "dwayne-bohac", "State representative", "Texas", "republican", "a mailer"], inplace=True)
df.rename(columns={"": "Index",
                    "false": "correctness",
                    "Says the Annies List political group supports third-trimester abortions on demand.": "Text",
                    "abortion": "Theme",
                    "0": "barely true counts",
                    "1": "false counts",
                    "0.1": "half true counts",
                    "0.2": "mostly true counts",
                    "0.3": "pants on fire counts",}, inplace=True)

# download necessary NLTK resources (only once)
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("omw-1.4")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger_eng")

# set of stop words to filter out
stop_words = set(stopwords.words("english"))

# initialize lemmatizer
lemmatizer = nltk.WordNetLemmatizer()

# clean text from stop words and lemmatize
df["Cleaned text"] = df["Text"].apply(
    lambda x: " ".join(
        [word for word in word_tokenize(str(x).lower()) if word.isalnum() and word not in stop_words]
    )
)

df["Cleaned text"] = df["Cleaned text"].apply(
    lambda x: " ".join(
        [lemmatizer.lemmatize(w) for w in word_tokenize(str(x).lower()) if w.isalnum()]
    )
)

df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       d

Unnamed: 0,correctness,Text,Theme,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,Cleaned text
0,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",0.0,0.0,1.0,1.0,0.0,decline coal start started natural gas took st...
1,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,70.0,71.0,160.0,163.0,9.0,hillary clinton agrees john mccain voting give...
2,false,Health care reform legislation is likely to ma...,health-care,7.0,19.0,3.0,5.0,44.0,health care reform legislation likely mandate ...
3,half-true,The economic turnaround started at the end of ...,"economy,jobs",15.0,9.0,20.0,19.0,2.0,economic turnaround started end term
4,true,The Chicago Bears have had more starting quart...,education,0.0,3.0,2.0,5.0,1.0,chicago bear starting quarterback last 10 year...


In [26]:
# Get labels from the 'corectness' column
label_order = ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']
label_map = {label: idx for idx, label in enumerate(label_order)}

# Encode the labels
df['label'] = df['correctness'].map(label_map)


In [27]:
corpus = df["Cleaned text"]
corpus


0        decline coal start started natural gas took st...
1        hillary clinton agrees john mccain voting give...
2        health care reform legislation likely mandate ...
3                     economic turnaround started end term
4        chicago bear starting quarterback last 10 year...
                               ...                        
10234    larger number shark attack florida case voter ...
10235       democrat become party atlanta metro area black
10236    say alternative social security operates galve...
10237           lifting cuban embargo allowing travel cuba
10238    department veteran affair manual telling veter...
Name: Cleaned text, Length: 10239, dtype: object

In [28]:
v = TfidfVectorizer()
transformed_output = v.fit_transform(corpus)
print(v.vocabulary_)



In [29]:
feature_names = v.get_feature_names_out()

for word in feature_names[1000:1100]:
    indx = v.vocabulary_.get(word)
    print(f"{word}: {v.idf_[indx]}")

attached: 9.13544460992539
attack: 6.17361388804708
attacked: 7.5260066974912885
attacker: 9.13544460992539
attacking: 8.442297429365443
attempt: 7.749150248805498
attempted: 7.931471805599453
attempting: 9.13544460992539
attend: 7.669107541131962
attendance: 8.442297429365443
attended: 8.442297429365443
attending: 9.540909718033554
attention: 8.624618986159398
attitude: 8.847762537473608
attorney: 6.520484831889191
attract: 9.540909718033554
attracted: 8.847762537473608
attracting: 9.13544460992539
attribute: 9.540909718033554
attributed: 8.624618986159398
attrition: 9.13544460992539
atvs: 9.540909718033554
atwaters: 9.13544460992539
au: 9.540909718033554
audience: 8.847762537473608
audit: 8.036832321257279
auditing: 9.540909718033554
auditor: 8.624618986159398
august: 8.847762537473608
aunt: 9.13544460992539
auschwitz: 9.540909718033554
ausleys: 9.540909718033554
austin: 5.929991805389329
austinincluding: 9.540909718033554
austinites: 8.847762537473608
australia: 8.847762537473608
au

In [30]:
tfidf_df = pd.DataFrame(transformed_output.toarray(), columns=feature_names)
# tfidf_df.iloc[0].sort_values(ascending=False).head(10)
tfidf_df.head()


Unnamed: 0,02,05,09,10,100,100th,103,104,105,106,...,zimmerman,zinn,zip,zippo,zombie,zone,zoning,zoo,zuckerberg,zuckerbergs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.177127,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
X = tfidf_df.values
y = df['label']

In [None]:
clf = LogisticRegression(max_iter=2000)
clf.fit(X, y)


In [34]:
y_pred = clf.predict(X)


In [42]:
mse = mean_squared_error(y, y_pred)
accuracy = accuracy_score(y, y_pred)
accuracy

0.6452778591659342