In [None]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


In [None]:
# Load the dataset and clean it

# train.tsv has no header row. The labels pandas used to infer ("2635.json",
# "false", a full statement sentence, "0", "1", "0.1", ...) were the values of
# the first record, so that record never reached the training data. Reading
# with header=None and naming the columns explicitly keeps every row.
# NOTE(review): the column order below follows the LIAR dataset README;
# confirm it if train.tsv comes from a different source.
TSV_COLUMNS = [
    "id",
    "correctness",
    "Text",
    "Theme",
    "speaker",
    "job title",
    "state",
    "party",
    "barely true counts",
    "false counts",
    "half true counts",
    "mostly true counts",
    "pants on fire counts",
    "context",
]
# Metadata columns that the original version of this cell dropped.
UNUSED_COLUMNS = ["id", "speaker", "job title", "state", "party", "context"]

raw_df = pd.read_csv("train.tsv", sep="\t", header=None)
if raw_df.shape[1] != len(TSV_COLUMNS):
    # Fail loudly instead of silently mislabelling columns.
    raise ValueError(
        f"train.tsv has {raw_df.shape[1]} columns, expected {len(TSV_COLUMNS)}"
    )
raw_df.columns = TSV_COLUMNS

# Build a new frame rather than mutating in place so the cell can be re-run.
df = raw_df.drop(columns=UNUSED_COLUMNS)

# download necessary NLTK resources (only once)
for resource in (
    "punkt",
    "punkt_tab",
    "stopwords",
    "omw-1.4",
    "wordnet",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource)

# set of stop words to filter out
stop_words = set(stopwords.words("english"))

# initialize lemmatizer
lemmatizer = nltk.WordNetLemmatizer()


def clean_text(text):
    """Return ``text`` lowercased, tokenized, filtered and lemmatized.

    The input is converted with ``str`` and lowercased before tokenizing.
    Tokens that are not purely alphanumeric, or that are English stop words,
    are dropped; the remaining tokens are lemmatized and joined with single
    spaces.
    """
    # One pass is enough: the previous version tokenized once to filter and
    # then re-tokenized the already cleaned string just to lemmatize it.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(str(text).lower())
        if token.isalnum() and token not in stop_words
    )


# clean text from stop words and lemmatize
df["Cleaned text"] = df["Text"].apply(clean_text)

df.head()

In [None]:
# Get labels from the 'correctness' column.
# The labels are listed in ordinal order so that each one is encoded as
# its position in the list (0 for 'pants-fire' up to 5 for 'true').
label_order = ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']
label_map = dict(zip(label_order, range(len(label_order))))

# Encode the labels
df['label'] = df['correctness'].map(label_map)

# Series.map yields NaN for any value missing from label_map. Surface that
# here with the offending values instead of letting NaN labels reach the model.
unmapped = df.loc[df['label'].isna(), 'correctness'].unique().tolist()
if unmapped:
    raise ValueError(f"Unexpected values in 'correctness' column: {unmapped}")


In [None]:
# Documents to vectorize: the cleaned statement text, one entry per row.
corpus = df.loc[:, "Cleaned text"]
corpus


In [None]:
# Learn the vocabulary and IDF weights from the cleaned statements and
# build the document-term TF-IDF matrix.
v = TfidfVectorizer()
transformed_output = v.fit_transform(corpus)

# Printing the whole vocabulary_ dict floods the cell output with one entry
# per term, so report its size and a small sample of term -> column index.
VOCAB_PREVIEW_SIZE = 20
print(f"Vocabulary size: {len(v.vocabulary_)}")
print(dict(list(v.vocabulary_.items())[:VOCAB_PREVIEW_SIZE]))

In [None]:
# Terms ordered by vocabulary index, so position i lines up with v.idf_[i].
feature_names = v.get_feature_names_out()

# Print the learned IDF weight for the terms at positions 1000-1099.
window = slice(1000, 1100)
for term, idf_weight in zip(feature_names[window], v.idf_[window]):
    print(f"{term}: {idf_weight}")

In [None]:
# Expand the TF-IDF matrix into a dense frame with one column per term so
# individual scores can be inspected by name.
# (To see the ten highest-scoring terms of the first document, sort
# tfidf_df.iloc[0] in descending order and take its head.)
tfidf_df = pd.DataFrame(
    data=transformed_output.toarray(),
    columns=feature_names,
)
tfidf_df.head()


In [None]:
# Feature matrix: the dense TF-IDF scores as a NumPy array.
X = tfidf_df.to_numpy()
# Target vector: the encoded 'label' column.
y = df.loc[:, 'label']

In [None]:
# fit() returns the estimator itself, so build and train in one expression;
# the bare name on the last line keeps the fitted model as the cell output.
clf = LogisticRegression().fit(X, y)
clf