In [1]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [41]:
# Load the dataset and clean it

df = pd.read_csv("train.tsv", sep="\t")

df.drop(columns=["2635.json", "dwayne-bohac", "State representative", "Texas", "republican", "a mailer"], inplace=True)
df.rename(columns={"": "Index",
                    "false": "correctness",
                    "Says the Annies List political group supports third-trimester abortions on demand.": "Text",
                    "abortion": "Theme",
                    "0": "barely true counts",
                    "1": "false counts",
                    "0.1": "half true counts",
                    "0.2": "mostly true counts",
                    "0.3": "pants on fire counts",}, inplace=True)

# download necessary NLTK resources (only once)
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

df["Cleaned text"] = df["Text"].apply(
    lambda x: " ".join(
        [word for word in word_tokenize(str(x).lower()) if word.isalnum() and word not in stop_words]
    )
)

df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,correctness,Text,Theme,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,Cleaned text
0,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",0.0,0.0,1.0,1.0,0.0,decline coal start started natural gas took st...
1,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,70.0,71.0,160.0,163.0,9.0,hillary clinton agrees john mccain voting give...
2,false,Health care reform legislation is likely to ma...,health-care,7.0,19.0,3.0,5.0,44.0,health care reform legislation likely mandate ...
3,half-true,The economic turnaround started at the end of ...,"economy,jobs",15.0,9.0,20.0,19.0,2.0,economic turnaround started end term
4,true,The Chicago Bears have had more starting quart...,education,0.0,3.0,2.0,5.0,1.0,chicago bears starting quarterbacks last 10 ye...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:
corpus = df["Cleaned text"]
corpus


0        decline coal start started natural gas took st...
1        hillary clinton agrees john mccain voting give...
2        health care reform legislation likely mandate ...
3                     economic turnaround started end term
4        chicago bears starting quarterbacks last 10 ye...
                               ...                        
10234    larger number shark attacks florida cases vote...
10235     democrats become party atlanta metro area blacks
10236    says alternative social security operates galv...
10237           lifting cuban embargo allowing travel cuba
10238    department veterans affairs manual telling vet...
Name: Cleaned text, Length: 10239, dtype: object

In [None]:
v = TfidfVectorizer()
transformed_output = v.fit_transform(corpus)
print(v.vocabulary_)



In [11]:
feature_names = v.get_feature_names_out()

for word in feature_names[1000:1100]:
    indx = v.vocabulary_.get(word)
    print(f"{word}: {v.idf_[indx]}")

arkansas: 7.931471805599453
arlen: 9.13544460992539
arm: 8.442297429365443
armed: 8.154615356913663
armies: 9.540909718033554
armor: 9.13544460992539
armored: 9.540909718033554
arms: 7.836161625795128
army: 7.461468176353717
aromatherapy: 9.540909718033554
aronberg: 9.540909718033554
aronbergs: 9.540909718033554
around: 6.427394408823179
array: 9.13544460992539
arrest: 8.154615356913663
arrested: 7.669107541131962
arrests: 8.442297429365443
arrived: 9.540909718033554
arriving: 9.13544460992539
arrow: 9.540909718033554
arsenal: 9.13544460992539
art: 8.442297429365443
arthritis: 9.540909718033554
articles: 9.540909718033554
artificial: 9.540909718033554
artificially: 9.540909718033554
artists: 9.540909718033554
arts: 9.13544460992539
arvada: 9.540909718033554
aryan: 9.540909718033554
asbig: 9.540909718033554
ashbritt: 9.540909718033554
ashtiani: 9.540909718033554
asia: 9.540909718033554
aside: 9.13544460992539
ask: 7.400843554537283
asked: 6.596470738867113
askedthedivision: 9.5409097180

In [42]:
tfidf_df = pd.DataFrame(transformed_output.toarray(), columns=feature_names)
# tfidf_df.iloc[0].sort_values(ascending=False).head(10)
tfidf_df.head()


Unnamed: 0,02,05,09,10,100,100th,103,104,105,106,...,zip,zippo,zombie,zombies,zone,zones,zoning,zoo,zuckerberg,zuckerbergs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.173207,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
X = tfidf_df.values
y = 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])