In [20]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.metrics import cohen_kappa_score

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [21]:
df = pd.read_csv('PolitifactDatasetFeatures.csv')
df

Unnamed: 0.1,Unnamed: 0,Statement Author,Statement,Rating,cleaned,statement_length,word_count,sentence_count,unique_words,lexical_richness,punctuation_count,noun_%,pronoun_%,verb_%,adj_%,determiner_%,foreign_%,cleaned_tokenized
0,0,Fentrice Driskell,"""$1 of every $3 (Ron DeSantis) spends comes fr...",true,every ron desantis spends come federal government,73,49,1,12,10.00,7,0.157895,0.000000,0.000000,0.052632,0.105263,0.0,"['every', 'ron', 'desantis', 'spends', 'come',..."
1,1,Robert Ortt,If New York’s proposed limits on natural gas i...,true,new york proposed limit natural gas building t...,127,98,1,19,123.48,2,0.407407,0.000000,0.185185,0.111111,0.074074,0.0,"['new', 'york', 'proposed', 'limit', 'natural'..."
2,2,Tony Evers,“Wisconsin is the nation’s top cranberry produ...,true,wisconsin nation top cranberry producer fact f...,122,87,1,18,101.08,2,0.428571,0.035714,0.071429,0.071429,0.071429,0.0,"['wisconsin', 'nation', 'top', 'cranberry', 'p..."
3,3,Morgan Luttrell,"""Biden drained America's Strategic Petroleum R...",true,biden drained america strategic petroleum rese...,86,68,1,12,12.00,4,0.312500,0.000000,0.062500,0.000000,0.062500,0.0,"['biden', 'drained', 'america', 'strategic', '..."
4,4,Melissa Agard,"""Historically, our spring elections (including...",true,historically spring election including state s...,117,85,1,16,16.00,6,0.272727,0.090909,0.136364,0.000000,0.045455,0.0,"['historically', 'spring', 'election', 'includ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,1135,TikTok posts,"“As of today, no one has the right to film or ...",pants-fire,today one right film photograph mr biden climb...,138,91,3,22,102.06,6,0.454545,0.030303,0.030303,0.000000,0.090909,0.0,"['today', 'one', 'right', 'film', 'photograph'..."
1136,1136,Facebook posts,"""mRNA is not a vaccine"" — it's ""actually an op...",pants-fire,mrna vaccine actually operating system run bil...,105,66,1,19,112.00,7,0.307692,0.038462,0.038462,0.000000,0.076923,0.0,"['mrna', 'vaccine', 'actually', 'operating', '..."
1137,1137,Facebook posts,Video says COVID-19 vaccines are “weapons of m...,pants-fire,video say covid vaccine weapon mass destructio...,100,69,1,16,17.00,2,0.380952,0.000000,0.142857,0.095238,0.047619,0.0,"['video', 'say', 'covid', 'vaccine', 'weapon',..."
1138,1138,Facebook posts,Says Ben Shapiro said on Twitter that his “red...,pants-fire,say ben shapiro said twitter red pill moment s...,96,69,1,17,17.00,1,0.450000,0.050000,0.200000,0.050000,0.000000,0.0,"['say', 'ben', 'shapiro', 'said', 'twitter', '..."


In [22]:
import nltk
import nltk.stem as ns
import string
import re

ps = ns.PorterStemmer()
lemma = ns.WordNetLemmatizer()

from nltk.tokenize import word_tokenize, sent_tokenize
def remove_punctuation(x):
    punctuation = string.punctuation
    no_punct = "".join([word for word in x if word not in punctuation])
    return no_punct

def remove_stopwords(x):
    stopwords = nltk.corpus.stopwords.words('english')
    no_sw = [word for word in x if word not in stopwords]
    return no_sw

#function built to use either stemming or lematization
def lemmatize(x):
    lemmatized = [lemma.lemmatize(word) for word in x]
    return lemmatized


#all of those functions inside one function to keep code clean
def clean_data(x):
    #tokens = re.sub("[^a-zA-Z]", " ", x.lower())
    essay_v = re.sub("[^a-zA-Z]", " ", x)
    tokens = essay_v.lower().split()
    no_sw = remove_stopwords(tokens)
    root = lemmatize(no_sw)
    cleaned = ' '.join(root)
    return cleaned

def clean_tokenize(x):
    #tokens = re.sub("[^a-zA-Z]", " ", x.lower())
    essay_v = re.sub("[^a-zA-Z]", " ", x)
    tokens = essay_v.lower().split()
    no_sw = remove_stopwords(tokens)
    root = lemmatize(no_sw)
    return root

In [23]:
vectorizer = TfidfVectorizer(analyzer=clean_tokenize)
feat_df = df[['Statement Author','statement_length', 'word_count', 'sentence_count', 'unique_words', 'lexical_richness', 'punctuation_count', 'noun_%', 'pronoun_%', 'verb_%', 'adj_%', 'determiner_%', 'foreign_%']]
X = pd.concat([pd.DataFrame(vectorizer.fit_transform(df['Statement']).toarray()), feat_df], axis = 1)
y = df['Rating']

In [24]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,sentence_count,unique_words,lexical_richness,punctuation_count,noun_%,pronoun_%,verb_%,adj_%,determiner_%,foreign_%
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,12,10.00,7,0.157895,0.000000,0.000000,0.052632,0.105263,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,19,123.48,2,0.407407,0.000000,0.185185,0.111111,0.074074,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,18,101.08,2,0.428571,0.035714,0.071429,0.071429,0.071429,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,12,12.00,4,0.312500,0.000000,0.062500,0.000000,0.062500,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,16,16.00,6,0.272727,0.090909,0.136364,0.000000,0.045455,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,22,102.06,6,0.454545,0.030303,0.030303,0.000000,0.090909,0.0
1136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,19,112.00,7,0.307692,0.038462,0.038462,0.000000,0.076923,0.0
1137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,16,17.00,2,0.380952,0.000000,0.142857,0.095238,0.047619,0.0
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,17,17.00,1,0.450000,0.050000,0.200000,0.050000,0.000000,0.0


In [25]:
y

0             true
1             true
2             true
3             true
4             true
           ...    
1135    pants-fire
1136    pants-fire
1137    pants-fire
1138    pants-fire
1139    pants-fire
Name: Rating, Length: 1140, dtype: object

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

xgb = XGBClassifier()

le = LabelEncoder()
y = le.fit_transform(y)

num_cols = ['statement_length', 'word_count', 'sentence_count', 'unique_words', 'lexical_richness', 'punctuation_count']
bin_cols = ['Statement Author']

ct = ColumnTransformer([('standard_scaler', StandardScaler(), num_cols), 
                        ('label_encoder', OrdinalEncoder(), bin_cols)])

In [32]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [33]:
X = ct.fit_transform(X)



In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score

clfXGB = Pipeline(steps = [('XGBoost', XGBClassifier())])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [35]:
clfXGB.fit(X_train, y_train)

Pipeline(steps=[('XGBoost',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               feature_types=None, gamma=0, gpu_id=-1,
                               grow_policy='depthwise', importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_threshold=64, max_cat_to_onehot=4,
                               max_delta_step=0, max_depth=6, max_leaves=0,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, predictor='auto',
                

In [30]:
clfXGB.score(X_test, y_test)

ValueError: Found unknown categories ['Chad Mayes', 'Norm Eisen', 'Texas GOP', 'Gwen Graham', 'Kelly Alexander', 'Caleb Rowden', 'Silvestre Reyes', 'Eric Trump', 'Energy Transfer', 'Pearson Sharp', 'Desiree Rogers', 'Nick Freitas', 'Tammy Duckworth', 'David Menchetti', 'Rand Paul', 'Roger Stone', 'VoteVets.org', 'Keith Pekau', 'Jake Tapper', 'Ted Yoho', 'Lisa McClain', 'Robert F. Kennedy, Jr.', 'Ann Wagner', 'Rana Foroohar', 'Brian Tyler Cohen', 'John Chiang', 'Ted Nugent', 'Dennis Kucinich', 'Adam Putnam', 'National Republican Senatorial Committee', 'A Better Wisconsin Together', 'Kevin Nicholson', 'Mandy Cohen', 'Josh Hawley', 'Richard Cordray', 'Jeff Jackson', 'Van Wanggaard', 'Bob Good', 'Lauren Boebert', 'Joseph Crowley', 'Devin LeMahieu', 'Chris Kennedy', 'Monica Crowley', 'Troy LaRaviere', 'Ivanka Trump', 'The Coalition for Public Schools', 'Ron DeSantis', 'Illinois Policy Institute', 'Bono', 'Eric Holcomb', 'Emily Compagno', 'Jenny Wilson', 'Rachel Campos-Duffy', 'Darrell Steinberg', 'Michelle Obama', 'Joni Ernst', 'Ann Teich', 'Derrick Crowe', 'Allen Thomas', 'Patricia Pike', 'Jeff Brandes', 'Phil Scott'] in column 0 during transform