In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('PolitifactDataset.csv')
df

Unnamed: 0,Statement Author,Statement,Rating
0,Fentrice Driskell,"""$1 of every $3 (Ron DeSantis) spends comes fr...",true
1,Robert Ortt,If New York’s proposed limits on natural gas i...,true
2,Tony Evers,“Wisconsin is the nation’s top cranberry produ...,true
3,Morgan Luttrell,"""Biden drained America's Strategic Petroleum R...",true
4,Melissa Agard,"""Historically, our spring elections (including...",true
...,...,...,...
1135,TikTok posts,"“As of today, no one has the right to film or ...",pants-fire
1136,Facebook posts,"""mRNA is not a vaccine"" — it's ""actually an op...",pants-fire
1137,Facebook posts,Video says COVID-19 vaccines are “weapons of m...,pants-fire
1138,Facebook posts,Says Ben Shapiro said on Twitter that his “red...,pants-fire


In [5]:
import nltk
import nltk.stem as ns
import string
import re

ps = ns.PorterStemmer()
lemma = ns.WordNetLemmatizer()

from nltk.tokenize import word_tokenize, sent_tokenize
def remove_punctuation(x):
    punctuation = string.punctuation
    no_punct = "".join([word for word in x if word not in punctuation])
    return no_punct

def remove_stopwords(x):
    stopwords = nltk.corpus.stopwords.words('english')
    no_sw = [word for word in x if word not in stopwords]
    return no_sw

#function built to use either stemming or lematization
def lemmatize(x):
    lemmatized = [lemma.lemmatize(word) for word in x]
    return lemmatized


#all of those functions inside one function to keep code clean
def clean_data(x):
    #tokens = re.sub("[^a-zA-Z]", " ", x.lower())
    essay_v = re.sub("[^a-zA-Z]", " ", x)
    tokens = essay_v.lower().split()
    no_sw = remove_stopwords(tokens)
    root = lemmatize(no_sw)
    cleaned = ' '.join(root)
    return cleaned

def clean_tokenize(x):
    #tokens = re.sub("[^a-zA-Z]", " ", x.lower())
    essay_v = re.sub("[^a-zA-Z]", " ", x)
    tokens = essay_v.lower().split()
    no_sw = remove_stopwords(tokens)
    root = lemmatize(no_sw)
    return root


In [6]:
from collections import Counter
from nltk import sent_tokenize
def count_unique(x):
    tokens = x.lower().split()
    counts = Counter(tokens)
    unique = sum(value == 1 for value in counts.values())
    return unique


In [7]:
sent = 'i got my tooth removed. i dont wanna talk about it!' #example sent to test on

In [8]:
from string import punctuation
def punc_count(x):
    return len([c for c in x if c in punctuation])

In [9]:
from lexicalrichness import LexicalRichness
def lexical_richness(x):
    lex = LexicalRichness(x)
    return lex.mtld()

In [10]:
sent = 'Mr. President, Joe has been identified as the pilot of the 9/11 terrorist attack'

In [96]:
def pos_noun(x):
    tags = nltk.pos_tag(word_tokenize(x))
    count = sum(value[1] == 'NNP' or value[1] == 'NNS' or value[1] == 'NNPS' or value[1] == 'NN' for value in tags)
    return count/len(tags)


def pos_verb(x):
    tags = nltk.pos_tag(word_tokenize(x))
    count = sum(value[1] == 'VBP' or value[1] == 'VBN' or value[1] == 'VBG' or value[1] == 'VBD' or value[1] == 'VB' for value in tags)
    return count/len(tags)


def pos_pronoun(x):
    tags = nltk.pos_tag(word_tokenize(x))
    count = sum(value[1] == 'PRP' or value[1] == 'PRP$' for value in tags)
    return count/len(tags)


def pos_interjection(x):
    tags = nltk.pos_tag(word_tokenize(x))
    count = sum(value[1] == 'UH' for value in tags)
    return count/len(tags)


def pos_adjective(x):
    tags = nltk.pos_tag(word_tokenize(x))
    count = sum(value[1] == 'JJ' for value in tags)
    return count/len(tags)

def pos_determiner(x):
    tags = nltk.pos_tag(word_tokenize(x))
    count = sum(value[1] == 'DT' for value in tags)
    return count/len(tags)

def pos_foreign(x):
    tags = nltk.pos_tag(word_tokenize(x))
    count = sum(value[1] == 'FW' for value in tags)
    return count/len(tags)


In [97]:
# essay_length  word_count  sentence_count  unique_words  lexical_richness  punctuation_count  spelling_errors

In [98]:
df['cleaned'] = df['Statement'].apply(lambda x : clean_data(x))
df['cleaned_tokenized'] = df['Statement'].apply(lambda x : clean_tokenize(x))
df['statement_length'] = df['Statement'].apply(lambda x: len(x))
df['word_count'] = df['cleaned'].apply(lambda x: len(x))
df['sentence_count'] = df['Statement'].apply(lambda x:len(sent_tokenize(x)))
df['unique_words'] = df['Statement'].apply(lambda x: count_unique(x))
df['lexical_richness'] = df['Statement'].apply(lambda x: lexical_richness(x))
df['punctuation_count'] = df['Statement'].apply(lambda x: punc_count(x))
df['noun_%'] = df['Statement'].apply(lambda x: pos_noun(x))
df['pronoun_%'] = df['Statement'].apply(lambda x: pos_pronoun(x))
df['verb_%'] = df['Statement'].apply(lambda x: pos_verb(x))
df['adj_%'] = df['Statement'].apply(lambda x: pos_adjective(x))
df['determiner_%'] = df['Statement'].apply(lambda x: pos_determiner(x))
df['foreign_%'] = df['Statement'].apply(lambda x: pos_foreign(x))


In [99]:
df.to_pickle('PolitifactFeatures.pkl')

In [100]:
df.to_csv('PolitifactDatasetFeatures.csv')

In [101]:
df

Unnamed: 0,Statement Author,Statement,Rating,cleaned,statement_length,word_count,sentence_count,unique_words,lexical_richness,punctuation_count,noun_%,pronoun_%,verb_%,adj_%,determiner_%,foreign_%,cleaned_tokenized
0,Fentrice Driskell,"""$1 of every $3 (Ron DeSantis) spends comes fr...",true,every ron desantis spends come federal government,73,49,1,12,10.00,7,0.157895,0.000000,0.000000,0.052632,0.105263,0.0,"[every, ron, desantis, spends, come, federal, ..."
1,Robert Ortt,If New York’s proposed limits on natural gas i...,true,new york proposed limit natural gas building t...,127,98,1,19,123.48,2,0.407407,0.000000,0.185185,0.111111,0.074074,0.0,"[new, york, proposed, limit, natural, gas, bui..."
2,Tony Evers,“Wisconsin is the nation’s top cranberry produ...,true,wisconsin nation top cranberry producer fact f...,122,87,1,18,101.08,2,0.428571,0.035714,0.071429,0.071429,0.071429,0.0,"[wisconsin, nation, top, cranberry, producer, ..."
3,Morgan Luttrell,"""Biden drained America's Strategic Petroleum R...",true,biden drained america strategic petroleum rese...,86,68,1,12,12.00,4,0.312500,0.000000,0.062500,0.000000,0.062500,0.0,"[biden, drained, america, strategic, petroleum..."
4,Melissa Agard,"""Historically, our spring elections (including...",true,historically spring election including state s...,117,85,1,16,16.00,6,0.272727,0.090909,0.136364,0.000000,0.045455,0.0,"[historically, spring, election, including, st..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,TikTok posts,"“As of today, no one has the right to film or ...",pants-fire,today one right film photograph mr biden climb...,138,91,3,22,102.06,6,0.454545,0.030303,0.030303,0.000000,0.090909,0.0,"[today, one, right, film, photograph, mr, bide..."
1136,Facebook posts,"""mRNA is not a vaccine"" — it's ""actually an op...",pants-fire,mrna vaccine actually operating system run bil...,105,66,1,19,112.00,7,0.307692,0.038462,0.038462,0.000000,0.076923,0.0,"[mrna, vaccine, actually, operating, system, r..."
1137,Facebook posts,Video says COVID-19 vaccines are “weapons of m...,pants-fire,video say covid vaccine weapon mass destructio...,100,69,1,16,17.00,2,0.380952,0.000000,0.142857,0.095238,0.047619,0.0,"[video, say, covid, vaccine, weapon, mass, des..."
1138,Facebook posts,Says Ben Shapiro said on Twitter that his “red...,pants-fire,say ben shapiro said twitter red pill moment s...,96,69,1,17,17.00,1,0.450000,0.050000,0.200000,0.050000,0.000000,0.0,"[say, ben, shapiro, said, twitter, red, pill, ..."


In [88]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.metrics import cohen_kappa_score

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [2]:
vectorizer = TfidfVectorizer(analyzer=clean_tokenize)
feat_df = df[['statement_length', 'word_count', 'sentence_count', 'unique_words', 'lexical_richness', 'punctuation_count', 'noun_%', 'pronoun_%', 'verb_%', 'adj_%', 'determiner_%', 'foreign_%']]
X = pd.concat([pd.DataFrame(vectorizer.fit_transform(df['Statement']).toarray()), feat_df], axis = 1)
y = df['Rating']

NameError: name 'TfidfVectorizer' is not defined

In [104]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,sentence_count,unique_words,lexical_richness,punctuation_count,noun_%,pronoun_%,verb_%,adj_%,determiner_%,foreign_%
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,12,10.00,7,0.157895,0.000000,0.000000,0.052632,0.105263,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,19,123.48,2,0.407407,0.000000,0.185185,0.111111,0.074074,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,18,101.08,2,0.428571,0.035714,0.071429,0.071429,0.071429,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,12,12.00,4,0.312500,0.000000,0.062500,0.000000,0.062500,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,16,16.00,6,0.272727,0.090909,0.136364,0.000000,0.045455,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,22,102.06,6,0.454545,0.030303,0.030303,0.000000,0.090909,0.0
1136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,19,112.00,7,0.307692,0.038462,0.038462,0.000000,0.076923,0.0
1137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,16,17.00,2,0.380952,0.000000,0.142857,0.095238,0.047619,0.0
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,17,17.00,1,0.450000,0.050000,0.200000,0.050000,0.000000,0.0


In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

xgb = XGBClassifier()

le = LabelEncoder()
y = le.fit_transform(y)

num_cols = ['statement_length', 'word_count', 'sentence_count', 'unique_words', 'lexical_richness', 'punctuation_count']

ct = ColumnTransformer([('standard_scaler', StandardScaler(), num_cols)])



NameError: name 'y' is not defined

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score

clfXGB_nocv = Pipeline(steps = [('preprocessor', ct), ('XGBoost', XGBClassifier())])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)