In [1]:
import os
import re

import matplotlib.pyplot as plt
import mysql.connector
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Module-level TF-IDF vectorizer and logistic-regression classifier, both
# created with default settings. They are shared state: train_pipeline fits
# them and test_pipeline reuses the fitted objects.
tfidfvec = TfidfVectorizer()
LR = LogisticRegression()

In [3]:
def test_pipeline(data):
    """Clean the raw documents in ``data`` and score the fitted model on them.

    Relies on the module-level ``tfidfvec`` and ``LR`` having been fitted
    already (``train_pipeline`` does that); this function only calls
    ``transform`` and ``score`` on them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a 'text' column holding not-yet-cleaned documents and a
        'label' column with 0 for fake and 1 for real.

    Returns
    -------
    The value returned by ``LR.score`` for the vectorized text against
    ``data['label']`` (mean accuracy, per scikit-learn's classifier API).
    """
    # Clean into a new Series instead of assigning back to data['text']:
    # the caller's frame stays untouched, so the cell can be re-run safely
    # and no SettingWithCopyWarning is raised on frames derived from others.
    cleaned_text = data['text'].apply(clean_text)
    # vectorize with the vocabulary learned during training
    xv = tfidfvec.transform(cleaned_text)
    # test the vector data against the label
    return LR.score(xv, data['label'])

In [4]:
def train_pipeline(data):
    """Clean the raw documents in ``data`` and fit the vectorizer and model.

    Fits the module-level ``tfidfvec`` on the cleaned text and then fits the
    module-level ``LR`` on the resulting matrix; both objects are updated as a
    side effect and nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a 'text' column holding not-yet-cleaned documents and a
        'label' column with 0 for fake and 1 for real.
    """
    # Clean into a new Series instead of assigning back to data['text']:
    # the caller's frame stays untouched, so the cell can be re-run safely
    # and no SettingWithCopyWarning is raised on frames derived from others.
    cleaned_text = data['text'].apply(clean_text)
    # learn the vocabulary / idf weights and vectorize in one step
    xv = tfidfvec.fit_transform(cleaned_text)
    # fit the vector data to the label
    LR.fit(xv, data['label'])


In [5]:
def get_sql_data(query):
    """Run ``query`` against the project MySQL database and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement handed unchanged to ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The result set of ``query``.

    Notes
    -----
    Connection settings are taken from the environment variables
    ``PROJECT_DB_USER``, ``PROJECT_DB_PASSWORD``, ``PROJECT_DB_HOST`` and
    ``PROJECT_DB_NAME``. The previously hard-coded values remain as fallbacks
    so existing runs keep working without any setup.
    """
    # NOTE(review): the fallback credentials are still committed in plain
    # text. Rotate them and drop the defaults once the environment variables
    # are set wherever this notebook runs.
    config = {
        'user': os.environ.get('PROJECT_DB_USER', 'project'),
        'password': os.environ.get('PROJECT_DB_PASSWORD', 'COMP541'),
        'host': os.environ.get('PROJECT_DB_HOST', '150.230.44.118'),
        'database': os.environ.get('PROJECT_DB_NAME', 'project'),
    }
    #connect
    conn = mysql.connector.connect(**config)
    try:
        #read into dataframe
        return pd.read_sql(query, conn)
    finally:
        # close the connection even when the query raises, so it never leaks
        conn.close()

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

# Built once when the cell runs instead of on every clean_text call: loading
# the NLTK stop-word list per document is loop-invariant work.
_STOP_WORDS = frozenset(stopwords.words('english')) | {'said', 'reuters'}
# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER = re.compile(r'[^a-zA-Z\s]')


#clean text works on one document at a time so needs to be applied with .apply()
def clean_text(text):
    """Normalize one document into a space-separated string of kept tokens.

    Steps, in order: tokenize with ``nltk.word_tokenize``; strip every
    character that is not an ASCII letter or whitespace from each token;
    lowercase; drop tokens that are empty or in the stop-word set (NLTK's
    English list plus 'said' and 'reuters'); join with single spaces.

    Parameters
    ----------
    text : str
        A single raw document. Any non-string value (e.g. None/NaN coming
        from a NULL database field) is treated as an empty document.

    Returns
    -------
    str
        The cleaned document; '' when nothing survives cleaning.
    """
    # Missing values would otherwise crash inside the tokenizer.
    if not isinstance(text, str):
        return ''
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Remove special characters and convert to lowercase
    normalized = (_NON_LETTER.sub('', token).lower() for token in tokens)
    # Drop stop words, and tokens that became empty after stripping (pure
    # punctuation or digits) so they do not leave runs of spaces in the output
    kept_tokens = [
        token for token in normalized
        if token and token not in _STOP_WORDS
    ]
    # Join tokens back into a single string
    return ' '.join(kept_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dchur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dchur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Fixed seed so that Restart & Run All reproduces the same train/test split
# (and therefore the same reported score).
SPLIT_SEED = 42

#import the labeled Tutorial articles
query = "SELECT `text`,`label` FROM `Tutorial`"
data = get_sql_data(query)

#import the New York Times articles; this query returns no label column
query = "SELECT `text` FROM `Articles` WHERE `source` LIKE '%New York Times%'"
nyt_data = get_sql_data(query)
#the nyt is highly trusted, its label is always real
nyt_data['label'] = 1

#import the labeled Kaggle articles
query = "SELECT `text`,`label` FROM `Kaggle`"
kaggle = get_sql_data(query)
# Invert the label column for Kaggle set (0 <-> 1); presumably that set uses
# the opposite encoding from 0 = fake / 1 = real -- TODO confirm
kaggle['label'] = kaggle['label'].map({0: 1, 1: 0})

#pool all three sources (Tutorial, NYT, Kaggle) into one frame
data = pd.concat([data, nyt_data, kaggle], ignore_index=True)
# .map() turns any value other than 0/1 into NaN; fail here with a clear
# message rather than later inside the model fit.
assert data['label'].notna().all(), "found rows with a missing label"
#uncomment to iterate quickly on a small sample
#data = data.sample(1000)

#split into test and train
data_train, data_test = train_test_split(
    data, test_size=0.2, random_state=SPLIT_SEED
)

  data = pd.read_sql(query, conn)
  data = pd.read_sql(query, conn)


In [8]:
# Fit the shared TF-IDF vectorizer and logistic-regression model on the training split.
train_pipeline(data_train)

In [9]:
# Score the fitted model on the held-out split; the returned score is the cell output.
test_pipeline(data_test)

0.9563144000681257

In [10]:
#test_pipeline(data)  # caution: `data` also contains the rows used for training, so this score would be inflated