# Importing Libraries

In [3]:
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split


def load_dataset(path='train.csv', test_size=0.2, random_state=42):
    """
    Loads the fake news dataset and splits it into training and testing sets.

    Args:
        path: CSV file to read. It must provide 'text' and 'label' columns.
        test_size: Forwarded to train_test_split as the size of the test set.
        random_state: Seed forwarded to train_test_split so the split is
            reproducible between runs.

    Returns:
        The tuple (x_train, x_test, y_train, y_test), where the x values come
        from the 'text' column and the y values from the 'label' column.
    """
    # Rows with a missing value in any column are discarded before splitting.
    df = pd.read_csv(path).dropna()

    x_train, x_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=test_size, random_state=random_state
    )

    return x_train, x_test, y_train, y_test



def train_ensemble_model(x_train, x_test, y_train, y_test):
    """
    Trains a hard-voting ensemble on TF-IDF features and saves it to a file.

    The three base models (passive-aggressive, multinomial naive Bayes and
    random forest) are first fitted and scored one by one, printing each
    model's accuracy and confusion matrix on the test split. The voting
    ensemble is then fitted, pickled to 'finalized_ensemble_model.sav' and
    scored the same way (accuracy only).

    Args:
        x_train: Training texts used to fit the vectorizer and the models.
        x_test: Held-out texts used only for the printed evaluation.
        y_train: Labels matching x_train.
        y_test: Labels matching x_test.

    Returns:
        The tuple (ensemble, tfidf_vectorizer). The fitted vectorizer is
        returned as well because new text must be transformed with it before
        the ensemble can predict; this function does not save the vectorizer.
    """
    # Define the models to include in the ensemble
    pac = PassiveAggressiveClassifier(max_iter=1000)
    mult_nb = MultinomialNB()
    rfc = RandomForestClassifier(n_estimators=10, random_state=7)

    # The vocabulary is learned from the training texts only; the test texts
    # are merely projected onto it.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(x_train)
    tfidf_test = tfidf_vectorizer.transform(x_test)

    # Fit and report every base model on its own, in ensemble order.
    reports = (
        (pac, 'Accuracy of PAC : {}%'),
        (mult_nb, 'Multinomial Naive Bayes Accuracy: {}%'),
        (rfc, 'Random Forest Accuracy: {}%'),
    )
    for model, message in reports:
        model.fit(tfidf_train, y_train)
        predictions = model.predict(tfidf_test)
        score = accuracy_score(y_test, predictions)
        print(message.format(round(score * 100, 2)))
        print("Confusion Matrix:", confusion_matrix(y_test, predictions))

    # Create the ensemble and save it to a file.
    # NOTE: per the scikit-learn documentation, VotingClassifier.fit trains
    # clones of the estimators it is given, so the fits above serve the
    # per-model report only.
    ensemble = VotingClassifier(
        estimators=[('PAC', pac), ('MultNB', mult_nb), ('RFC', rfc)],
        voting='hard',
    )
    ensemble.fit(tfidf_train, y_train)
    filename = 'finalized_ensemble_model.sav'
    # The context manager guarantees the file is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(ensemble, model_file)

    # Compute the accuracy of the ensemble model
    ensemble_pred = ensemble.predict(tfidf_test)
    ensemble_score = accuracy_score(y_test, ensemble_pred)
    print(f'Ensemble Learning Accuracy: {round(ensemble_score * 100, 2)}%')

    return ensemble, tfidf_vectorizer


def main():
    """
    Classifies one piece of news text entered by the user.

    A previously saved ensemble and TF-IDF vectorizer are loaded from
    'finalized_ensemble_model.sav' and 'tfidf_vectorizer.sav'. If either file
    is missing, 'train.csv' is read, a new ensemble is trained through
    train_ensemble_model and the vectorizer is saved for the next run.
    The user is then prompted for text and the verdict is printed:
    "True news" when the predicted label is 0, "Fake news" otherwise.
    """
    # Try to load the trained ensemble model from file.
    # NOTE: pickle.load can execute arbitrary code, so these files must come
    # from a trusted source (normally an earlier run of this script).
    try:
        with open('finalized_ensemble_model.sav', 'rb') as model_file:
            loaded_model = pickle.load(model_file)
        with open('tfidf_vectorizer.sav', 'rb') as vectorizer_file:
            tfidf_vectorizer = pickle.load(vectorizer_file)
    except FileNotFoundError:
        # The dataset is only needed for training, so it is read here rather
        # than on every run.
        data = pd.read_csv('train.csv').dropna()
        x_train, x_test, y_train, y_test = train_test_split(
            data.text, data.label, test_size=0.2, random_state=7
        )

        # Train a new ensemble model (which saves itself) and keep the
        # vectorizer next to it so both can be reloaded together.
        loaded_model, tfidf_vectorizer = train_ensemble_model(x_train, x_test, y_train, y_test)
        with open('tfidf_vectorizer.sav', 'wb') as vectorizer_file:
            pickle.dump(tfidf_vectorizer, vectorizer_file)

    # Use the trained ensemble model to predict the label of a new sample
    text = input("Enter news content: ")
    text_vectorized = tfidf_vectorizer.transform([text])
    prediction = loaded_model.predict(text_vectorized)

    # predict() returns one label per input row; exactly one row was passed.
    if prediction[0] == 0:
        print("True news")
    else:
        print("Fake news")


# Run the interactive workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


Accuracy of PAC : 96.55%
Confusion Matrix: [[2018   59]
 [  67 1513]]
Multinomial Naive Bayes Accuracy: 77.96%
Confusion Matrix: [[2073    4]
 [ 802  778]]
Random Forest Accuracy: 85.89%
Confusion Matrix: [[1970  107]
 [ 409 1171]]
Ensemble Learning Accuracy: 89.99%
Enter news content: This is shitttttt@@!!!
Fake news
